## **Data Intialisation**

In [1]:
import nltk
nltk.download('punkt')
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
import pandas as pd

Mounted at /content/drive


In [3]:
df=pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/text_emotion.csv')

In [4]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [5]:
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [6]:
df.shape

(40000, 4)

In [7]:
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [8]:
# Dropping rows with other emotion labels
df = df.drop(df[df.sentiment == 'anger'].index)
df = df.drop(df[df.sentiment == 'boredom'].index)
df = df.drop(df[df.sentiment == 'enthusiasm'].index)
df = df.drop(df[df.sentiment == 'empty'].index)
df = df.drop(df[df.sentiment == 'fun'].index)
df = df.drop(df[df.sentiment == 'relief'].index)
df = df.drop(df[df.sentiment == 'surprise'].index)
df = df.drop(df[df.sentiment == 'hate'].index)
df = df.drop(df[df.sentiment == 'love'].index)
df = df.drop(df[df.sentiment == 'neutral'].index)
df = df.drop(df[df.sentiment == 'worry'].index)

In [9]:
df=df.drop(["tweet_id","author"],axis=1)

In [10]:
df.head()

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


In [11]:
df.shape
df['sentiment'].value_counts()

happiness    5209
sadness      5165
Name: sentiment, dtype: int64

## **Data Preprocessing**

In [12]:
from sklearn import preprocessing
#Encoding output labels 'sadness' as '1' & 'happiness' as '0'
lbl_enc = preprocessing.LabelEncoder()
df['label']= lbl_enc.fit_transform(df.sentiment.values)

In [13]:
print(df['sentiment'].value_counts())
print(df['label'].value_counts())

happiness    5209
sadness      5165
Name: sentiment, dtype: int64
0    5209
1    5165
Name: label, dtype: int64


In [None]:
#Encoding output labels 'happiness' as '0' 
#Encoding output labels 'sadness' as '1' 

In [14]:
#Making all review to lowercase
df['content'] = df['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [15]:
# Removing Punctuation, Symbols
df['content'] = df['content'].str.replace('[^\w\s]',' ')

In [16]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))
stops.remove("not")
stops.remove("but")
stops.remove("no")
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stops))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [17]:
import re
#Correcting Letter Repetitions
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

df['content'] = df['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [18]:
# Code to find the top 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(df['content']).split()).value_counts()[-10000:]

# Removing all those rarely appearing words from the data
freq = list(freq.index)
df['content'] = df['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [19]:
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,n bed headache ughh waitin call,1
2,sadness,funeral ceremony gloomy friday,1
6,sadness,sleep but im not thinking old friend want but ...,1
8,sadness,love miss,1
9,sadness,sorry least friday,1


In [20]:
#Function to split string to tokens
def identify_tokens(row):
    tokens = nltk.word_tokenize(row)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

In [21]:
#Tokenization of DataFrame
df['content'] = df["content"].apply(identify_tokens)

In [22]:
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
#Funtion for lemmatizing the list of words
def lem_list(row):
    lemmatized_list = [lemmatizer.lemmatize(word) for word in row]
    return (lemmatized_list)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [23]:
#Lemmatize of the dataframe
df['content'] = df['content'].apply(lem_list)

In [24]:
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,"[n, bed, headache, ughh, waitin, call]",1
2,sadness,"[funeral, ceremony, gloomy, friday]",1
6,sadness,"[sleep, but, im, not, thinking, old, friend, w...",1
8,sadness,"[love, miss]",1
9,sadness,"[sorry, least, friday]",1


In [25]:
#Function to join the processed words
def rejoin_words(row):
    joined_words = ( " ".join(row))
    return joined_words

In [26]:
#Joining the processed words in the data_frame
df['content'] = df["content"].apply(rejoin_words)

In [27]:
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,n bed headache ughh waitin call,1
2,sadness,funeral ceremony gloomy friday,1
6,sadness,sleep but im not thinking old friend want but ...,1
8,sadness,love miss,1
9,sadness,sorry least friday,1


# **Splitting dataset I**
### **NUMPY array**

In [28]:
#x contains review and y contain sentiment
x=df.iloc[:,1].values
y=df.iloc[:,2].values

In [29]:
from sklearn.model_selection import train_test_split
#Splitting into training and testing data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33)

In [30]:
x_test.shape

(3424,)

In [31]:
x_train.shape

(6950,)

# **Splitting Dataset II**
### **Dataframe**

In [32]:
from sklearn.model_selection import train_test_split
x1_train,x1_test,y1_train,y1_test=train_test_split(df[['content']],df[['label']])

In [33]:
x1_test.shape

(2594, 1)

In [34]:
x1_train.shape

(7780, 1)

# **Feature Extraction**

### **TFIDF Vectorizer**

In [35]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# Extracting TF-IDF parameters
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
x_train_tfidf = tfidf.fit_transform(x_train)
x_val_tfidf = tfidf.fit_transform(x_test)

### **Count Vectorizer**

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
# Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(df['content'])
x_train_count =  count_vect.transform(x_train)
x_val_count =  count_vect.transform(x_test)

### **Feature Extraction Using Lexical Methods**

In [39]:
pip install --upgrade vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 12.8MB/s eta 0:00:01[K     |█████▏                          | 20kB 19.1MB/s eta 0:00:01[K     |███████▉                        | 30kB 21.9MB/s eta 0:00:01[K     |██████████▍                     | 40kB 15.5MB/s eta 0:00:01[K     |█████████████                   | 51kB 8.5MB/s eta 0:00:01[K     |███████████████▋                | 61kB 9.7MB/s eta 0:00:01[K     |██████████████████▏             | 71kB 8.5MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 9.3MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 9.3MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 8.1MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 8.1MB/s eta 0:00:01[K     |███████████████████████████████▏|

In [40]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [41]:
x1_train.head()

Unnamed: 0,content
28474,morning sunshine
27067,watching gon na grand day
39436,way watch star trek imax baby
34802,although killing right la
5960,got people let skip sci pratical becos skola i...


In [43]:
x1_train['negative'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["neg"])
x1_test['negative'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["neg"])
x1_train['positive'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["pos"])
x1_test['positive'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["pos"])
x1_train['neutral'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["neu"])
x1_test['neutral'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["neu"])
x1_train['compound'] = x1_train['content'].apply(lambda x: analyser.polarity_scores(x)["compound"])
x1_test['compound'] = x1_test['content'].apply(lambda x: analyser.polarity_scores(x)["compound"])

In [44]:
from textblob import TextBlob
x1_train['subjectivity'] = x1_train['content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
x1_test['subjectivity'] = x1_test['content'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
x1_train['polarity'] = x1_train['content'].apply(lambda x: TextBlob(x).sentiment.polarity)
x1_test['polarity'] = x1_test['content'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [59]:
x1_train.head()

Unnamed: 0,content,negative,positive,neutral,compound,subjectivity,polarity
28474,morning sunshine,0.0,0.762,0.238,0.4939,0.0,0.0
27067,watching gon na grand day,0.0,0.429,0.571,0.4588,1.0,0.5
39436,way watch star trek imax baby,0.0,0.0,1.0,0.0,0.0,0.0
34802,although killing right la,0.595,0.0,0.405,-0.6597,0.535714,0.285714
5960,got people let skip sci pratical becos skola i...,0.0,0.249,0.751,0.5621,0.45,0.4


# **Training of Models**
## **I)Using TF-IDF Vectorizer**

In [46]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [64]:
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(x_train_tfidf, y_train)
y_pred = nb.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.5262850467289719
[[895 837]
 [785 907]]
              precision    recall  f1-score   support

           0       0.53      0.52      0.52      1732
           1       0.52      0.54      0.53      1692

    accuracy                           0.53      3424
   macro avg       0.53      0.53      0.53      3424
weighted avg       0.53      0.53      0.53      3424



In [65]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x_train_tfidf, y_train)
y_pred = lsvm.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.5084696261682243
[[945 948]
 [735 796]]
              precision    recall  f1-score   support

           0       0.56      0.50      0.53      1893
           1       0.46      0.52      0.49      1531

    accuracy                           0.51      3424
   macro avg       0.51      0.51      0.51      3424
weighted avg       0.52      0.51      0.51      3424



In [66]:
# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x_train_tfidf, y_train)
y_pred = logreg.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.5204439252336449
[[924 886]
 [756 858]]
              precision    recall  f1-score   support

           0       0.55      0.51      0.53      1810
           1       0.49      0.53      0.51      1614

    accuracy                           0.52      3424
   macro avg       0.52      0.52      0.52      3424
weighted avg       0.52      0.52      0.52      3424



In [67]:
# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train_tfidf, y_train)
y_pred = rf.predict(x_val_tfidf)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.5143107476635514
[[815 798]
 [865 946]]
              precision    recall  f1-score   support

           0       0.49      0.51      0.49      1613
           1       0.54      0.52      0.53      1811

    accuracy                           0.51      3424
   macro avg       0.51      0.51      0.51      3424
weighted avg       0.52      0.51      0.51      3424



## **II) Using Count Vectorizer**

In [68]:
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(x_train_count, y_train)
y_pred = nb.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.8057827102803738
[[1351  336]
 [ 329 1408]]
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1687
           1       0.81      0.81      0.81      1737

    accuracy                           0.81      3424
   macro avg       0.81      0.81      0.81      3424
weighted avg       0.81      0.81      0.81      3424



In [69]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x_train_count, y_train)
y_pred = lsvm.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.803446261682243
[[1401  394]
 [ 279 1350]]
              precision    recall  f1-score   support

           0       0.83      0.78      0.81      1795
           1       0.77      0.83      0.80      1629

    accuracy                           0.80      3424
   macro avg       0.80      0.80      0.80      3424
weighted avg       0.81      0.80      0.80      3424



In [70]:
# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x_train_count, y_train)
y_pred = logreg.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.8081191588785047
[[1382  359]
 [ 298 1385]]
              precision    recall  f1-score   support

           0       0.82      0.79      0.81      1741
           1       0.79      0.82      0.81      1683

    accuracy                           0.81      3424
   macro avg       0.81      0.81      0.81      3424
weighted avg       0.81      0.81      0.81      3424



In [71]:
# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x_train_count, y_train)
y_pred = rf.predict(x_val_count)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print(classification_report(y_pred, y_test))

0.7964369158878505
[[1347  364]
 [ 333 1380]]
              precision    recall  f1-score   support

           0       0.80      0.79      0.79      1711
           1       0.79      0.81      0.80      1713

    accuracy                           0.80      3424
   macro avg       0.80      0.80      0.80      3424
weighted avg       0.80      0.80      0.80      3424



## **III) Using Vader Sentiments**

In [90]:
# Model 1: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = lsvm.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

0.7478797224363917
[[1168  537]
 [ 117  772]]
              precision    recall  f1-score   support

           0       0.91      0.69      0.78      1705
           1       0.59      0.87      0.70       889

    accuracy                           0.75      2594
   macro avg       0.75      0.78      0.74      2594
weighted avg       0.80      0.75      0.75      2594



  y = column_or_1d(y, warn=True)


In [91]:
# Model 2: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = logreg.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

0.7490362374710872
[[978 344]
 [307 965]]
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1322
           1       0.74      0.76      0.75      1272

    accuracy                           0.75      2594
   macro avg       0.75      0.75      0.75      2594
weighted avg       0.75      0.75      0.75      2594



  y = column_or_1d(y, warn=True)


In [92]:
# Model 3: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(x1_train.drop(['content'], axis=1), y1_train)
y_pred = rf.predict(x1_test.drop(['content'], axis=1))
print(accuracy_score(y_pred, y1_test))
print(confusion_matrix(y_pred, y1_test))
print(classification_report(y_pred, y1_test))

  This is separate from the ipykernel package so we can avoid doing imports until


0.7567463377023901
[[1055  401]
 [ 230  908]]
              precision    recall  f1-score   support

           0       0.82      0.72      0.77      1456
           1       0.69      0.80      0.74      1138

    accuracy                           0.76      2594
   macro avg       0.76      0.76      0.76      2594
weighted avg       0.77      0.76      0.76      2594



Here maximum accuracy is obtained when we use count vectorizer features and Logistic Regression model

In [82]:
tweetpos =pd.DataFrame(['''I am very happy today! The atmosphere looks cheerful.Things are looking great. It was such a good day.Success is right around the corner. 
            Lets celebrate this victory.Everything is more beautiful when you experience them with a smile!'''])
tweetneg=pd.DataFrame(['''Now this is my worst, okay? But I am gonna get better.I am tired, boss. Tired of being on the road, lonely as a sparrow in the rain.
           I am tired of all the pain I feel.This is quite depressing. I am filled with sorrow.His death broke my heart. It was a sad day.'''])


In [85]:
# Doing some preprocessing on these tweets as done before
tweetpos[0] = tweetpos[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweetpos[0] = tweetpos[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
from textblob import Word
tweetpos[0] = tweetpos[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Extracting Count Vectors feature from our tweets
tweetpos_count = count_vect.transform(tweetpos[0])

#Predicting the emotion of the tweet using our already trained linear SVM
tweetpos_pred = logreg.predict(tweetpos_count)
print(tweetpos_pred)

[0]


In [86]:
# Doing some preprocessing on these tweets as done before
tweetneg[0] = tweetneg[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweetneg[0] = tweetneg[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
from textblob import Word
tweetneg[0] = tweetneg[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# Extracting Count Vectors feature from our tweets
tweetneg_count = count_vect.transform(tweetneg[0])

#Predicting the emotion of the tweet using our already trained linear SVM
tweetneg_pred = logreg.predict(tweetneg_count)
print(tweetneg_pred)

[1]
