In [1]:
import numpy as np # linear algebra
import pandas as pd 

In [2]:
text_emotion = pd.read_csv("text_emotion.csv")

In [3]:
text_emotion.head(10)

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
5,1956968477,worry,xxxPEACHESxxx,Re-pinging @ghostridah14: why didn't you go to...
6,1956968487,sadness,ShansBee,"I should be sleep, but im not! thinking about ..."
7,1956968636,worry,mcsleazy,Hmmm. http://www.djhero.com/ is down
8,1956969035,sadness,nic0lepaula,@charviray Charlene my love. I miss you
9,1956969172,sadness,Ingenue_Em,@kelcouch I'm sorry at least it's Friday?


In [4]:
text_emotion = text_emotion.drop('author', axis=1)

In [5]:
text_emotion.head(5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


Preprocessing the Data

In [6]:
text_emotion['content'] = text_emotion['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [7]:
text_emotion.head

<bound method NDFrame.head of          tweet_id   sentiment  \
0      1956967341       empty   
1      1956967666     sadness   
2      1956967696     sadness   
3      1956967789  enthusiasm   
4      1956968416     neutral   
...           ...         ...   
39995  1753918954     neutral   
39996  1753919001        love   
39997  1753919005        love   
39998  1753919043   happiness   
39999  1753919049        love   

                                                 content  
0      @tiffanylue i know i was listenin to bad habit...  
1      layin n bed with a headache ughhhh...waitin on...  
2                    funeral ceremony...gloomy friday...  
3                   wants to hang out with friends soon!  
4      @dannycastillo we want to trade with someone w...  
...                                                  ...  
39995                                   @johnlloydtaylor  
39996                      happy mothers day all my love  
39997  happy mother's day to all the mommi

In [8]:
#Removing Punctuation, Symbols
text_emotion['content'] = text_emotion['content'].str.replace('[^\w\s]',' ')

In [9]:
#Removing Stop Words using NLTK
from nltk.corpus import stopwords
stop = stopwords.words('english')
text_emotion['content'] = text_emotion['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [10]:
text_emotion.head(5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,tiffanylue know listenin bad habit earlier sta...
1,1956967666,sadness,layin n bed headache ughhhh waitin call
2,1956967696,sadness,funeral ceremony gloomy friday
3,1956967789,enthusiasm,wants hang friends soon
4,1956968416,neutral,dannycastillo want trade someone houston ticke...


In [13]:
from textblob import Word
text_emotion['content'] = text_emotion['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [14]:
import re
def de_repeat(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [15]:
text_emotion['content'] = text_emotion['content'].apply(lambda x: " ".join(de_repeat(x) for x in x.split()))

In [16]:
# Code to find the top 10,000 rarest words appearing in the data
freq = pd.Series(' '.join(text_emotion['content']).split()).value_counts()[-10000:]

In [17]:
 #Removing all those rarely appearing words from the data
freq = list(freq.index)
text_emotion['content'] = text_emotion['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [18]:
from sklearn import preprocessing
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(text_emotion.sentiment.values)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(text_emotion.content.values, y, stratify=y, random_state=42, test_size=0.1, shuffle=True)

In [20]:
# Extracting TF-IDF parameters
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,5))

In [21]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.fit_transform(X_val)

In [22]:
# Extracting Count Vectors Parameters
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(text_emotion['content'])

CountVectorizer()

In [23]:
X_train_count =  count_vect.transform(X_train)
X_val_count =  count_vect.transform(X_val)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
# Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)
print('naive bayes TF-IDF accuracy %s' % accuracy_score(y_pred, y_val))

naive bayes TF-IDF accuracy 0.2285


In [26]:
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_val_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_val))

svm using tfidf accuracy 0.196


In [27]:
# Model 3: logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_val_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_val))

log reg tfidf accuracy 0.23075


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Model 4: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_val_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_val))

random forest tfidf accuracy 0.22775


In [31]:
 #Model 1: Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_val_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_val))


naive bayes count vectors accuracy 0.32875


In [32]:
# Model 2: Linear SVM
from sklearn.linear_model import SGDClassifier
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_val_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_val))

lsvm using count vectors accuracy 0.3305


In [33]:
# Model 3: Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_val_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_val))

log reg count vectors accuracy 0.345


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
# Model 4: Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_val_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_val))

random forest with count vectors accuracy 0.3175


In [35]:
tweets = pd.DataFrame(['I am very happy today! The atmosphere looks cheerful',
                       'His death broke my heart. It was a sad day',
                      'I am very happy today!',
                      'cant fall asleep',
                      'Happy Mothers Day All my love',
                      'please to meet you',
                      'she is crying'])

In [36]:
tweets[0] = tweets[0].str.replace('[^\w\s]',' ')
from nltk.corpus import stopwords
stop = stopwords.words('english')
tweets[0] = tweets[0].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [37]:
from textblob import Word
tweets[0] = tweets[0].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [38]:
# Extracting Count Vectors feature from our tweets
tweet_count = count_vect.transform(tweets[0])