In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [6]:
emotion = pd.read_csv('combined_emotion.csv')

In [7]:
emotion.head()

Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear


In [8]:
sentiment = pd.read_csv('combined_sentiment_data.csv')
sentiment.head()

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,negative
1,"Good case, Excellent value.",positive
2,Great for the jawbone.,positive
3,Tied to charger for conversations lasting more...,negative
4,The mic is great.,positive


In [9]:
print(emotion.shape)
print(sentiment.shape)

(422746, 2)
(3309, 2)


In [10]:
print('Emotions are: ', emotion['emotion'].unique())
print('Sentiments are: ', sentiment['sentiment'].unique())

Emotions are:  ['fear' 'sad' 'love' 'joy' 'suprise' 'anger']
Sentiments are:  ['negative' 'positive']


In [11]:
emotion.isnull().sum()

sentence    0
emotion     0
dtype: int64

In [12]:
sentiment.isnull().sum()

sentence     0
sentiment    0
dtype: int64

In [13]:
print(emotion.duplicated().sum())
print(sentiment.duplicated().sum())

6623
19


In [14]:
emotion.drop_duplicates(inplace = True)
sentiment.drop_duplicates(inplace = True)
print(emotion.duplicated().sum())
print(sentiment.duplicated().sum())

0
0


In [15]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
emotion['emotion'].value_counts()

emotion
joy        140779
sad        120989
anger       57235
fear        47664
love        34497
suprise     14959
Name: count, dtype: int64

In [17]:

plot_title = f"Emotions found"
fig = px.histogram( x=emotion['emotion'], template="plotly_dark",
                   title=plot_title)
fig.update_traces(marker_color = ['salmon', 'burlywood', 'fuchsia', 'forestgreen',
                                  'purple', 'sienna'], marker_line_width = 1,
                 marker_line_color = 'darkslategrey')
fig.update_xaxes(title = 'Emotions')
fig.update_layout(showlegend=False)
fig.show()

In [18]:
plot_title = f"Proportion Of Sentiments"
fig = px.pie( values=sentiment['sentiment'].value_counts(), 
              names=sentiment['sentiment'].value_counts().index.unique(), 
              template="plotly_dark",
              title=plot_title)
fig.update_traces(marker_colors = ['salmon', 'forestgreen'], textinfo = 'label+percent',
                  marker_line_width = 2, marker_line_color = 'saddlebrown',
                  textfont_color = 'saddlebrown')
fig.update_layout(showlegend=False)
fig.show()

In [19]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [21]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy


In [22]:
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shamimkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
def remove_html(text):
  bs = BeautifulSoup(text, 'html.parser')
  return ' ' + bs.get_text() + ' '

def keep_only_letters(text):
  text = re.sub(r'[^a-zA-Z\s]', ' ', text)
  return text

def convert_to_lowercase(text):
  return text.lower()

def clean_reviews(text):
  text = remove_html(text)
  text = keep_only_letters(text)
  text = convert_to_lowercase(text)
  return text


emotion['sentence'] = emotion['sentence'].apply(clean_reviews)
sentiment['sentene'] = sentiment['sentence'].apply(clean_reviews)

In [24]:
english_stop_words = nltk.corpus.stopwords.words('english')

print(f"Total number of English stopwords: {len(english_stop_words)}")
print("\nFirst 20 stopwords:")
print(english_stop_words[:20])

Total number of English stopwords: 198

First 20 stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been']


In [25]:
def remove_stop_words(text):
  for stopword in english_stop_words:
    stopword = ' ' + stopword + ' '
    text = text.replace(stopword, ' ')
    return text

emotion['sentence'] = emotion['sentence'].apply(remove_stop_words)
sentiment['sentence'] = sentiment['sentence'].apply(remove_stop_words)

In [26]:
def text_stemming(text):
  stemmer = nltk.porter.PorterStemmer()
  stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
  return stemmed

emotion['sentence'] = emotion['sentence'].apply(text_stemming)
sentiment['sentence'] = sentiment['sentence'].apply(text_stemming)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import sklearn

In [28]:
emotion_train = emotion[:40000]
emotion_test = emotion[40000:]

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf = True, ngram_range=(1,1))
tfidf_features_train = vectorizer.fit_transform(emotion['sentence'])
tfidf_features_test = vectorizer.transform(emotion['sentence'])
print(tfidf_features_train.shape, tfidf_features_test.shape)

(416123, 51907) (416123, 51907)


In [29]:
sentiment_train = sentiment[:1500]
sentiment_test = sentiment[1500:]

vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf = True, ngram_range=(1,1))
tfidf_sfeatures_train = vectorizer.fit_transform(sentiment['sentence'])
tfidf_sfeatures_test = vectorizer.transform(sentiment['sentence'])
print(tfidf_sfeatures_train.shape, tfidf_sfeatures_test.shape)

(3290, 5417) (3290, 5417)


In [30]:
emotion_sentences = emotion.iloc[:, 0]
emotion_labels = emotion.iloc[:, 1]

print(len(emotion_sentences), len(emotion_labels))

416123 416123


In [31]:
sentiment_sentences = sentiment.iloc[:, 0]
sentiment_labels = sentiment.iloc[:, 1]

print(len(sentiment_sentences), len(sentiment_labels))

3290 3290


In [32]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(tfidf_features_train, emotion_labels)


lbfgs failed to converge after 100 iteration(s) (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [33]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [34]:
predictions = clf.predict(tfidf_features_test)
print(classification_report(emotion_labels, predictions))
print(confusion_matrix(emotion_labels, predictions))

              precision    recall  f1-score   support

       anger       0.89      0.89      0.89     57235
        fear       0.85      0.85      0.85     47664
         joy       0.90      0.92      0.91    140779
        love       0.81      0.70      0.75     34497
         sad       0.92      0.93      0.93    120989
     suprise       0.78      0.72      0.75     14959

    accuracy                           0.89    416123
   macro avg       0.86      0.84      0.85    416123
weighted avg       0.89      0.89      0.89    416123

[[ 51102   1530   1423    306   2761    113]
 [  1647  40729   1297    225   2268   1498]
 [  1171   1030 130169   4582   2765   1062]
 [   371    255   8613  24061   1089    108]
 [  2813   2022   2574    599 112645    336]
 [   127   2249   1293    106    389  10795]]
