In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving clean_data.csv to clean_data.csv
User uploaded file "clean_data.csv" with length 909558 bytes


In [85]:
df = pd.read_csv("clean_data.csv")

In [86]:
df.head()

Unnamed: 0,sentiment,text
0,joy,on days when i feel close to my partner and ot...
1,fear,every time i imagine that someone i love or i ...
2,anger,when i had been obviously unjustly treated and...
3,sadness,when i think about the short time that we live...
4,disgust,at a gathering i found myself involuntarily si...


In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  7565 non-null   object
 1   text       7565 non-null   object
dtypes: object(2)
memory usage: 118.3+ KB


## The imported data has already been pre-processed to some extent, with all special characters removed and and all words converted to lower case. We shall now proceed by removing stopwords from the text 

In [88]:
#ensuring all data is of string type
for i in range(0,len(df)-1):
    if type(df.iloc[i]['text']) != str:
        df.iloc[i]['text'] = str(df.iloc[i]['text'])

In [89]:
df.head()

Unnamed: 0,sentiment,text
0,joy,on days when i feel close to my partner and ot...
1,fear,every time i imagine that someone i love or i ...
2,anger,when i had been obviously unjustly treated and...
3,sadness,when i think about the short time that we live...
4,disgust,at a gathering i found myself involuntarily si...


## Removing stopwords

In [90]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
stop = stopwords.words('english')

In [92]:
df.head()

Unnamed: 0,sentiment,text
0,joy,on days when i feel close to my partner and ot...
1,fear,every time i imagine that someone i love or i ...
2,anger,when i had been obviously unjustly treated and...
3,sadness,when i think about the short time that we live...
4,disgust,at a gathering i found myself involuntarily si...


In [93]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [94]:
tokens = df.apply(lambda x : word_tokenize(x['text']), axis=1)

In [95]:
tokens

0       [on, days, when, i, feel, close, to, my, partn...
1       [every, time, i, imagine, that, someone, i, lo...
2       [when, i, had, been, obviously, unjustly, trea...
3       [when, i, think, about, the, short, time, that...
4       [at, a, gathering, i, found, myself, involunta...
                              ...                        
7560    [two, years, back, someone, invited, me, to, b...
7561    [i, had, taken, the, responsibility, to, do, s...
7562    [i, was, at, home, and, i, heard, a, loud, sou...
7563    [i, did, not, do, the, homework, that, the, te...
7564    [i, had, shouted, at, my, younger, brother, an...
Length: 7565, dtype: object

In [96]:
nltk.download('averaged_perceptron_tagger')
# tagged_tokens = nltk.pos_tag(tokens[0])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [97]:
sentence = "i am loving it"
nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

In [98]:
nltk_tagged

[('i', 'NN'), ('am', 'VBP'), ('loving', 'VBG'), ('it', 'PRP')]

In [99]:
# tagged_tokens

In [100]:
tagged_tokens = []
for i in range(0, len(tokens) ):
    tagged_tokens += [nltk.pos_tag(tokens[i])]

In [101]:
# tagged_tokens

In [102]:
df['tagged_tokens'] = tagged_tokens

In [103]:
df.head()

Unnamed: 0,sentiment,text,tagged_tokens
0,joy,on days when i feel close to my partner and ot...,"[(on, IN), (days, NNS), (when, WRB), (i, NN), ..."
1,fear,every time i imagine that someone i love or i ...,"[(every, DT), (time, NN), (i, JJ), (imagine, V..."
2,anger,when i had been obviously unjustly treated and...,"[(when, WRB), (i, NN), (had, VBD), (been, VBN)..."
3,sadness,when i think about the short time that we live...,"[(when, WRB), (i, NN), (think, VBP), (about, I..."
4,disgust,at a gathering i found myself involuntarily si...,"[(at, IN), (a, DT), (gathering, NN), (i, NN), ..."


In [104]:
df['tokens'] = tokens

In [105]:
df.head()

Unnamed: 0,sentiment,text,tagged_tokens,tokens
0,joy,on days when i feel close to my partner and ot...,"[(on, IN), (days, NNS), (when, WRB), (i, NN), ...","[on, days, when, i, feel, close, to, my, partn..."
1,fear,every time i imagine that someone i love or i ...,"[(every, DT), (time, NN), (i, JJ), (imagine, V...","[every, time, i, imagine, that, someone, i, lo..."
2,anger,when i had been obviously unjustly treated and...,"[(when, WRB), (i, NN), (had, VBD), (been, VBN)...","[when, i, had, been, obviously, unjustly, trea..."
3,sadness,when i think about the short time that we live...,"[(when, WRB), (i, NN), (think, VBP), (about, I...","[when, i, think, about, the, short, time, that..."
4,disgust,at a gathering i found myself involuntarily si...,"[(at, IN), (a, DT), (gathering, NN), (i, NN), ...","[at, a, gathering, i, found, myself, involunta..."


In [106]:
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [107]:
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [108]:
cf = df
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [109]:
cf['text'] = cf['text'].apply(lambda x : lemmatize_sentence(x))

In [110]:
cf.head()

Unnamed: 0,sentiment,text,tagged_tokens,tokens
0,joy,on day when i feel close to my partner and oth...,"[(on, IN), (days, NNS), (when, WRB), (i, NN), ...","[on, days, when, i, feel, close, to, my, partn..."
1,fear,every time i imagine that someone i love or i ...,"[(every, DT), (time, NN), (i, JJ), (imagine, V...","[every, time, i, imagine, that, someone, i, lo..."
2,anger,when i have be obviously unjustly treat and ha...,"[(when, WRB), (i, NN), (had, VBD), (been, VBN)...","[when, i, had, been, obviously, unjustly, trea..."
3,sadness,when i think about the short time that we live...,"[(when, WRB), (i, NN), (think, VBP), (about, I...","[when, i, think, about, the, short, time, that..."
4,disgust,at a gathering i find myself involuntarily sit...,"[(at, IN), (a, DT), (gathering, NN), (i, NN), ...","[at, a, gathering, i, found, myself, involunta..."


In [111]:
df['text'] = cf['text']

In [112]:
ef = df
ef['text'] = ef['text'].apply(lambda x : " ".join(lemmatizer.lemmatize(word) for word in x.split()))

In [113]:
ef.head()

Unnamed: 0,sentiment,text,tagged_tokens,tokens
0,joy,on day when i feel close to my partner and oth...,"[(on, IN), (days, NNS), (when, WRB), (i, NN), ...","[on, days, when, i, feel, close, to, my, partn..."
1,fear,every time i imagine that someone i love or i ...,"[(every, DT), (time, NN), (i, JJ), (imagine, V...","[every, time, i, imagine, that, someone, i, lo..."
2,anger,when i have be obviously unjustly treat and ha...,"[(when, WRB), (i, NN), (had, VBD), (been, VBN)...","[when, i, had, been, obviously, unjustly, trea..."
3,sadness,when i think about the short time that we live...,"[(when, WRB), (i, NN), (think, VBP), (about, I...","[when, i, think, about, the, short, time, that..."
4,disgust,at a gathering i find myself involuntarily sit...,"[(at, IN), (a, DT), (gathering, NN), (i, NN), ...","[at, a, gathering, i, found, myself, involunta..."


In [114]:
df['text'] = df['text'].apply(lambda x : " ".join(word for word in x.split() if word not in stop))

In [115]:
print(df['text'].tail(), "\n")
print(ef['text'].tail())

7560    two year back someone invite tutor granddaught...
7561    take responsibility something prepare however ...
7562    home hear loud sound spit outside door think o...
7563             homework teacher ask u scold immediately
7564        shout young brother always afraid call loudly
Name: text, dtype: object 

7560    two year back someone invite tutor granddaught...
7561    take responsibility something prepare however ...
7562    home hear loud sound spit outside door think o...
7563             homework teacher ask u scold immediately
7564        shout young brother always afraid call loudly
Name: text, dtype: object


In [116]:
df.head()

Unnamed: 0,sentiment,text,tagged_tokens,tokens
0,joy,day feel close partner friend feel peace also ...,"[(on, IN), (days, NNS), (when, WRB), (i, NN), ...","[on, days, when, i, feel, close, to, my, partn..."
1,fear,every time imagine someone love could contact ...,"[(every, DT), (time, NN), (i, JJ), (imagine, V...","[every, time, i, imagine, that, someone, i, lo..."
2,anger,obviously unjustly treat possibility elucidate,"[(when, WRB), (i, NN), (had, VBD), (been, VBN)...","[when, i, had, been, obviously, unjustly, trea..."
3,sadness,think short time live relate period life think...,"[(when, WRB), (i, NN), (think, VBP), (about, I...","[when, i, think, about, the, short, time, that..."
4,disgust,gathering find involuntarily sit next two peop...,"[(at, IN), (a, DT), (gathering, NN), (i, NN), ...","[at, a, gathering, i, found, myself, involunta..."


In [117]:
df.drop(columns = ['tagged_tokens', 'tokens'], inplace = True)

In [118]:
df.head()

Unnamed: 0,sentiment,text
0,joy,day feel close partner friend feel peace also ...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treat possibility elucidate
3,sadness,think short time live relate period life think...
4,disgust,gathering find involuntarily sit next two peop...


In [119]:
from sklearn.model_selection import train_test_split

In [120]:
df['text'] = df.apply(lambda x : word_tokenize(x['text']), axis=1)
df.head()

Unnamed: 0,sentiment,text
0,joy,"[day, feel, close, partner, friend, feel, peac..."
1,fear,"[every, time, imagine, someone, love, could, c..."
2,anger,"[obviously, unjustly, treat, possibility, eluc..."
3,sadness,"[think, short, time, live, relate, period, lif..."
4,disgust,"[gathering, find, involuntarily, sit, next, tw..."


In [121]:
df['text']=[" ".join(x) for x in df['text'].values]

In [122]:
from sklearn.preprocessing import LabelEncoder

In [123]:
le = LabelEncoder()

In [124]:
x = df['text']

In [125]:
x

0       day feel close partner friend feel peace also ...
1       every time imagine someone love could contact ...
2          obviously unjustly treat possibility elucidate
3       think short time live relate period life think...
4       gathering find involuntarily sit next two peop...
                              ...                        
7560    two year back someone invite tutor granddaught...
7561    take responsibility something prepare however ...
7562    home hear loud sound spit outside door think o...
7563             homework teacher ask u scold immediately
7564        shout young brother always afraid call loudly
Name: text, Length: 7565, dtype: object

In [126]:
y = df['sentiment']

In [127]:
y

0           joy
1          fear
2         anger
3       sadness
4       disgust
         ...   
7560      anger
7561    sadness
7562    disgust
7563      shame
7564      guilt
Name: sentiment, Length: 7565, dtype: object

In [128]:
y.value_counts()

joy        1088
fear       1088
anger      1085
sadness    1081
disgust    1079
guilt      1074
shame      1070
Name: sentiment, dtype: int64

In [129]:
xtrain, xtest, ytrain, ytest = train_test_split(df['text'], df['sentiment'], random_state = 100, stratify = df['sentiment'])
ytrain

5855      shame
4680      guilt
1078        joy
508     sadness
3966       fear
         ...   
7097      anger
3818      anger
5028    disgust
3246    disgust
2518    disgust
Name: sentiment, Length: 5673, dtype: object

In [130]:
ytrain = le.fit_transform(ytrain)
ytest = le.transform(ytest)

In [131]:
ytrain

array([6, 3, 4, ..., 1, 1, 1])

In [132]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [134]:
t = TfidfVectorizer(analyzer = 'word', token_pattern=r'\w{1,}', max_features=5000)

In [135]:
t = t.fit(df['text'])

In [136]:
xtrain_vect = t.transform(xtrain)

In [137]:
xtest_vect = t.transform(xtest)

In [138]:
tree_model = DecisionTreeClassifier().fit(xtrain_vect, ytrain)

In [139]:
tree_pred = tree_model.predict(xtest_vect)

In [140]:
tree_cr = classification_report(ytest, tree_pred)

In [141]:
print(tree_cr)

              precision    recall  f1-score   support

           0       0.34      0.37      0.35       271
           1       0.44      0.48      0.46       270
           2       0.49      0.58      0.53       272
           3       0.42      0.35      0.38       269
           4       0.57      0.60      0.59       272
           5       0.54      0.48      0.51       270
           6       0.43      0.39      0.41       268

    accuracy                           0.46      1892
   macro avg       0.46      0.46      0.46      1892
weighted avg       0.46      0.46      0.46      1892



In [142]:
from sklearn.ensemble import RandomForestClassifier

In [143]:
rf_model = RandomForestClassifier().fit(xtrain_vect, ytrain)

In [144]:
rf_pred = rf_model.predict(xtest_vect)
rf_cr = classification_report(ytest, rf_pred)
print(rf_cr)

              precision    recall  f1-score   support

           0       0.42      0.39      0.41       271
           1       0.56      0.56      0.56       270
           2       0.55      0.74      0.63       272
           3       0.50      0.41      0.45       269
           4       0.60      0.69      0.64       272
           5       0.65      0.59      0.62       270
           6       0.54      0.44      0.48       268

    accuracy                           0.55      1892
   macro avg       0.54      0.55      0.54      1892
weighted avg       0.54      0.55      0.54      1892



In [145]:
from sklearn.svm import SVC

In [146]:
svm_model = SVC(decision_function_shape = 'ovr').fit(xtrain_vect, ytrain)
svm_pred = svm_model.predict(xtest_vect)
svm_cr =  classification_report(ytest, svm_pred)
print(svm_cr)

              precision    recall  f1-score   support

           0       0.37      0.51      0.43       271
           1       0.57      0.60      0.58       270
           2       0.70      0.64      0.67       272
           3       0.45      0.41      0.43       269
           4       0.66      0.71      0.68       272
           5       0.72      0.54      0.62       270
           6       0.50      0.46      0.48       268

    accuracy                           0.55      1892
   macro avg       0.57      0.55      0.55      1892
weighted avg       0.57      0.55      0.56      1892



In [147]:
from xgboost import XGBClassifier

In [148]:
x_model = XGBClassifier().fit(xtrain_vect, ytrain)

In [149]:
x_pred = x_model.predict(xtest_vect)
x_cr = classification_report(ytest, x_pred)
print(x_cr)

              precision    recall  f1-score   support

           0       0.31      0.51      0.38       271
           1       0.58      0.51      0.54       270
           2       0.69      0.61      0.65       272
           3       0.47      0.41      0.44       269
           4       0.59      0.68      0.63       272
           5       0.65      0.54      0.59       270
           6       0.54      0.41      0.47       268

    accuracy                           0.52      1892
   macro avg       0.55      0.52      0.53      1892
weighted avg       0.55      0.52      0.53      1892



In [150]:
import re
def text_cleaner(sentence):
    s = {'v2' : [str(sentence)]}
    ef = pd.DataFrame(data = s)
    ef['v2'] = ef['v2'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    ef['v2'] = ef['v2'].str.replace('[^\w\s]','')
    ef['v2'] = ef['v2'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    ef['v2'] = ef['v2'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
    ef['v2'] = ef.apply(lambda x: word_tokenize(x['v2']), axis=1)
    ef['v2'] = [" ".join(x) for x in ef['v2'].values]
    x = t.transform(ef['v2'])
    return x

In [151]:
s1 = "i am angry with this phone"
print(svm_model.predict(text_cleaner(s1)))
print(rf_model.predict(text_cleaner(s1)))

[0]
[0]


In [152]:
from sklearn.svm import LinearSVC

In [153]:
linear_svm = LinearSVC().fit(xtrain_vect, ytrain)
lsvm_pred = linear_svm.predict(xtest_vect)
lsvm_cr = classification_report(ytest, lsvm_pred)

In [154]:
print(lsvm_cr)

              precision    recall  f1-score   support

           0       0.41      0.46      0.43       271
           1       0.58      0.58      0.58       270
           2       0.68      0.68      0.68       272
           3       0.41      0.39      0.40       269
           4       0.63      0.70      0.66       272
           5       0.64      0.60      0.62       270
           6       0.52      0.46      0.48       268

    accuracy                           0.55      1892
   macro avg       0.55      0.55      0.55      1892
weighted avg       0.55      0.55      0.55      1892



In [155]:
from sklearn.multiclass import OneVsRestClassifier

In [156]:
m = OneVsRestClassifier(RandomForestClassifier()).fit(xtrain_vect, ytrain)
m_pred = m.predict(xtest_vect)
m_cr = classification_report(ytest, m_pred)
print(m_cr)

              precision    recall  f1-score   support

           0       0.44      0.41      0.42       271
           1       0.61      0.57      0.59       270
           2       0.59      0.76      0.66       272
           3       0.48      0.39      0.43       269
           4       0.59      0.69      0.64       272
           5       0.62      0.61      0.62       270
           6       0.55      0.47      0.50       268

    accuracy                           0.56      1892
   macro avg       0.55      0.56      0.55      1892
weighted avg       0.55      0.56      0.55      1892



In [157]:
sentiments = ['joy', 'anger', 'fear', 'disgust', 'guilt', 'shame', 'sadness']
se = le.transform(sentiments)
se

array([4, 0, 2, 1, 3, 6, 5])

In [158]:
y.value_counts()

joy        1088
fear       1088
anger      1085
sadness    1081
disgust    1079
guilt      1074
shame      1070
Name: sentiment, dtype: int64

In [159]:
print(rf_model.predict(text_cleaner("I am guilty at this phone")))

[3]


In [160]:
print(m.predict(text_cleaner("this phone was real")))

[2]


In [161]:
import spacy

In [162]:
nlp = spacy.load('en')

In [163]:
text = "Nokia 6.1 plus has a bad display"
text = nlp(text)

In [164]:
labels = set([w.label_ for w in text.ents]) 

In [165]:
def cleanup(token, lower = True):
    if lower:
       token = token.lower()
    return token.strip()

In [166]:
for label in labels: 
    entities = [cleanup(e.string, lower=False) for e in text.ents if label==e.label_] 
    entities = list(set(entities)) 
    print(label,entities)

ORG ['Nokia 6.1']


The OneVsRestClassifier method has given us the best results till now, so we will proceed to save that model for future use.

In [167]:
import pickle

In [173]:
pickle.dump(m, open("onevsrestclassifier_rf.pickle", "wb"))

In [174]:
from google.colab import drive
drive.mount('drive')

Mounted at drive


In [175]:
!cp onevsrestclassifier_rf.pickle "drive/My Drive/"