In [1]:
import pandas as pd
import re
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import pandas as pd
import numpy as np
import nltk
# text preprocessing
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# feature extraction / vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# save and load a file
import pickle

In [2]:
df000 = pd.read_csv('isear.csv')
df000['Emotion'].unique()

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'],
      dtype=object)

In [3]:
df000

Unnamed: 0,Emotion,Text
0,joy,"During the period of falling in love, each tim..."
1,fear,When I was involved in a traffic accident.
2,anger,When I was driving home after several days of...
3,sadness,When I lost the person who meant the most to me.
4,disgust,The time I knocked a deer down - the sight of ...
...,...,...
7468,anger,Two years back someone invited me to be the tu...
7469,sadness,I had taken the responsibility to do something...
7470,disgust,I was at home and I heard a loud sound of spit...
7471,shame,I did not do the homework that the teacher had...


In [4]:
df00 = pd.read_csv('text_emotion.csv')
df00['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [5]:
df00 = df00.drop(['tweet_id','author'], axis = 1)
df00['Emotion'] = df00['sentiment']
df00['Text'] = df00['content']
df00 = df00.drop(['content','sentiment'], axis = 1)
df00['Emotion'] = df00['Emotion'].replace(['empty'],['neutral'])
df00['Emotion'] = df00['Emotion'].replace(['love'],['joy'])
df00['Emotion'] = df00['Emotion'].replace(['fun'],['joy'])

In [6]:
df0 = pd.read_csv('emotion-stimulus.csv')
df0['Emotion'].unique()

array(['happy', 'sad', 'surprise', 'disgust', 'anger', 'fear', 'shame'],
      dtype=object)

In [7]:
df0['Emotion'] = df0['Emotion'].replace(['happy'],['joy'])
df0

Unnamed: 0,Emotion,Text
0,joy,I suppose I am happy being so ` tiny' ; it mea...
1,joy,Lennox has always truly wanted to fight for th...
2,joy,"He was a professional musician now , still sen..."
3,joy,Holmes is happy having the freedom of the hous...
4,joy,I had problems with tutors trying to encourage...
...,...,...
2409,shame,He gets real humiliated and has to leave .
2410,shame,They aimed for higher status jobs and felt hum...
2411,shame,He cursed his lack of self-control ; he knew t...
2412,shame,Sometimes I've thought I 'll never forget wha...


In [8]:
df1 = pd.read_csv('data_test.csv')
df1.head()

Unnamed: 0,Emotion,Text
0,sadness,I experienced this emotion when my grandfather...
1,neutral,"when I first moved in , I walked everywhere ...."
2,anger,"` Oh ! "" she bleated , her voice high and rath..."
3,fear,"However , does the right hon. Gentleman recogn..."
4,sadness,My boyfriend didn't turn up after promising th...


In [9]:
df1['Emotion'].unique()

array(['sadness', 'neutral', 'anger', 'fear', 'joy'], dtype=object)

In [10]:
def preprocess_and_tokenize(data):    

    #remove html markup
    data = re.sub("(<.*?>)", "", data)

    #remove urls
    data = re.sub(r'http\S+', '', data)
    
    #remove hashtags and @names
    data= re.sub(r"(#[\d\w\.]+)", '', data)
    data= re.sub(r"(@[\d\w\.]+)", '', data)

    #remove punctuation and non-ascii digits
    data = re.sub("(\\W|\\d)", " ", data)
    
    #remove whitespace
    data = data.strip()
    
    # tokenization with nltk
    data = word_tokenize(data)
    
    # stemming with nltk
    #porter = PorterStemmer()
    #stem_data = [porter.stem(word) for word in data]
        
    WNlemma = nltk.WordNetLemmatizer()
    stem_data = [WNlemma.lemmatize(t) for t in data]    
    return data

In [11]:
def join(text):
    text2 = ' '.join(text)
    return text2    

In [12]:
df2 = pd.read_csv('data_train.csv')
df2.head()

Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...


In [13]:
df = df1.append(df2).append(df0).append(df00).append(df000)
df['Emotion'] = df['Emotion'].replace(['sadness','happiness'],['sad','joy'])
df = df[df['Emotion'] != 'relief']
df['Emotion'].unique()

array(['sad', 'neutral', 'anger', 'fear', 'joy', 'surprise', 'disgust',
       'shame', 'enthusiasm', 'worry', 'hate', 'boredom', 'guilt'],
      dtype=object)

In [14]:
df['Text'] = df['Text'].str.lower()

In [15]:
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return text

In [16]:
df['Text'] = df['Text'].apply(convert_emojis)

In [17]:
df['Text'] = df['Text'].apply(preprocess_and_tokenize)
df['Text'] = df['Text'].apply(join)
df

Unnamed: 0,Emotion,Text
0,sad,i experienced this emotion when my grandfather...
1,neutral,when i first moved in i walked everywhere but ...
2,anger,oh she bleated her voice high and rather indig...
3,fear,however does the right hon gentleman recognise...
4,sad,my boyfriend didn t turn up after promising th...
...,...,...
7468,anger,two years back someone invited me to be the tu...
7469,sad,i had taken the responsibility to do something...
7470,disgust,i was at home and i heard a loud sound of spit...
7471,shame,i did not do the homework that the teacher had...


In [18]:
f1 = pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T05_11_55.txt')

In [19]:
test = f1['text'].apply(convert_emojis)

In [20]:
test

0        Curve flattening? Kenya records 48 new virus c...
1        Victoria and Melbourne Covid trend map: where ...
2        NSW and Sydney Covid trend map: where coronavi...
3        IT’S BAKE OFF DAY! raising_handsmedium-light_s...
4        @DanielAndrewsMP The Liberal party bots are ou...
                               ...                        
17593    Kanyakumari District Covid19 Updates:\n\nCoron...
17594    Coimbatore District Covid19 Updates:\n\nCorona...
17595    @drharshvardhan Ji\n\nThink of these Corona wa...
17596    Health Minister Harsh Vardhan says, corona vac...
17597    @Ravi_freebird @WildCat_Mahi @loverlady12345 @...
Name: text, Length: 17598, dtype: object

In [21]:
test = test.apply(preprocess_and_tokenize)
test = test.apply(join)

In [22]:
df['Emotion'].unique()

array(['sad', 'neutral', 'anger', 'fear', 'joy', 'surprise', 'disgust',
       'shame', 'enthusiasm', 'worry', 'hate', 'boredom', 'guilt'],
      dtype=object)

In [23]:
df['Lables'] = df['Emotion'].replace(df['Emotion'].unique(), [0,1,2,3,4,5,6,7,8,9,10,11,12])

In [24]:
df

Unnamed: 0,Emotion,Text,Lables
0,sad,i experienced this emotion when my grandfather...,0
1,neutral,when i first moved in i walked everywhere but ...,1
2,anger,oh she bleated her voice high and rather indig...,2
3,fear,however does the right hon gentleman recognise...,3
4,sad,my boyfriend didn t turn up after promising th...,0
...,...,...,...
7468,anger,two years back someone invited me to be the tu...,2
7469,sad,i had taken the responsibility to do something...,0
7470,disgust,i was at home and i heard a loud sound of spit...,6
7471,shame,i did not do the homework that the teacher had...,7


In [25]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [26]:
df['Text'] = df['Text'].apply(reduce_lengthening)

In [27]:
test = test.apply(reduce_lengthening)

In [28]:
X_train = df['Text']
X_train.head()

0    i experienced this emotion when my grandfather...
1    when i first moved in i walked everywhere but ...
2    oh she bleated her voice high and rather indig...
3    however does the right hon gentleman recognise...
4    my boyfriend didn t turn up after promising th...
Name: Text, dtype: object

In [29]:
y_train = df['Lables']
y_train

0        0
1        1
2        2
3        3
4        0
        ..
7468     2
7469     0
7470     6
7471     7
7472    12
Name: Lables, Length: 59688, dtype: int64

In [30]:
X_test = test

In [31]:
X_train = X_train.reset_index?

In [None]:
X_train = X_train.reset_index

In [32]:
X_train = X_train.reset_index(drop = True)
#X_train = X_train['Text']
X_train

0        i experienced this emotion when my grandfather...
1        when i first moved in i walked everywhere but ...
2        oh she bleated her voice high and rather indig...
3        however does the right hon gentleman recognise...
4        my boyfriend didn t turn up after promising th...
                               ...                        
59683    two years back someone invited me to be the tu...
59684    i had taken the responsibility to do something...
59685    i was at home and i heard a loud sound of spit...
59686    i did not do the homework that the teacher had...
59687    i had shouted at my younger brother and he was...
Name: Text, Length: 59688, dtype: object

In [40]:
from imblearn.under_sampling import ClusterCentroids
clc = ClusterCentroids()
X_train2,y_train2 = clc.fit_resample(X = X_train_vectorized[:-1000],y = y_train[:-1000])
X_val = X_train_vectorized[-1000:]
y_val = y_train[-1000:]
#clc.fit_resample?

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train[:-1000])

X_train_vectorized = vect.transform(X_train)

In [62]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver = 'liblinear',max_iter = 1000, C = 100)
model.fit(X_train2, y_train2)

#predictions = model.predict(vect.transform(X_test))

LogisticRegression(C=100, max_iter=1000, solver='liblinear')

In [73]:
from imblearn.under_sampling import CondensedNearestNeighbour
clc = CondensedNearestNeighbour()
X_train3,y_train3 = clc.fit_resample(X = X_train_vectorized[:-1000],y = y_train[:-1000])
X_val2 = X_train_vectorized[-1000:]
y_val2 = y_train[-1000:]
#clc.fit_resample?

In [74]:
from sklearn.linear_model import LogisticRegression

model2 = LogisticRegression(max_iter = 1000, C = 40)
model2.fit(X_train3, y_train3)

LogisticRegression(C=40, max_iter=1000)

In [75]:
model2.score(X_train3,y_train3)

0.9861881139966929

In [76]:
model2.score(X_val2,y_val2)

0.51

In [65]:
nb = MultinomialNB()

nb.fit(X_train2, y_train2)

#ynb_pred = nb.predict(X_test_vect)


MultinomialNB()

In [68]:
nb.score(X_train2, y_train2)

0.6102277610657499

In [69]:
nb.score(X_val, y_val)

0.365

In [109]:
rf = RandomForestClassifier()
rf.fit(X_train_vectorized[:-1000], y_train[:-1000])

RandomForestClassifier()

In [None]:
rf.score(X_train_vectorized[-1000:], y_train[-1000:])

In [70]:
svc = LinearSVC(tol=1e-05, max_iter = 1000)
svc.fit(X_train2, y_train2)

LinearSVC(tol=1e-05)

In [71]:
svc.score(X_train2,y_train2)

0.8899871078642029

In [72]:
svc.score(X_val,y_val)

0.385

In [115]:
test


0        Curve flattening Kenya records new virus cases...
1        Victoria and Melbourne Covid trend map where c...
2        NSW and Sydney Covid trend map where coronavir...
3        IT S BAKE OFF DAY raising_handsmedium light_sk...
4        The Liberal party bots are out in force even t...
                               ...                        
17593    Kanyakumari District Covid Updates Corona Spre...
17594    Coimbatore District Covid Updates Corona Sprea...
17595    Ji Think of these Corona warriors who died in ...
17596    Health Minister Harsh Vardhan says corona vacc...
17597    COVID Pandemic India ve cases increases with s...
Name: text, Length: 17598, dtype: object

In [129]:
f1 = pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T05_11_55.txt')
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T05_31_23.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T06_01_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T06_31_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T07_01_23.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T07_31_21.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T08_01_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T08_31_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T09_01_23.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T09_31_23.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T10_01_23.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T10_31_25.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T11_01_25.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T11_31_25.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T12_01_27.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T12_31_25.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T13_01_28.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T13_32_14.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T14_01_58.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T14_31_58.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T15_31_24.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T16_01_24.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T16_31_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T17_01_22.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T17_31_21.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T18_01_20.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T18_31_21.txt'))
f1 = f1.append(pd.read_json('Aithon level -3/aithon_level_3_2020-09-22T19_01_22.txt'))

In [122]:
f1 = f1.reset_index(drop = True)
f1

Unnamed: 0,text,location,date,time
0,Curve flattening? Kenya records 48 new virus c...,IN,Sep 22,05:08:45
1,Victoria and Melbourne Covid trend map: where ...,"Erbil, Iraq",Sep 22,05:08:34
2,NSW and Sydney Covid trend map: where coronavi...,"Melbourne, Australia",Sep 22,05:08:33
3,IT’S BAKE OFF DAY! 🙌🏼\n\nWho else will be tuni...,"Melbourne, Australia",Sep 22,05:06:02
4,@DanielAndrewsMP The Liberal party bots are ou...,Fareham,Sep 22,05:05:34
...,...,...,...,...
496443,@atomicallyblond @AEHALL1983 @piersmorgan If c...,Canada,Sep 19,16:31:50
496444,Here are few positive as well as negative impa...,,Sep 19,16:31:02
496445,@Terminator_BS_ Corona virus is a strand of vi...,Global,Sep 19,16:31:00
496446,Union Minister Dr. Jitendra Singh reviews COVI...,"Grove City, OH",Sep 19,16:30:46


In [123]:
test = f1['text'].apply(convert_emojis)
test = test.apply(preprocess_and_tokenize)
test = test.apply(join)
test = test.apply(reduce_lengthening)

In [124]:
predictions = svc.predict(vect.transform(test))

In [130]:
f1['Emotion'] = predictions

In [131]:
f1['Emotion'] = f1['Emotion'].replace([0,1,2,3,4,5,6,7,8,9,10,11,12], df['Emotion'].unique())

In [132]:
f1.to_csv('Classified.csv')

In [128]:
f1

0             joy
1             joy
2            fear
3             joy
4           anger
           ...   
496443        joy
496444      worry
496445      worry
496446        joy
496447    neutral
Name: Emotion, Length: 496448, dtype: object

In [142]:
#from wordcloud import WordCloud 
#import matplotlib.pyplot as plt
#all_words = ' '.join([tws for tws in f1['text']])
wordcloud = WordCloud(width = 1000, height = 600, random_state = 21, max_font_size = 119).generate(all_words)
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.savefig('wordcloud')

MemoryError: 

In [150]:
date = f1['date'].unique()
len(date)

10

In [151]:
f1

Unnamed: 0,text,location,date,time,Emotion
0,Curve flattening? Kenya records 48 new virus c...,IN,Sep 22,05:08:45,joy
1,Victoria and Melbourne Covid trend map: where ...,"Erbil, Iraq",Sep 22,05:08:34,joy
2,NSW and Sydney Covid trend map: where coronavi...,"Melbourne, Australia",Sep 22,05:08:33,fear
3,IT’S BAKE OFF DAY! 🙌🏼\n\nWho else will be tuni...,"Melbourne, Australia",Sep 22,05:06:02,joy
4,@DanielAndrewsMP The Liberal party bots are ou...,Fareham,Sep 22,05:05:34,anger
...,...,...,...,...,...
17601,@atomicallyblond @AEHALL1983 @piersmorgan If c...,Canada,Sep 19,16:31:50,joy
17602,Here are few positive as well as negative impa...,,Sep 19,16:31:02,worry
17603,@Terminator_BS_ Corona virus is a strand of vi...,Global,Sep 19,16:31:00,worry
17604,Union Minister Dr. Jitendra Singh reviews COVI...,"Grove City, OH",Sep 19,16:30:46,joy


In [154]:
f2 = f1[f1['date'] == date[0]]
f2.to_csv('1.csv')
f3 = f1[f1['date'] == date[1]]
f3.to_csv('2.csv')
f4 = f1[f1['date'] == date[2]]
f4.to_csv('3.csv')
f5 = f1[f1['date'] == date[3]]
f5.to_csv('4.csv')
f6 = f1[f1['date'] == date[4]]
f6.to_csv('5.csv')
f7 = f1[f1['date'] == date[5]]
f7.to_csv('6.csv')
f8 = f1[f1['date'] == date[6]]
f8.to_csv('7.csv')
f9 = f1[f1['date'] == date[7]]
f9.to_csv('8.csv')
f10 = f1[f1['date'] == date[8]]
f10.to_csv('9.csv')
f11 = f1[f1['date'] == date[9]]
f11.to_csv('10.csv')
