In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.metrics import f1_score,accuracy_score,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from wordcloud import WordCloud

### Load Data

In [2]:
# Read both splits as ';'-separated text; column names are supplied explicitly,
# so the first line of each file is treated as data rather than as a header.
train_df = pd.read_csv('./data/train.txt',sep=";",names=["text","emotion"])
test_df = pd.read_csv('./data/test.txt',sep=";",names=["text","emotion"])

In [3]:
# preview the first rows of the training split
train_df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# preview the first rows of the testing split
test_df.head()

Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness


### Dataset preprocessing

In [5]:
# compare the number of rows in the two splits
print(f'Training data: {len(train_df)}\nTesting data: {len(test_df)}')

Training data: 16000
Testing data: 2000


##### Testing data is insufficient, so both datasets will be combined to provide sufficient training data

In [6]:
# stack the two splits row-wise; ignore_index=True renumbers the combined rows from 0
full_df = pd.concat([train_df,test_df],axis=0,ignore_index=True)

In [7]:
# inspect the combined frame
full_df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
17995,i just keep feeling like someone is being unki...,anger
17996,im feeling a little cranky negative after this...,anger
17997,i feel that i am useful to my people and that ...,joy
17998,im feeling more comfortable with derby i feel ...,joy


In [8]:
# count rows per emotion label to see how balanced the classes are
full_df.emotion.value_counts()

emotion
joy         6057
sadness     5247
anger       2434
fear        2161
love        1463
surprise     638
Name: count, dtype: int64

In [9]:
# count missing values in each column
full_df.isnull().sum()

text       0
emotion    0
dtype: int64

In [10]:
# count rows that exactly repeat an earlier row (same text and same emotion)
full_df.duplicated().sum()

1

In [11]:
# Remove exact duplicate rows, keeping the first occurrence of each, then
# renumber the index so it stays contiguous after the removal.
# drop_duplicates() replaces the manual `duplicated() == True` lookup, and
# rebinding full_df avoids mutating the frame in place.
full_df = full_df.drop_duplicates().reset_index(drop=True)

### Text processing


In [12]:
# find rows whose text contains the substring 'http', which signals leftover link/markup fragments
abnormal = full_df[full_df['text'].str.contains('http')]
abnormal

Unnamed: 0,text,emotion
125,i feel they are pretty safe on my blog img src...,joy
323,i stopped feeling so exhausted a href http pro...,sadness
462,i feel so dazed a href http twitter,surprise
866,i feel unwelcome at work sometimes and think p...,sadness
967,i a href http feeling groggy,sadness
...,...,...
17601,im feeling cranky a href http doingaone eighty,anger
17645,i feel special a href http facsimilogos,joy
17657,i feel numb the end of the world as we know it...,sadness
17854,i really need to find my nitch up here in vt i...,sadness


In [13]:
# Tokens to remove: blog, src, img, href, http, https, twitter
# (leftover link/markup fragments found in the text column).
def text_process_1(text):
    """Remove leftover markup/link tokens from ``text``.

    Only whole words are removed, so ordinary words that merely contain one
    of the tokens (e.g. "blogging") are left intact, and "https" is removed
    completely instead of leaving a stray "s" behind. Whitespace runs left by
    the removals are collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The sentence without the markup tokens, single-space separated.
    """
    # \b anchors every alternative to word boundaries; `https?` covers both
    # schemes (in a plain `http|https` alternation `https` is never reached).
    pattern = r"\b(?:blog|src|img|href|https?|twitter)\b"
    result = re.sub(pattern,"",text)
    # collapse the gaps left behind by the removed tokens
    return " ".join(result.split())

In [14]:
# Shared NLP resources, built once and reused for every row.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def text_process_2(text):
    """Tokenize ``text``, drop English stopwords and lemmatize what remains.

    Stopword matching is done on the lower-cased token; the token itself is
    lemmatized as-is. Returns the surviving lemmas joined by single spaces.
    """
    lemmas = []
    for word in word_tokenize(text):
        if word.lower() in stop_words:
            continue  # stopword: skip it
        lemmas.append(lemmatizer.lemmatize(word))
    return " ".join(lemmas)

In [15]:
# apply text_process_1 to every row; the 'text' column is overwritten with the result
full_df['text'] = full_df['text'].apply(text_process_1)

In [16]:
# apply text_process_2 to every row; the 'text' column is overwritten with the result
full_df['text'] = full_df['text'].apply(text_process_2)

In [17]:
# final processed dataframe
full_df

Unnamed: 0,text,emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger
...,...,...
17994,keep feeling like someone unkind wrong think g...,anger
17995,im feeling little cranky negative doctor appoi...,anger
17996,feel useful people give great feeling achievement,joy
17997,im feeling comfortable derby feel though start...,joy
