In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk 

In [2]:
df = pd.read_csv('mood_data.txt', names=['Text', 'Emotion'], sep=';')

In [3]:
df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.shape

(16000, 2)

In [5]:
# Load the required libraries for cleaning
import string,re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [6]:
# Create a function to generate cleaned data from raw text
def clean_text(mood):
    mood = word_tokenize(mood) # Create tokens
    mood= " ".join(mood) # Join tokens
    mood = [char for char in mood if char not in string.punctuation] # Remove punctuations
    mood = ''.join(mood) # Join the leters
    mood = [word for word in mood.split() if mood.lower() not in stopwords.words('english')] # Remove common english words (I, you, we,...)
    return " ".join(mood)

In [7]:
df['cleaned_text'] = df['Text'].apply(clean_text)
df

Unnamed: 0,Text,Emotion,cleaned_text
0,i didnt feel humiliated,sadness,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,i am feeling grouchy
...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,i just had a very brief time in the beanbag an...
15996,i am now turning and i feel pathetic that i am...,sadness,i am now turning and i feel pathetic that i am...
15997,i feel strong and good overall,joy,i feel strong and good overall
15998,i feel like this was such a rude comment and i...,anger,i feel like this was such a rude comment and i...


In [9]:
df["cleaned_text"].head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
Name: cleaned_text, dtype: object

In [11]:
features = df['cleaned_text']
processed_features = []

for sentence in range(0, len(features)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))
    
    # Remove single characters appearing in the text except the start
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    
    # Remove single characters appearing at the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    
    # Substitute multiple spaces with a single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    
    
    # Convert to lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [12]:
processed_features[:5]

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing minute to post feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace will know that it is still on the property',
 'i am feeling grouchy']

In [13]:
df['processed_text'] = processed_features
df

Unnamed: 0,Text,Emotion,cleaned_text,processed_text
0,i didnt feel humiliated,sadness,i didnt feel humiliated,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong,im grabbing minute to post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,i am feeling grouchy,i am feeling grouchy
...,...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,i just had a very brief time in the beanbag an...,i just had very brief time in the beanbag and ...
15996,i am now turning and i feel pathetic that i am...,sadness,i am now turning and i feel pathetic that i am...,i am now turning and feel pathetic that am sti...
15997,i feel strong and good overall,joy,i feel strong and good overall,i feel strong and good overall
15998,i feel like this was such a rude comment and i...,anger,i feel like this was such a rude comment and i...,i feel like this was such rude comment and im ...


In [14]:
final_df = df[["processed_text","Emotion"]]
final_df

Unnamed: 0,processed_text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing minute to post feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had very brief time in the beanbag and ...,sadness
15996,i am now turning and feel pathetic that am sti...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such rude comment and im ...,anger


In [15]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [16]:
def tokenize(text): 
    tk = TweetTokenizer()
    return tk.tokenize(text)

vectorizer = CountVectorizer(analyzer = 'word',tokenizer = tokenize,lowercase = True,ngram_range=(1, 1))

In [17]:
# Generate unique words from the processed data by applying Count Vectorizer along with TweetTokenizer
count= vectorizer.fit_transform(final_df['processed_text'])

In [18]:
count.shape

(16000, 15206)

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [20]:

X = df['processed_text'].values
y = df['Emotion'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=100, test_size=0.3)

In [21]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [22]:
# Print idf values
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(),columns=["idf_weights"])
# Sort ascending
df_idf.sort_values(by=['idf_weights'],ascending = False).head()



Unnamed: 0,idf_weights
blah,7.758809
chest,7.684701
pregnant,7.433387
computer,7.379319
dream,7.379319


In [23]:
mnb = MultinomialNB()
mnb.fit(X_train_idf, y_train)

In [24]:
pred_mnb = mnb.predict(X_test_idf)

# Calculate accuracy of predicted values
acc = accuracy_score(y_test, pred_mnb)


results = pd.DataFrame([['Multinomial Naive Bayes', acc]],
               columns = ['Model', 'Accuracy'])

print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.740625


In [25]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train_idf, y_train)

# Predict using testing data
y_pred_rf = clf_rf.predict(X_test_idf)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['Random Forest(Gini)', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.740625
1      Random Forest(Gini)  0.833542


  results = results.append(model_results, ignore_index = True)


In [26]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(criterion='entropy')
clf_rf.fit(X_train_idf, y_train)

# Predict using testing data
y_pred_rf = clf_rf.predict(X_test_idf)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['Random Forest(Entropy)', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)

  results = results.append(model_results, ignore_index = True)


In [27]:
from sklearn.svm import SVC
clf_svc = SVC()
clf_rf.fit(X_train_idf, y_train)

# Predict using testing data
y_pred_rf = clf_rf.predict(X_test_idf)

# Calculate accuracy
acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['SVC by SVM ', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.740625
1      Random Forest(Gini)  0.833542
2   Random Forest(Entropy)  0.811042
3              SVC by SVM   0.810208


  results = results.append(model_results, ignore_index = True)


In [28]:

confusion_matrix(y_test,y_pred_rf)

array([[ 498,   17,   59,    4,   33,    0],
       [  26,  447,   90,    4,   31,   17],
       [  19,   10, 1459,   55,   51,    9],
       [   4,    3,   90,  274,    2,    0],
       [  39,   39,  222,   12, 1089,    6],
       [   0,   38,   25,    0,    6,  122]], dtype=int64)

In [None]:
3
