In [1]:
#Installing all the packages
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [2]:
!git clone https://github.com/dosacat/NLPDataset.git

Cloning into 'NLPDataset'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 10 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (10/10), 2.14 MiB | 3.68 MiB/s, done.


In [3]:
#Ensuring dataset loaded correctly
%cd NLPDataset/
df_train= pd.read_csv('tweet_and_emotion.csv')
df_train.shape
df_train.head()

/content/NLPDataset


Unnamed: 0,tweet,emotions
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger
1,@ArcticFantasy I would have almost took offens...,anger
2,@IllinoisLoyalty that Rutgers game was an abom...,anger
3,@CozanGaming that's what lisa asked before she...,anger
4,Sometimes I get mad over something so minuscul...,anger


In [5]:
df_train.head()
# df_train['sentiment'].unique()
df_train['emotions'].unique()
labels=df_train["emotions"].values


In [6]:
import re
def remove_pattern(text,pattern):
    
    # re.findall() finds the pattern i.e @user and puts it in a list for further task
    r = re.findall(pattern,text)
    
    # re.sub() removes @user from the sentences in the dataset
    for i in r:
        text = re.sub(i,"",text)
    
    return text

df_train['cleaned_data'] = np.vectorize(remove_pattern)(df_train['tweet'], "@[\w]*")
df_train.head()


Unnamed: 0,tweet,emotions,cleaned_data
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,pls dont insult the word 'Molna'
1,@ArcticFantasy I would have almost took offens...,anger,I would have almost took offense to this if I...
2,@IllinoisLoyalty that Rutgers game was an abom...,anger,that Rutgers game was an abomination. An affr...
3,@CozanGaming that's what lisa asked before she...,anger,that's what lisa asked before she started rag...
4,Sometimes I get mad over something so minuscul...,anger,Sometimes I get mad over something so minuscul...


In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
from nltk.stem import PorterStemmer

porter = PorterStemmer()
stop_words= stopwords.words('english')



def cleaning (text):
    # this code will remove the punctuation from the text
    text= ''.join([word.lower() for word in text if word not in string.punctuation])
    # this code will remove URL from the text
    text = re.sub(r'http\S+', '', text)
    
    # tokenization with nltk
    text = word_tokenize(text)
    
    # stemming with nltk and removing stop words 
    text = [porter.stem(word) for word in text if word not in stop_words]
    return text
nltk.download('punkt')

df_train['cleaned_data']= df_train['cleaned_data'].apply(lambda x: cleaning(x)) 

df_train.head(5)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,tweet,emotions,cleaned_data
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,"[pl, dont, insult, word, molna]"
1,@ArcticFantasy I would have almost took offens...,anger,"[would, almost, took, offens, actual, snap]"
2,@IllinoisLoyalty that Rutgers game was an abom...,anger,"[rutger, game, abomin, affront, god, man, must..."
3,@CozanGaming that's what lisa asked before she...,anger,"[that, lisa, ask, start, rage, call, heh]"
4,Sometimes I get mad over something so minuscul...,anger,"[sometim, get, mad, someth, minuscul, tri, rui..."


In [8]:

df_train['cleaned_data'] = df_train['cleaned_data'].apply(lambda x: ' '.join([w for w in x if len(w)>1]))

df_train.head(5)

Unnamed: 0,tweet,emotions,cleaned_data
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger,pl dont insult word molna
1,@ArcticFantasy I would have almost took offens...,anger,would almost took offens actual snap
2,@IllinoisLoyalty that Rutgers game was an abom...,anger,rutger game abomin affront god man must never ...
3,@CozanGaming that's what lisa asked before she...,anger,that lisa ask start rage call heh
4,Sometimes I get mad over something so minuscul...,anger,sometim get mad someth minuscul tri ruin someb...


#FEATURE EXTRACTION TF-IDF WITH DECISION TREES

TF-IDF APPROACH

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.80, min_df=2,max_features=2500,stop_words='english')

tfidf_matrix=tfidf.fit_transform(df_train['cleaned_data'])

df_tfidf = pd.DataFrame(tfidf_matrix.todense())



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10620,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df_tfidf, labels, test_size=0.2, random_state=0)

#THE MODEL

In [12]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(x_train, y_train)

#Evaluating model performance

In [13]:
predictions = text_classifier.predict(x_test)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Model Accuracy",accuracy_score(y_test, predictions),"\n")


print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

Model Accuracy 0.7124705882352941 

              precision    recall  f1-score   support

       anger       0.87      0.73      0.79       342
        fear       0.85      0.75      0.79       461
         joy       0.78      0.71      0.74       424
     neutral       0.52      0.80      0.63       448
     sadness       0.74      0.59      0.66       450

    accuracy                           0.71      2125
   macro avg       0.75      0.71      0.72      2125
weighted avg       0.75      0.71      0.72      2125

[[248   8  10  63  13]
 [  7 344  14  65  31]
 [  3  10 300  98  13]
 [  6  15  34 358  35]
 [ 22  30  25 109 264]]
