In [1]:
# Download the emotion dataset from Google Drive by file id
# (the recorded run saved it as /content/Emotion_classify_Data.csv).
!gdown 1Q1eogBKp_5z25g0VELTSDuWpdmRiiJ-O

Downloading...
From: https://drive.google.com/uc?id=1Q1eogBKp_5z25g0VELTSDuWpdmRiiJ-O
To: /content/Emotion_classify_Data.csv
  0% 0.00/614k [00:00<?, ?B/s]100% 614k/614k [00:00<00:00, 140MB/s]


In [2]:
import pandas as pd

# Read the downloaded CSV into a pandas dataframe.
# gdown saves the file into the current working directory, so a relative path
# is used: it still resolves to /content/... on Colab, and also works in
# environments where /content does not exist.
df = pd.read_csv("Emotion_classify_Data.csv")
print(df.shape)
df.head(5)

(5937, 2)


Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
# How many comments carry each emotion label (class balance check).
df["Emotion"].value_counts()

anger    2000
joy      2000
fear     1937
Name: Emotion, dtype: int64

In [4]:
# Add a new column that gives a unique integer id to each emotion label.
EMOTION_TO_NUM = {
    'joy': 0,
    'fear': 1,
    'anger': 2,
}
df['Emotion_num'] = df['Emotion'].map(EMOTION_TO_NUM)

# Series.map yields NaN for any label that is missing from the dict. Fail fast
# here instead of letting NaN targets reach train_test_split / the classifiers.
unmapped = df.loc[df['Emotion_num'].isna(), 'Emotion'].unique()
assert len(unmapped) == 0, f"Unmapped emotion labels: {unmapped.tolist()}"

# checking the results
df.head(5)


Unnamed: 0,Comment,Emotion,Emotion_num
0,i seriously hate one subject to death but now ...,fear,1
1,im so full of life i feel appalled,anger,2
2,i sit here to write i start to dig out my feel...,fear,1
3,ive been really angry with r and i feel like a...,joy,0
4,i feel suspicious if there is no one outside l...,fear,1



# Modelling without Pre-processing Text data

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the raw comments for evaluation. Stratifying on the encoded
# label keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    df["Comment"],
    df["Emotion_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df["Emotion_num"],
)

In [6]:
# Number of training samples after the split.
X_train.shape

(4749,)

In [7]:
# Class distribution of the training labels.
y_train.value_counts()

0    1600
2    1600
1    1549
Name: Emotion_num, dtype: int64

In [8]:
# Class distribution of the test labels.
y_test.value_counts()

0    400
2    400
1    388
Name: Emotion_num, dtype: int64

# Model1

* use CountVectorizer with only trigrams.
* use RandomForest as the classifier.
* print the classification report.



In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
     

In [10]:

#1. build the pipeline: trigram-only bag-of-words counts -> random forest
clf = Pipeline([
    ('vectorizer_3_3_gram', CountVectorizer(ngram_range=(3, 3))),  # only trigrams
    # The forest's bootstrap and feature sampling are random; a fixed seed
    # makes the classification report reproducible across runs.
    ('classifier', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.60      0.26      0.37       400
           1       0.37      0.81      0.50       388
           2       0.54      0.21      0.30       400

    accuracy                           0.42      1188
   macro avg       0.50      0.43      0.39      1188
weighted avg       0.50      0.42      0.39      1188



# Model2

* using CountVectorizer with both unigrams and bigrams.
* use Multinomial Naive Bayes as the classifier.
* print the classification report.

In [11]:

# 1. assemble the pipeline (unigram + bigram counts feeding Multinomial Naive
#    Bayes) and fit it on the training split; fit() returns the pipeline itself
clf = Pipeline(steps=[
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB()),
]).fit(X_train, y_train)

# 2. predict the labels of the held-out comments
y_pred = clf.predict(X_test)

# 3. per-class precision / recall / F1 on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87       400
           1       0.87      0.83      0.85       388
           2       0.83      0.88      0.85       400

    accuracy                           0.86      1188
   macro avg       0.86      0.86      0.86      1188
weighted avg       0.86      0.86      0.86      1188



# Model3

* using CountVectorizer with both unigrams and bigrams.
* use RandomForest as the classifier.
* print the classification report.

In [12]:

#1. build the pipeline: unigram + bigram bag-of-words counts -> random forest
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),  # unigrams and bigrams
    # The forest's bootstrap and feature sampling are random; a fixed seed
    # makes the classification report reproducible across runs.
    ('classifier', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.96      0.90       400
           1       0.95      0.89      0.92       388
           2       0.94      0.86      0.90       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



# Model4

* using TfidfVectorizer (default settings) to turn the text into TF-IDF features.
* use RandomForest as the classifier.
* print the classification report

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [14]:
#1. build the pipeline: TF-IDF features (default vectorizer settings) -> random forest
clf = Pipeline([
    ('vectorizer_tf-idf', TfidfVectorizer()),  # defaults only; no ngram_range is passed here
    # The forest's bootstrap and feature sampling are random; a fixed seed
    # makes the classification report reproducible across runs.
    ('classifier', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       400
           1       0.91      0.91      0.91       388
           2       0.93      0.87      0.90       400

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



# Use text pre-processing to remove stop words and punctuation, and apply lemmatization

In [15]:
import spacy

# Load spaCy's "en_core_web_sm" English pipeline and keep it in `nlp`;
# preprocess() calls this object on each comment to obtain its tokens.
nlp = spacy.load("en_core_web_sm") 


# utility that turns one raw comment into its cleaned, lemmatized form
def preprocess(text):
    """Return `text` with stop words and punctuation removed and the rest lemmatized.

    The text is passed through the module-level `nlp` object. Tokens flagged
    `is_stop` or `is_punct` are dropped, and the `lemma_` of every remaining
    token is joined with single spaces.
    """
    lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(lemmas)

In [16]:
# Run every raw comment through preprocess() and store the result in a new column.
df["preprocessed_Comment"] = df["Comment"].map(preprocess)

In [17]:
# Re-split using the preprocessed text. The test size, seed and stratification
# match the split made on the raw comments.
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_Comment"],
    df["Emotion_num"],
    test_size=0.2,
    random_state=2022,
    stratify=df["Emotion_num"],
)

# Let's check the scores with our best classifier so far
 * Random Forest

# Model5

* using CountVectorizer with its default settings (unigrams only) on the preprocessed text.
* use RandomForest as the classifier.
* print the classification report.

In [18]:
#1. build the pipeline: bag-of-words counts on the preprocessed text -> random forest
clf = Pipeline([
    # This step is a CountVectorizer with default settings (no ngram_range is
    # passed), so it is named for what it is rather than 'tf-idf'.
    ('vectorizer_count', CountVectorizer()),
    # The forest's bootstrap and feature sampling are random; a fixed seed
    # makes the classification report reproducible across runs.
    ('classifier', RandomForestClassifier(random_state=2022)),
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       400
           1       0.93      0.92      0.93       388
           2       0.92      0.92      0.92       400

    accuracy                           0.93      1188
   macro avg       0.93      0.93      0.93      1188
weighted avg       0.93      0.93      0.93      1188



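**Model 5 (Random Forest on the preprocessed text) performs the best: 0.93 accuracy, versus 0.91 for the best models trained on the raw text (Models 3 and 4).**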

Notebook link: https://colab.research.google.com/drive/1x2-oHvSU13N3ka8O9PJhMBwgpmVWlQyR?usp=sharing