In [1]:
import numpy as np
import pandas as pd

In [2]:
tweets_df = pd.read_csv('tweet_and_emotion.csv')

In [3]:
tweets_df.head()

Unnamed: 0,tweet,emotions
0,@ZubairSabirPTI pls dont insult the word 'Molna',anger
1,@ArcticFantasy I would have almost took offens...,anger
2,@IllinoisLoyalty that Rutgers game was an abom...,anger
3,@CozanGaming that's what lisa asked before she...,anger
4,Sometimes I get mad over something so minuscul...,anger


In [4]:
tweets_df['emotions'].unique()

array(['anger', 'fear', 'joy', 'sadness', 'neutral'], dtype=object)

In [5]:
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

In [6]:
porter = PorterStemmer()
stop_words= stopwords.words('english')

In [7]:
def cleaning (text):
    # this code will remove the punctuation from the text
    text= ''.join([word.lower() for word in text if word not in string.punctuation])
    # this code will remove URL from the text
    text = re.sub(r'http\S+', '', text)
    
    # tokenization with nltk
    text = word_tokenize(text)
    
    # stemming with nltk and removing stop words 
    text = [porter.stem(word) for word in text if word not in stop_words]
    return text

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=cleaning)
X = tfidf_vect.fit_transform(tweets_df['tweet'])
y = tweets_df['emotions']

In [9]:
X_tfidf_df = pd.DataFrame(X.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names_out()

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear', C=1.0, gamma='auto')
svm_classifier.fit(X_train, y_train)

In [12]:
y_pred = svm_classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
confusion_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test,y_pred, target_names = tweets_df['emotions'].unique())
print("OVERALL")
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_pred, average='weighted'))
print('F1 score:', f1_score(y_test, y_pred, average='weighted'))
print(report)
print(confusion_matrix)

OVERALL
Accuracy: 0.744
Precision: 0.7574563970350268
Recall: 0.744
F1 score: 0.7467984104404549
              precision    recall  f1-score   support

       anger       0.86      0.79      0.82       364
        fear       0.84      0.80      0.82       420
         joy       0.81      0.73      0.77       422
     sadness       0.60      0.78      0.67       471
     neutral       0.72      0.64      0.68       448

    accuracy                           0.74      2125
   macro avg       0.76      0.75      0.75      2125
weighted avg       0.76      0.74      0.75      2125

[[286   9   6  36  27]
 [  9 334  14  38  25]
 [  5  14 307  74  22]
 [ 10  17  38 366  40]
 [ 22  25  13 100 288]]
