


# **Sentiment Analysis From Textual Data Using NLP**

# **`Dataset`**

In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import RandomOverSampler
nltk.download('stopwords')
nltk.download('wordnet')


# data from https://www.kaggle.com/datasets/pashupatigupta/emotion-detection-from-text?resource=download
data = pd.read_csv('tweet_emotion.csv')

data.head(4)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!


# **Dataset Analysis**

In [2]:
data.shape


(40000, 3)

In [3]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', nan,
       'surprise', 'love', 'fun', 'hate', 'happiness', 'boredom',
       'relief', 'anger'], dtype=object)

In [4]:
data.groupby('sentiment')['sentiment'].agg('count')

sentiment
anger          110
boredom        177
empty          814
enthusiasm     747
fun           1754
happiness     5145
hate          1302
love          3811
neutral       8503
relief        1507
sadness       5084
surprise      2154
worry         8343
Name: sentiment, dtype: int64

Droping tweet_id column

In [5]:
data=data.drop(['tweet_id'],axis='columns')
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


Checking if there is any empty values

In [6]:
data.isnull().sum()

sentiment    549
content        0
dtype: int64

Droping null values

In [7]:
data=data.dropna()
data.isnull().sum()

sentiment    0
content      0
dtype: int64

In [8]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


Encode sentiment labels to numerical values

In [11]:
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])
data.head()

Unnamed: 0,sentiment,content
0,2,@tiffanylue i know i was listenin to bad habi...
1,10,Layin n bed with a headache ughhhh...waitin o...
2,10,Funeral ceremony...gloomy friday...
3,3,wants to hang out with friends SOON!
4,8,@dannycastillo We want to trade with someone w...


Text preprocessing

In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word.lower()) for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(tokens)

data['content'] = data['content'].apply(preprocess_text)

data.head()

Unnamed: 0,sentiment,content
0,2,know listenin bad habit earlier started freaki...
1,10,layin n bed headache
2,10,funeral
3,3,want hang friend
4,8,want trade someone houston one


Balancing Dataset

In [18]:
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(data['content'].values.reshape(-1, 1), data['sentiment'])

Vectorize the text data using TF-IDF

In [19]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(X_resampled.ravel())
y = y_resampled

Spliting data into train and test sets. Here 80% is traing data and 20% is testing data.

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 Initialize the classifiers with hyperparameter tuning

In [21]:
svm_classifier = SVC(kernel='linear', C=1)
rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
nb_classifier = MultinomialNB(alpha=0.1)

 Performing k-fold cross-validation for each model

In [22]:
def evaluate_model(classifier, name):
    scores = cross_val_score(classifier, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name} Cross-Validation Scores: {scores}")
    print(f"Mean {name} Cross-Validation Score: {np.mean(scores)}")
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(f"{name} Test Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

 Evaluating the models

In [23]:
evaluate_model(svm_classifier, "SVM")
evaluate_model(rf_classifier, "Random Forest")
evaluate_model(nb_classifier, "Naive Bayes")

SVM Cross-Validation Scores: [0.58483632 0.57983716 0.57972408 0.5763881  0.58469976]
Mean SVM Cross-Validation Score: 0.581097084289037
SVM Test Accuracy: 0.5952144020264157
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1732
           1       0.92      1.00      0.96      1730
           2       0.68      0.79      0.73      1657
           3       0.67      0.83      0.74      1697
           4       0.53      0.63      0.57      1693
           5       0.38      0.31      0.34      1703
           6       0.72      0.78      0.75      1696
           7       0.57      0.49      0.53      1768
           8       0.23      0.25      0.24      1720
           9       0.56      0.64      0.60      1644
          10       0.43      0.38      0.40      1656
          11       0.53      0.45      0.49      1700
          12       0.30      0.18      0.22      1712

    accuracy                           0.60     22108
   macro avg 

In [None]:
import matplotlib.pyplot as plt

N, bins, patches  = plt.hist(correlation)


patches[1].set_facecolor('blue')
patches[2].set_facecolor('green')
patches[3].set_facecolor('red')
# rest are default colors

plt.xlabel('Correlation')
plt.ylabel('Count of columns')
plt.title('Count of columns in each correlation group')

plt.show()

Since all of the word frequencies(x-variables) have almost no correlation with whether the email being spam or not (y), it can be concluded that the x-variables are not independent among each other. The x-variables are most likely dependant among each other.

##### Scaling

In [None]:
from sklearn.preprocessing import MaxAbsScaler

mas = MaxAbsScaler()
mas.fit(xTrain_cv)
xTrainScaled = mas.transform(xTrain_cv)
xTestScaled = mas.transform(xTest_cv)
# All x variables are scaled to be 0 to 1

### Modelling Data

In [None]:
def modelData(model, scaled=True):
  x = xTrainScaled if scaled else xTrain_cv
  y = yTrain
  model.fit(x,y)
  return model

In [None]:
from sklearn.metrics import precision_score, f1_score, recall_score, roc_curve, roc_auc_score, ConfusionMatrixDisplay


def display_score(trained_model, scaled = True, probability = True):
  x = xTestScaled if scaled else xTest_cv
  y = yTest
  y_pred = trained_model.predict(x)
  name = str(type(trained_model).__name__)
  name += " <Scaled Data>" if scaled else ""

  acc = trained_model.score(x, y)
  prec = precision_score(y, y_pred)
  f1scre = f1_score(y, y_pred)
  recal = recall_score(y, y_pred)

  print(f"Accuracy: {acc}\nPrecision: {prec}\nF1 Score: {f1scre}\nRecall Score: {recal}\n")

  fpr, tpr = None, None
  if probability:
    y_score = trained_model.predict_proba(x)
    y_score = y_score[:, 1]
    rocAuc = roc_auc_score(y, y_score)
    fpr, tpr, _ = roc_curve(y, y_score)
    print(f"ROC AUC score: {rocAuc}")

  mat = ConfusionMatrixDisplay.from_predictions( y, y_pred)
  plt.title(f"Confusion matrix for {name}")
  plt.show()

  return {"name":name , "acc": acc, "prec":prec, "f1scre":f1scre, "recal":recal, "fpr":fpr, "tpr":tpr, "mat": mat}

In [None]:
customTests = [
  "Hello sir! When is the deadline for CSE422 project report submission?",
  "Dear Students, The University is happy to offer a student transport service and wishes to keep the price as low as feasible while covering the cost of the service. You all know that the price of fuel has increased significantly: the price of diesel has increased by 42.5% (Previous price- Tk.80, New price Tk.114).Bus fares have already been raised across the country. Despite the increase in the fuel price, the University will keep the student transport fares unchanged for the remainder of the current semester. There will be a need, however, to increase the fare to Tk. 90 from Tk.70 with effect from the start of the fall semester. The University hopes that you will understand the pressures that have led to this change. Best regards, Office of the Registrar",
  "Click here to get free discord nitro"
  ]
customTests_cv = cv.transform(customTests)
customTestsScaled = mas.transform(customTests_cv)

def getCustomTestResults(model, scaled=True):

  results = model.predict(customTestsScaled if scaled else customTests_cv)
  assert len(results) == len(customTests), f"length of results, {len(results)} and tests, {len(customTests)} are unequal"
  for i in range(len(results)):
    spam = "spam" if results[i] == 1 else "ok"
    if len(customTests[i]) > 80:
      print(f"{spam}  -->  \"{customTests[i][0:35]} ... {customTests[i][-35:]}\"")
    else:
      print(f"{spam}  -->  \"{customTests[i]}\"")

##### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB as MNB

# scaled
mnbModelScaled = modelData(MNB())

mnbS = display_score(mnbModelScaled)

getCustomTestResults(mnbModelScaled)

In [None]:
mnbModel = modelData(MNB(), False)

mnb = display_score(mnbModel, False)

getCustomTestResults(mnbModel, False)

##### Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# scaled
svcModelScaled = modelData( SVC(kernel="linear", probability = True) )

svcS = display_score(svcModelScaled, probability= True )

getCustomTestResults(svcModelScaled )

In [None]:
svcModel = modelData( SVC(kernel="linear", probability=True) , False)

svc = display_score(svcModel, False, True)

getCustomTestResults(svcModel, False)

##### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

# scaled
rfcModelScaled = modelData(RFC(n_estimators=50))

rfcS = display_score(rfcModelScaled)

getCustomTestResults(rfcModelScaled)

In [None]:
rfcModel = modelData(RFC(n_estimators=50), False)

rfc = display_score(rfcModel, False)

getCustomTestResults(rfcModel, False)

### Results

In [None]:
# create data
def algoResArray(algo):
  return [ algo["name"], algo["acc"], algo["prec"], algo["f1scre"], algo["recal"] ]

def displayScore(algo1, algo2):
  df = pd.DataFrame([
    algoResArray(algo1),
    algoResArray(algo2),
  ],

  columns=[ 'Algorithm', "accuracy", "precision", "f1 score", "recall" ])

  df.plot(
    x='Algorithm',
    kind='bar',
    stacked=False,
    title='Algorigthm score comparison',
    ylim=(0.8,1)
  )

displayScore(mnb, mnbS)
displayScore(svc, svcS)
displayScore(rfc, rfcS)


##### ROC-Curve

In [None]:
plt.plot(mnb["fpr"], mnb["tpr"], label= "Multinomial Naive Bayes")

plt.plot(svc["fpr"], svc["tpr"], label= "Support Vector Classifier")

plt.plot(rfc["fpr"], rfc["tpr"], label= "Random Forest Classifier")

plt.title('ROC Curves for different algorithms')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()