# Sentiment Analysis using Traditional models

In [115]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re, nltk, spacy, string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from nltk.corpus import wordnet
import en_core_web_sm
nlp = en_core_web_sm.load()
#EDA

from collections import Counter
from nltk.util import ngrams
from wordcloud import WordCloud


#nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import tensorflow as tf
import tensorflow_hub as hub

import matplotlib.pyplot as plt

#preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from collections import Counter



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [116]:
df = pd.read_csv("/content/processed_reviews (3).csv")

In [117]:
df['labels'] = df['category'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})

In [118]:
df.head()

Unnamed: 0,category,cleaned_review_body,labels
0,Positive,great experience atom start easy process open ...,1
1,Positive,straight forward clear instruction offering go...,1
2,Positive,done doddle happy atom interest rate best took...,1
3,Positive,happy service provided atom app clear easy pro...,1
4,Positive,app user friendly customer care friendly effic...,1


# checking if the dataset is balanced

In [119]:
data = df['category'].value_counts(normalize=True).mul(100).reset_index()
data

Unnamed: 0,index,category
0,Positive,95.579134
1,Negative,2.988506
2,Neutral,1.432361


Here we can clearly see that the dataset is imbalnced hence we have to balance the dataset inorder to get the proper result from model


# Split the dataset into train and test

In [120]:
X = df['cleaned_review_body']
y = df['category']

# Stratify for y, very important to keep the class balance in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y)

# Feature Engineering

In [121]:
from nltk.corpus import stopwords
# Initialize the vectorizer
vectorizer = TfidfVectorizer( stop_words='english',max_features=5000)

# Fit it on X_train
vectorizer.fit(X_train)


# Transform
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [122]:
len(vectorizer.get_feature_names_out())

3382

In [123]:
words = vectorizer.get_feature_names_out()
word_counts = X_train_vec.toarray().sum(axis=0)

In [124]:
words_df = pd.DataFrame({"token": words, "count": word_counts})

words_df.sort_values(by="count", ascending=False).head(10)

Unnamed: 0,token,count
925,easy,375.100024
30,account,303.246224
2373,rate,283.131656
168,app,240.630796
3206,use,222.206667
1290,good,219.171585
2705,set,200.259138
242,atom,181.125413
2034,open,173.699409
2634,saving,164.470911


The above code gives the words and its tokens

## SMOTE

In [125]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vec, y_train)
print("Class distribution after SMOTE:", Counter(y_train_resampled))

Class distribution after SMOTE: Counter({'Positive': 3783, 'Negative': 3783, 'Neutral': 3783})


# Model training

## Logistic Regression

In [126]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Build a Logistic Regression model
classifier = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
classifier.fit(X_train_resampled, y_train_resampled)

# predict on test data
y_pred = classifier.predict(X_test_vec)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)



              precision    recall  f1-score   support

    Negative       0.59      0.78      0.67        51
     Neutral       0.07      0.04      0.05        24
    Positive       0.98      0.98      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.55      0.60      0.57      1697
weighted avg       0.96      0.96      0.96      1697



In [127]:
from sklearn.model_selection import GridSearchCV

# Define a parameter grid to search over
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'max_iter': [500,1000],
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)

# Fit the grid search to your training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'C': 10, 'max_iter': 500}


In [114]:
classifier = LogisticRegression(C=10,max_iter=500)
classifier.fit(X_train_resampled, y_train_resampled)

# predict on test data
y_pred = classifier.predict(X_test_vec)

# Evaluate the model
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

    Negative       0.60      0.57      0.59        51
     Neutral       0.25      0.17      0.20        24
    Positive       0.98      0.99      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.61      0.57      0.59      1697
weighted avg       0.96      0.96      0.96      1697



## Random Forest

In [129]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees (n_estimators)

# Fit the model training data
rf_classifier.fit(X_train_resampled, y_train_resampled)

# predict on test data
y_pred_rf = rf_classifier.predict(X_test_vec)

# Evaluate the Random Forest model
report_rf = classification_report(y_test, y_pred_rf)
print(report_rf)


              precision    recall  f1-score   support

    Negative       1.00      0.18      0.30        51
     Neutral       0.00      0.00      0.00        24
    Positive       0.96      1.00      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.65      0.39      0.43      1697
weighted avg       0.95      0.96      0.95      1697



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [106]:
# Define a parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider at each split
}

# Create a GridSearchCV object
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                               param_grid=param_grid,
                               cv=5)

# Fit the grid search on training data
rf_grid_search.fit(X_train_resampled, y_train_resampled)

# best hyperparameters
best_params_rf = rf_grid_search.best_params_
print("Best Hyperparameters for Random Forest:", best_params_rf)


Best Hyperparameters for Random Forest: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [107]:
#  Random Forest Classifier with the best hyperparameters
best_rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, max_features='log2', min_samples_leaf=1, min_samples_split=2, random_state=42)

# Fit the model on training data
best_rf_classifier.fit(X_train_resampled, y_train_resampled)

# predict on test data
y_pred_best_rf = best_rf_classifier.predict(X_test_vec)

# Evaluate the Random Forest model
report_best_rf = classification_report(y_test, y_pred_best_rf)
print(report_best_rf)


              precision    recall  f1-score   support

    Negative       0.71      0.10      0.17        51
     Neutral       0.00      0.00      0.00        24
    Positive       0.96      1.00      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.56      0.37      0.38      1697
weighted avg       0.94      0.96      0.94      1697



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [109]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Support Vector Machine (SVM) Classifier with the best hyperparameters
best_svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

# Fit the model on training data
best_svm_classifier.fit(X_train_resampled, y_train_resampled)

# predict on the test data
y_pred_best_svm = best_svm_classifier.predict(X_test_vec)

# Evaluate the SVM model
report_best_svm = classification_report(y_test, y_pred_best_svm)
print(report_best_svm)


              precision    recall  f1-score   support

    Negative       0.67      0.47      0.55        51
     Neutral       0.36      0.17      0.23        24
    Positive       0.97      0.99      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.67      0.54      0.59      1697
weighted avg       0.96      0.96      0.96      1697



In [132]:


# Define a parameter grid to search over
param_grid = {
    'C': [0.1, 1, 10],                  # Regularization parameter
    'kernel': ['linear','rbf']       # Kernel type (linear or radial basis function
}

# Create a GridSearchCV object
svm_grid_search = GridSearchCV(estimator=SVC(random_state=42),
                               param_grid=param_grid,
                               cv=5,
                               scoring='accuracy')

# Fit the grid search to your resampled training data
svm_grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters
best_params_svm = svm_grid_search.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)



Best Hyperparameters for SVM: {'C': 1, 'kernel': 'linear'}


In [133]:
#  SVM model with the best hyperparameters
best_svm_classifier = SVC(C=best_params_svm['C'],
                          kernel=best_params_svm['kernel'],

                          random_state=42)

# Fit the model on training data
best_svm_classifier.fit(X_train_resampled, y_train_resampled)

# Mpredict on the test data
y_pred_best_svm = best_svm_classifier.predict(X_test_vec)

# Evaluate the SVM model
report_best_svm = classification_report(y_test, y_pred_best_svm)
print(report_best_svm)


              precision    recall  f1-score   support

    Negative       0.69      0.57      0.62        51
     Neutral       0.20      0.04      0.07        24
    Positive       0.97      0.99      0.98      1622

    accuracy                           0.96      1697
   macro avg       0.62      0.53      0.56      1697
weighted avg       0.95      0.96      0.96      1697

