# Read Data

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read by path.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## Required imports

In [None]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Bidirectional, GlobalMaxPooling1D, Dense, BatchNormalization, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Location of the labelled review dataset on the mounted Drive.
path = "/content/drive/MyDrive/ML_project_datasheets/Final_code/labeled_data.csv"

# Load the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Store an independent copy of the lemmatized text under the column name
# used by the rest of the notebook.
lemmatized_reviews = data.loc[:, 'After_lemmatization']
data['reviews_text_new'] = lemmatized_reviews.copy()

In [None]:
# Keep only the model input (cleaned text) and the target (sentiment label).
selected_columns = ['reviews_text_new', 'Overall Sentiment']
Data = data[selected_columns]

In [None]:
# Preview the first rows of the text/label frame.
Data.head()

Unnamed: 0,reviews_text_new,Overall Sentiment
0,thought depressing,Positive
1,one time favorite flick adult themed comedy dr...,Positive
2,love love love great movie,Positive
3,product deliver excellent time,Positive
4,terrific movie,Neutral


## Encode labels

In [None]:
# Turn the string sentiment classes into integer class ids.
sentiment_column = Data['Overall Sentiment']
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(sentiment_column)

In [None]:
# Bag-of-words features: one row per review, one column per vocabulary term,
# each cell holding how often the term occurs in the review.
review_texts = Data['reviews_text_new']
bow_counts = CountVectorizer()
bow_data = bow_counts.fit_transform(review_texts)

In [None]:
# (number of reviews, vocabulary size) of the bag-of-words matrix.
bow_data.shape

(500000, 198145)

In [None]:
# Feature matrix and target vector used by the classical models below.
X, y = bow_data, labels

In [None]:
# 70/30 train/test split of the full (unbalanced) data; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Ensemble classifier

## Voting classifier using randomly undersampled data

In [None]:
# Balance the classes by random undersampling of the full data set.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Hold out 10% of the balanced data for testing. Resampling happens before
# the split, so the held-out portion is drawn from the balanced data as well.
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(
    X_rus,
    y_rus,
    test_size=0.1,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Base learners for the ensemble: logistic regression, an SVM and a random forest.
lrc = LogisticRegression(random_state=0, n_jobs=-1)
svc = SVC(C=1, random_state=0)
rfc = RandomForestClassifier(random_state=0, n_jobs=-1)

# Hard voting: the ensemble predicts the class chosen by the majority of base learners.
base_estimators = [('lrc', lrc), ('svc', svc), ('rfct', rfc)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')

voting_clf.fit(X_train_rus, y_train_rus)

In [None]:
from sklearn.metrics import accuracy_score

# Score each base estimator on its own. VotingClassifier trains clones of the
# estimators it is given, so lrc / svc / rfc must be fitted here before they
# can predict. The ensemble itself was already fitted when it was defined and
# is deliberately not refitted (training the SVC twice is costly).
for clf in (lrc, svc, rfc):
    clf.fit(X_train_rus, y_train_rus)
    y_pred = clf.predict(X_test_rus)
    print(clf.__class__.__name__, accuracy_score(y_test_rus, y_pred))


LogisticRegression 0.842417825246294
SVC 0.8383666329067305
RandomForestClassifier 0.8075223275941442


In [None]:
  y_pred = clf.predict(X_test_rus)
  print(clf.__class__.__name__, accuracy_score(y_test_rus, y_pred))

In [None]:
# Ensemble predictions for the held-out undersampled split.
y_pred = voting_clf.predict(X=X_test_rus)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute every metric first, then print them in a fixed order.
accuracy = accuracy_score(y_test_rus, y_pred)
report = classification_report(y_test_rus, y_pred)
matrix = confusion_matrix(y_test_rus, y_pred)

print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(report)

print("\nConfusion Matrix:")
print(matrix)

Accuracy: 0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      7183
           1       0.83      0.88      0.85      7329
           2       0.88      0.81      0.84      7210

    accuracy                           0.85     21722
   macro avg       0.85      0.85      0.85     21722
weighted avg       0.85      0.85      0.85     21722


Confusion Matrix:
[[6138  550  495]
 [ 576 6434  319]
 [ 607  791 5812]]


# LSTM(Multi layer)

**Dropout Layer:**
Dropout layers help prevent overfitting by randomly setting a fraction of input units to 0 at each update during training. It can be added after the LSTM layer to regularize the network.

**Bidirectional LSTM:**
Bidirectional LSTMs process the input sequence in both forward and backward directions. They can capture contextual information from both past and future time steps.

**GlobalMaxPooling1D Layer:**
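This layer collapses the time dimension of the LSTM's per-step outputs by keeping, for each feature, its maximum value across all time steps. The result is a fixed-size vector that carries the strongest signal found anywhere in the sequence.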

**Dense Layers:**
After LSTM layers, you can add one or more dense layers to map the learned features to the output classes.

**Batch Normalization:**
Batch normalization can be added to normalize the activations of the network, potentially improving training stability and convergence.

## LSTM (multi layer) with unbalanced data

In [None]:
# Hyperparameters for the multi-layer LSTM classifier.
embedding_dim = 50          # length of each learned word vector
lstm_units = 64             # hidden units in the LSTM layer
output_units = 3            # one softmax output per sentiment class
max_sequence_length = 100   # every review is padded/truncated to this many tokens

# The Embedding layer consumes integer word indices, not the sparse
# bag-of-words counts built earlier, so the reviews are re-encoded as padded
# index sequences. Capping the vocabulary at bow_data.shape[1] keeps every
# index below the input_dim the models pass to Embedding.
# NOTE(review): reconstructed preprocessing step - confirm the tokenizer
# settings against the run that produced the recorded results.
tokenizer = Tokenizer(num_words=bow_data.shape[1])
tokenizer.fit_on_texts(Data['reviews_text_new'])
X = pad_sequences(
    tokenizer.texts_to_sequences(Data['reviews_text_new']),
    maxlen=max_sequence_length,
)
y = labels

# Redo the 70/30 split (same seed as before) so that X_train / X_test hold
# index sequences instead of bag-of-words rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Embedding -> LSTM -> dropout -> max over time -> batch norm -> softmax.
model = Sequential([
    Embedding(
        input_dim=bow_data.shape[1],
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    # return_sequences=True keeps the per-step outputs that the pooling layer reduces.
    LSTM(units=lstm_units, return_sequences=True),
    Dropout(rate=0.5),
    GlobalMaxPooling1D(),
    BatchNormalization(),
    Dense(units=output_units, activation='softmax'),
])

In [None]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train on the unbalanced training split, holding back 20% of it for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b4c582ce6e0>

In [None]:
# Predicted class = index of the largest softmax probability in each row.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred_classes))

              precision    recall  f1-score   support

           0       0.79      0.57      0.66     21651
           1       0.74      0.80      0.77     28033
           2       0.90      0.94      0.92    100316

    accuracy                           0.86    150000
   macro avg       0.81      0.77      0.78    150000
weighted avg       0.86      0.86      0.85    150000



## LSTM (multi layer) with randomly undersampled data (test size = 0.2)

In [None]:
# Balance the classes by random undersampling of the current X / y.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

In [None]:
# Shape of the undersampled feature matrix: (samples kept, features per sample).
X_rus.shape

(217218, 100)

In [None]:
# 80/20 train/test split of the undersampled data.
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(
    X_rus,
    y_rus,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Model hyperparameters for this experiment.
output_units = 3     # width of the final softmax layer
lstm_units = 64      # hidden units in the LSTM layer
embedding_dim = 50   # length of each word-embedding vector

In [None]:
# Embedding -> LSTM -> dropout -> max over time -> batch norm -> softmax.
model = Sequential([
    Embedding(
        input_dim=bow_data.shape[1],
        output_dim=embedding_dim,
        input_length=100,
    ),
    # return_sequences=True keeps the per-step outputs that the pooling layer reduces.
    LSTM(units=lstm_units, return_sequences=True),
    Dropout(rate=0.5),
    GlobalMaxPooling1D(),
    BatchNormalization(),
    Dense(units=output_units, activation='softmax'),
])

In [None]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train on the undersampled training split, holding back 20% of it for validation.
model.fit(
    X_train_rus,
    y_train_rus,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
 520/4345 [==>...........................] - ETA: 18:51 - loss: 0.6783 - accuracy: 0.7093

In [None]:
# Predicted class = index of the largest softmax probability in each row.
y_pred = model.predict(X_test_rus)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 on the undersampled test split.
print(classification_report(y_test_rus, y_pred_classes))

## Metrics
1. Per-Class Metrics:
Precision, recall, and F1-score for each class (e.g., for each label in your classification task).

2. Macro Avg:
The unweighted average of precision, recall, and F1-score across all classes. Each class contributes equally to this average.

3. Weighted Avg:
The weighted average of precision, recall, and F1-score across all classes. Each class's contribution is weighted by its support (the number of true instances).

4. Micro Avg:
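The precision, recall, and F1-score calculated globally by counting the total true positives, false negatives, and false positives across all classes. For a single-label multiclass task such as this one, the micro average is equal to accuracy, which is why the classification reports in this notebook show an `accuracy` row instead of a `micro avg` row.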

5. Accuracy:
The overall accuracy of the model, calculated as the number of correctly classified samples divided by the total number of samples.

## LSTM (multi layer) with randomly undersampled data (test size = 0.1)

In [None]:
# 90/10 train/test split of the undersampled data.
X_train_rus, X_test_rus, y_train_rus, y_test_rus = train_test_split(
    X_rus,
    y_rus,
    test_size=0.1,
    random_state=0,
)

In [None]:
# Model hyperparameters for this experiment.
output_units = 3     # width of the final softmax layer
lstm_units = 64      # hidden units in the LSTM layer
embedding_dim = 50   # length of each word-embedding vector

In [None]:
# Embedding -> LSTM -> dropout -> max over time -> batch norm -> softmax.
model = Sequential([
    Embedding(
        input_dim=bow_data.shape[1],
        output_dim=embedding_dim,
        input_length=100,
    ),
    # return_sequences=True keeps the per-step outputs that the pooling layer reduces.
    LSTM(units=lstm_units, return_sequences=True),
    Dropout(rate=0.5),
    GlobalMaxPooling1D(),
    BatchNormalization(),
    Dense(units=output_units, activation='softmax'),
])

In [None]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
optimizer = Adam(learning_rate=0.001)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train on the undersampled training split, holding back 20% of it for validation.
model.fit(
    X_train_rus,
    y_train_rus,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e5cf37d3760>

In [None]:
# Predicted class = index of the largest softmax probability in each row.
y_pred = model.predict(X_test_rus)
y_pred_classes = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 on the undersampled test split.
print(classification_report(y_test_rus, y_pred_classes))

              precision    recall  f1-score   support

           0       0.88      0.61      0.72      7183
           1       0.84      0.72      0.77      7329
           2       0.64      0.93      0.76      7210

    accuracy                           0.75     21722
   macro avg       0.79      0.75      0.75     21722
weighted avg       0.79      0.75      0.75     21722

