In [2]:
import pandas as pd

In [3]:
data = pd.read_csv(
    '/content/drive/MyDrive/Data Science with Advanced Gen AI Internship/Internship Tasks/preprocessed_data.csv'
)

# Quick check
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7014 entries, 0 to 7013
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Final Text  7014 non-null   object
 1   Sentiment   7014 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 109.7+ KB


In [4]:
data.head()

Unnamed: 0,Final Text,Sentiment
0,nice product. nice product good quality but pr...,1
1,don t waste your money. they didn t supplied y...,0
2,did not meet expectations. worst product. dama...,0
3,fair. quite o. k. but nowadays the quality of ...,0
4,over priced. over pricedjust 620 ..from retail...,0


In [5]:
from sklearn.model_selection import train_test_split

X = data['Final Text']
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    stratify = y,         # preserves class proportions
                                                    random_state = 42)

# used stratified splitting, so the train and test sets preserve the same proportion of positive and negative samples.

In [6]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print()
print("Train class distribution:\n", y_train.value_counts(normalize=True))
print()
print("Test class distribution:\n", y_test.value_counts(normalize=True))


Train shape: (5611,)
Test shape: (1403,)

Train class distribution:
 Sentiment
1    0.767065
0    0.232935
Name: proportion, dtype: float64

Test class distribution:
 Sentiment
1    0.766928
0    0.233072
Name: proportion, dtype: float64


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [8]:
tfidf = TfidfVectorizer(
    max_features = 5000,   # limiting vocab size
    ngram_range = (1,2),   # unigrams + bigrams
    stop_words = 'english'
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [9]:
X_train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 44757 stored elements and shape (5611, 5000)>

In [10]:
X_test_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10244 stored elements and shape (1403, 5000)>

## Logistic Regression

In [11]:
# model - 1
model = LogisticRegression(random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("----- Simple Logistic Regression -----")

# ===== Train Performance =====
print("\n")
print("-----  Train Performance  -----")
y_pred_train = model.predict(X_train_tfidf)

print("Train Confusion Matrix")
print(confusion_matrix(y_train, y_pred_train))

print("Train Classification Report")
print(classification_report(y_train, y_pred_train, digits=2))

print("\n")

# ===== Test Performance =====
print("-----  Test Performance  -----")
y_pred_test = model.predict(X_test_tfidf)

print("Test Confusion Matrix")
print(confusion_matrix(y_test, y_pred_test))

print("Test Classification Report")
print(classification_report(y_test, y_pred_test, digits=2))


----- Simple Logistic Regression -----


-----  Train Performance  -----
Train Confusion Matrix
[[ 704  603]
 [  63 4241]]
Train Classification Report
              precision    recall  f1-score   support

           0       0.92      0.54      0.68      1307
           1       0.88      0.99      0.93      4304

    accuracy                           0.88      5611
   macro avg       0.90      0.76      0.80      5611
weighted avg       0.89      0.88      0.87      5611



-----  Test Performance  -----
Test Confusion Matrix
[[ 158  169]
 [  32 1044]]
Test Classification Report
              precision    recall  f1-score   support

           0       0.83      0.48      0.61       327
           1       0.86      0.97      0.91      1076

    accuracy                           0.86      1403
   macro avg       0.85      0.73      0.76      1403
weighted avg       0.85      0.86      0.84      1403



In [12]:
# model - 2
lr_balanced = LogisticRegression(random_state=42, class_weight='balanced')
lr_balanced.fit(X_train_tfidf, y_train)

# ===== train performance =====
y_pred_train = lr_balanced.predict(X_train_tfidf)

print("Train Confusion Matrix")
print(confusion_matrix(y_train, y_pred_train))

print("Train Classification Report")
print(classification_report(y_train, y_pred_train, digits=2))

# ===== test performance =====
y_pred_test = lr_balanced.predict(X_test_tfidf)

print("Test Confusion Matrix")
print(confusion_matrix(y_test, y_pred_test))

print("Test Classificatio Report")
print(classification_report(y_test, y_pred_test))


Train Confusion Matrix
[[1100  207]
 [ 475 3829]]
Train Classification Report
              precision    recall  f1-score   support

           0       0.70      0.84      0.76      1307
           1       0.95      0.89      0.92      4304

    accuracy                           0.88      5611
   macro avg       0.82      0.87      0.84      5611
weighted avg       0.89      0.88      0.88      5611

Test Confusion Matrix
[[230  97]
 [153 923]]
Test Classificatio Report
              precision    recall  f1-score   support

           0       0.60      0.70      0.65       327
           1       0.90      0.86      0.88      1076

    accuracy                           0.82      1403
   macro avg       0.75      0.78      0.76      1403
weighted avg       0.83      0.82      0.83      1403



In [13]:
# [[TN  FP]
#  [FN  TP]]

In [14]:
# model - 3
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model_3 = LogisticRegression(random_state=42, class_weight='balanced')

cv_scores = cross_val_score(model_3, X_train_tfidf, y_train, cv=skf, scoring='f1_macro')
# scoring = 'f1_macro' => Computes F1 for each class separately and takes simple average
# This is exactly what we want for imbalanced sentiment analysis.

print(f"CV F1-scores: {cv_scores}")
print(f"Mean CV F1-score: {cv_scores.mean():.4f}")

CV F1-scores: [0.77108495 0.76420399 0.76933757 0.77769983 0.77658213]
Mean CV F1-score: 0.7718


- Using 5-fold cross-validated Logistic Regression optimized for F1-score, the model achieved a stable mean F1-score of ~0.77, indicating good generalization performance despite class imbalance.

In [15]:
from sklearn.model_selection import cross_val_predict

print('Train Performance on CV')
y_train_cv_pred = cross_val_predict(model_3,
                                  X_train_tfidf,
                                  y_train,
                                  cv=skf)

print(classification_report(y_train, y_train_cv_pred))
print('\n')
print('Test Performance on CV')
y_test_cv_pred = cross_val_predict(
    model_3,
    X_test_tfidf,
    y_test,
    cv=skf
)

print(classification_report(y_test, y_test_cv_pred))


Train Performance on CV
              precision    recall  f1-score   support

           0       0.61      0.72      0.66      1307
           1       0.91      0.86      0.88      4304

    accuracy                           0.83      5611
   macro avg       0.76      0.79      0.77      5611
weighted avg       0.84      0.83      0.83      5611



Test Performance on CV
              precision    recall  f1-score   support

           0       0.58      0.63      0.60       327
           1       0.88      0.86      0.87      1076

    accuracy                           0.81      1403
   macro avg       0.73      0.74      0.74      1403
weighted avg       0.81      0.81      0.81      1403



In [16]:
feature_names = tfidf.get_feature_names_out()
feature_names

array(['02 matches', '07', '07 new', ..., 'yr 350', 'yr shuttle', 'yrs'],
      dtype=object)

In [17]:
coefficients = model.coef_[0]
coefficients

array([-0.15211049, -0.10717918, -0.10717918, ..., -0.08891404,
       -0.11577605, -0.10872471])

In [18]:
feature_importance = pd.DataFrame({
    'word': feature_names,
    'coefficient': coefficients
})

feature_importance = feature_importance.sort_values(
    by='coefficient', ascending=False
)


In [19]:
# Top Positive Words (Sentiment = 1)
top_positive_words = feature_importance.head(20)
top_positive_words

# These are the words that strongly push toward positive reviews.

Unnamed: 0,word,coefficient
231,best,4.223695
2985,good quality,3.03736
4667,thanks,2.549029
138,awesome,2.519837
3129,great,2.5062
4607,superb,2.439659
1046,delivery,2.422247
1701,excellent,2.420001
2975,good product,2.333299
3548,nice,2.176484


In [20]:
# Top Negative Words (Sentiment = 0)
top_negative_words = feature_importance.tail(20)
top_negative_words

# These are the words pushing predictions toward negative sentiment.
# These are the pain points of customers who write negative reviews.

Unnamed: 0,word,coefficient
3898,product bad,-1.36484
361,broken,-1.378364
3615,ok,-1.387446
3984,product quality,-1.458955
1382,don,-1.473534
308,box,-1.486427
651,damage,-1.56293
4434,shuttles,-1.606333
3393,low,-1.645917
4494,slow,-1.745932


- Using TF-IDF with Logistic Regression allows direct interpretability through model coefficients. Positive coefficients indicate words contributing to positive sentiment, while negative coefficients contribute to negative sentiment. This provides transparency into model decisions.

## Linear SVM

In [21]:
# SVC(kernel='linear')

In [22]:
from sklearn.svm import LinearSVC

# Model - 1:
svm_model = LinearSVC(random_state=42)

# Train
svm_model.fit(X_train_tfidf, y_train)

# Predictions
y_pred_train = svm_model.predict(X_train_tfidf)
y_pred_test = svm_model.predict(X_test_tfidf)

print("----- Linear SVM (Default) -----")

print("\nTrain Performance")
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train, digits=2))

print("\nTest Performance")
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test, digits=2))


----- Linear SVM (Default) -----

Train Performance
[[1025  282]
 [  56 4248]]
              precision    recall  f1-score   support

           0       0.95      0.78      0.86      1307
           1       0.94      0.99      0.96      4304

    accuracy                           0.94      5611
   macro avg       0.94      0.89      0.91      5611
weighted avg       0.94      0.94      0.94      5611


Test Performance
[[ 190  137]
 [  58 1018]]
              precision    recall  f1-score   support

           0       0.77      0.58      0.66       327
           1       0.88      0.95      0.91      1076

    accuracy                           0.86      1403
   macro avg       0.82      0.76      0.79      1403
weighted avg       0.85      0.86      0.85      1403



In [23]:
# Linear SVM with class_weight='balanced'

svm_2 = LinearSVC(
    class_weight='balanced',
    random_state=42
)

svm_2.fit(X_train_tfidf, y_train)

y_pred_train_bal = svm_2.predict(X_train_tfidf)
y_pred_test_bal = svm_2.predict(X_test_tfidf)

print("----- Linear SVM (Balanced) -----")

print("\nTrain Performance")
print(confusion_matrix(y_train, y_pred_train_bal))
print(classification_report(y_train, y_pred_train_bal, digits=2))

print("\nTest Performance")
print(confusion_matrix(y_test, y_pred_test_bal))
print(classification_report(y_test, y_pred_test_bal, digits=2))


----- Linear SVM (Balanced) -----

Train Performance
[[1181  126]
 [ 262 4042]]
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1307
           1       0.97      0.94      0.95      4304

    accuracy                           0.93      5611
   macro avg       0.89      0.92      0.91      5611
weighted avg       0.93      0.93      0.93      5611


Test Performance
[[216 111]
 [135 941]]
              precision    recall  f1-score   support

           0       0.62      0.66      0.64       327
           1       0.89      0.87      0.88      1076

    accuracy                           0.82      1403
   macro avg       0.75      0.77      0.76      1403
weighted avg       0.83      0.82      0.83      1403



In [24]:
# SVM with CV
import numpy as np

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

svm_cv = LinearSVC(random_state=42)

cv_f1_scores = cross_val_score(
    svm_cv,
    X_train_tfidf,
    y_train,
    scoring='f1_macro',
    cv=cv,
    n_jobs=-1
)

print("Linear SVM (Default) CV F1-scores:", cv_f1_scores)
print("Mean CV F1-score:", np.mean(cv_f1_scores))


Linear SVM (Default) CV F1-scores: [0.76211513 0.77146931 0.78067654 0.76493132 0.77649511]
Mean CV F1-score: 0.7711374817469906


- After performing cross-validation with macro F1-score, both Logistic Regression and Linear SVM achieved comparable performance (~0.77). This indicated that further gains would require richer text representations rather than changing classifiers

## Word2Vec Workflow

- Step 1: Tokenization

In [25]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

tokenized_texts = data['Final Text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


- Step 2: Train Word2Vec Model

In [26]:
!pip install gensim



In [27]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences = tokenized_texts,
    vector_size = 100,
    window = 5,
    min_count = 2,
    workers = 4,
    sg = 1  # Skip-gram -> better for small datasets
)

- Step - 3: Convert Rebiew to Vector

In [28]:
import numpy as np

def document_vector(doc):
  vectors = [
      w2v_model.wv[word] for word in doc if word in w2v_model.wv
  ]

  return np.mean(vectors, axis = 0) if len(vectors) > 0 else np.zeros(w2v_model.vector_size)


In [29]:
X_w2v = np.vstack(tokenized_texts.apply(document_vector))
y = data['Sentiment'].values

In [30]:
# every review is 100 dim-vector

- Step 4: Train Test Split

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_w2v,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 42
)

- Step 5: Logistic Regression on Word2Vec

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

lr_w2v = LogisticRegression(
    class_weight = 'balanced',
    max_iter = 1000,
    random_state = 42
)

lr_w2v.fit(X_train, y_train)

print('===== Train Performances =====')
y_pred_train = lr_w2v.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print(classification_report(y_train, y_pred_train))
print('\n')

print('===== Test Performances =====')
y_pred_test = lr_w2v.predict(X_test)
print(confusion_matrix(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))

===== Train Performances =====
[[ 989  318]
 [ 763 3541]]
              precision    recall  f1-score   support

           0       0.56      0.76      0.65      1307
           1       0.92      0.82      0.87      4304

    accuracy                           0.81      5611
   macro avg       0.74      0.79      0.76      5611
weighted avg       0.84      0.81      0.82      5611



===== Test Performances =====
[[252  75]
 [200 876]]
              precision    recall  f1-score   support

           0       0.56      0.77      0.65       327
           1       0.92      0.81      0.86      1076

    accuracy                           0.80      1403
   macro avg       0.74      0.79      0.76      1403
weighted avg       0.84      0.80      0.81      1403



- Step 6: Cross-Validation (Macro F1)

In [33]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits = 5,
                      shuffle = True,
                      random_state = 42)

cv = cross_val_score(lr_w2v,
                     X_train,
                     y_train,
                     scoring = 'f1_macro',
                     cv = skf,
                     n_jobs = -1)

print("Word2Vec CV F1 scores:", cv)
print("Mean CV Macro F1:", cv.mean())

Word2Vec CV F1 scores: [0.76576337 0.75621587 0.75730806 0.76213455 0.72978222]
Mean CV Macro F1: 0.7542408138255732


- Despite capturing semantic similarity, Word2Vec with average pooling underperformed TF-IDF for this sentiment classification task.

In [34]:
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Pipeline: Scaling + Linear SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', LinearSVC(
        C=1.0,
        class_weight='balanced',
        max_iter=5000,
        random_state=42
    ))
])

# Stratified CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# CV evaluation (Macro F1)
cv_f1_scores = cross_val_score(
    svm_pipeline,
    X_w2v,
    y,
    cv=skf,
    scoring='f1_macro'
)

print("Word2Vec + Linear SVM CV F1 scores:", cv_f1_scores)
print("Mean CV Macro F1:", cv_f1_scores.mean())


Word2Vec + Linear SVM CV F1 scores: [0.75657898 0.78088966 0.75248732 0.78598392 0.80213066]
Mean CV Macro F1: 0.7756141084061655


In [35]:
# SVM is distance- and margin-based → feature scale directly affects the decision boundary
# Logistic Regression is coefficient-based → scale matters less (but still matters in practice)

### Gaussian NB

In [36]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Train performance
print("===== Train Performance =====")
y_train_pred = nb_model.predict(X_train)
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred, digits=2))

# Test performance
print("\n===== Test Performance =====")
y_test_pred = nb_model.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=2))


===== Train Performance =====
[[ 902  405]
 [ 997 3307]]
              precision    recall  f1-score   support

           0       0.47      0.69      0.56      1307
           1       0.89      0.77      0.83      4304

    accuracy                           0.75      5611
   macro avg       0.68      0.73      0.69      5611
weighted avg       0.79      0.75      0.76      5611


===== Test Performance =====
[[228  99]
 [257 819]]
              precision    recall  f1-score   support

           0       0.47      0.70      0.56       327
           1       0.89      0.76      0.82      1076

    accuracy                           0.75      1403
   macro avg       0.68      0.73      0.69      1403
weighted avg       0.79      0.75      0.76      1403



In [38]:
# Gaussian NB with CV
from sklearn.model_selection import StratifiedKFold, cross_val_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    GaussianNB(),
    X_train,
    y_train,
    cv=skf,
    scoring='f1_macro'
)

print("Word2Vec + GaussianNB CV F1 scores:", cv_scores)
print("Mean CV Macro F1:", cv_scores.mean())


Word2Vec + GaussianNB CV F1 scores: [0.70182653 0.68620037 0.68538244 0.69777129 0.68269801]
Mean CV Macro F1: 0.6907757267221302


- Even after trying probabilistic models like Naive Bayes on dense Word2Vec embeddings, performance plateaued. This motivated the shift toward contextual transformers like BERT.

## BERT

In [3]:
!pip install transformers torch accelerate -q

In [4]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

- Loading Preprocessed Data

In [8]:
data = pd.read_csv('/content/drive/MyDrive/Data Science with Advanced Gen AI Internship/Internship Tasks/preprocessed_data.csv')
data.head()

Unnamed: 0,Final Text,Sentiment
0,nice product. nice product good quality but pr...,1
1,don t waste your money. they didn t supplied y...,0
2,did not meet expectations. worst product. dama...,0
3,fair. quite o. k. but nowadays the quality of ...,0
4,over priced. over pricedjust 620 ..from retail...,0


In [9]:
texts = data['Final Text'].astype(str).tolist()
labels = data['Sentiment'].tolist()

- Train, Validation, Test Split

In [7]:
X_temp, X_test, y_temp, y_test = train_test_split(
    texts,
    labels,
    test_size = 0.15,
    stratify = labels,
    random_state = 42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size = 0.176,
    stratify = y_temp,
    random_state = 42
)

- Tokenization

In [10]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(texts):
  return tokenizer(
      texts,
      padding = True,
      truncation = True,
      max_length = 128,
      return_tensors = 'pt'
  )

train_encodings = tokenize(X_train)
val_encodings = tokenize(X_val)
test_encodings = tokenize(X_test)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
train_encodings

{'input_ids': tensor([[  101,  2074, 10166,  ...,     0,     0,     0],
        [  101,  3432, 12476,  ...,     0,     0,     0],
        [  101,  2074, 10166,  ...,     0,     0,     0],
        ...,
        [  101,  3432, 12476,  ...,     0,     0,     0],
        [  101,  2515,  1996,  ...,     0,     0,     0],
        [  101,  6919,  1012,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [13]:
a = 'Hi how are you'
tokenize(a)

{'input_ids': tensor([[ 101, 7632, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [14]:
# working well

- Dataset Wrapper

In [15]:
class ReviewDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, idx):
    item = {k: v[idx] for k, v in self.encodings.items()}
    item['labels'] = torch.tensor(self.labels[idx])

    return item

In [16]:
train_dataset = ReviewDataset(train_encodings, y_train)
val_dataset = ReviewDataset(val_encodings, y_val)
test_dataset = ReviewDataset(test_encodings, y_test)


- Loading BERT Model

In [17]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


- Training Configuration

In [18]:
import torch
torch.cuda.is_available()


True

In [20]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   # ✅ v5.0.0 uses eval_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_steps=100,
    report_to="none"
)


- Trainer Setup

In [21]:
import transformers
print(transformers.__version__)


5.0.0


In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

- Train BERT

In [23]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.314545,0.264384
2,0.247489,0.297125
3,0.203576,0.293349


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=921, training_loss=0.26369876530219627, metrics={'train_runtime': 360.7959, 'train_samples_per_second': 40.835, 'train_steps_per_second': 2.553, 'total_flos': 969103794654720.0, 'train_loss': 0.26369876530219627, 'epoch': 3.0})

We progressively evolved from sparse representations (TF-IDF) to dense embeddings (Word2Vec) and finally to contextual embeddings using BERT, observing consistent improvements in macro F1-score and robustness in sentiment classification.

- Evaluation on Test Set

In [24]:
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)

print("Test Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

print("\nTest Classification Report")
print(classification_report(y_test, y_pred, digits=3))


Test Confusion Matrix
[[191  54]
 [ 51 757]]

Test Classification Report
              precision    recall  f1-score   support

           0      0.789     0.780     0.784       245
           1      0.933     0.937     0.935       808

    accuracy                          0.900      1053
   macro avg      0.861     0.858     0.860      1053
weighted avg      0.900     0.900     0.900      1053



- Comparision with Previous Models

| Model                             | Macro F1   |
| --------------------------------- | ---------- |
| TF-IDF + Logistic Regression (CV) | ~0.77      |
| Linear SVM (CV)                   | ~0.77      |
| Word2Vec + LR / SVM               | ~0.75–0.77 |
| Naive Bayes                       | ~0.69      |
| **BERT (Final)**               | **0.86**   |


- We experimented with classical ML models using TF-IDF and Word2Vec embeddings, followed by a Transformer-based encoder (BERT).
- While traditional approaches achieved macro F1-scores around 0.77, the BERT-based model significantly improved performance, achieving a macro F1-score of 0.86 on the test set.
- This demonstrates the effectiveness of contextual embeddings in capturing sentiment nuances in customer reviews.

In [25]:
trainer.save_model()

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]