# Interactive Customer Feedback Sentiment Predictor
This project aims to build a sentiment analysis model using the Amazon Reviews Dataset. The objective is to predict customer sentiment based on review text.



In [10]:
# Importing libraries
import os
import re
import time
import numpy as np
import pandas as pd
import joblib
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from transformers import (
    DistilBertTokenizer,
    DistilBertModel,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import StepLR
from transformers import AdamW
from tqdm import tqdm



# Load the dataset
df = pd.read_csv('Reviews.csv')

print(df.shape)
print(80*'*')
print(df.info())
print(80*'*')
print(df.head())



(568454, 10)
********************************************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB
None
********************************************************************************
   Id   ProductId     

Now we have the data, we can proceed with the next steps for building the **Interactive Customer Feedback Sentiment Predictor**.

### Let's start with **Step 1: Data Preprocessing**.

1. **Data Cleaning and Dropping Unnecessary Columns**:
   We'll drop irrelevant columns like `Id`, `ProductId`, `UserId`, `ProfileName`, `HelpfulnessNumerator`, `HelpfulnessDenominator`, `Time`, and `Summary`.

In [4]:
# Drop irrelevant columns
df_cleaned = df.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 
                      'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis=1)

# Check for missing values
print(df_cleaned.isnull().sum())


Score    0
Text     0
dtype: int64


Great! There are no missing values in the `Score` and `Text` columns, so we can proceed with the next step.

### Step 2: Text Preprocessing for NLP

We’ll clean the text data by:
- Lowercasing the text.
- Removing punctuation, special characters, and numbers.
- Removing stopwords (commonly used words like "the", "is", etc., that don't add much meaning).


In [5]:
# Download necessary NLTK data
# nltk.download('stopwords')
# nltk.download('punkt')

# Define stopwords
stop_words = set(stopwords.words('english'))

# Function for cleaning the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation, numbers, and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Join the words back into a single string
    return ' '.join(words)

# Apply the preprocessing function to the 'Text' column
df_cleaned['cleaned_text'] = df_cleaned['Text'].apply(preprocess_text)

# Preview the cleaned data
df_cleaned[['Text', 'cleaned_text']].head()


Unnamed: 0,Text,cleaned_text
0,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanutsth...
2,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


### Next Step: Sentiment Labeling
We'll create a `Sentiment` column based on the `Score`:
- Scores 4 and 5 → Positive sentiment
- Scores 1 and 2 → Negative sentiment
- Score 3 → Neutral sentiment


In [6]:
def label_sentiment(score):
    if score in [4, 5]:
        return 'Positive'
    elif score in [1, 2]:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment labeling
df_cleaned['Sentiment'] = df_cleaned['Score'].apply(label_sentiment)

# Preview the updated dataset
df_cleaned[['cleaned_text', 'Sentiment']].head()


Unnamed: 0,cleaned_text,Sentiment
0,bought several vitality canned dog food produc...,Positive
1,product arrived labeled jumbo salted peanutsth...,Negative
2,confection around centuries light pillowy citr...,Positive
3,looking secret ingredient robitussin believe f...,Negative
4,great taffy great price wide assortment yummy ...,Positive


### 1. **Splitting the Dataset:**
We'll split the dataset into training and testing sets to train and evaluate our sentiment prediction model.


### 2. **Text Vectorization:**
We’ll convert the text into a format that the model can work with, such as using a **TF-IDF vectorizer**.

In [17]:
# Splitting the data into training and testing sets
X = df_cleaned['cleaned_text']  
y = df_cleaned['Sentiment']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 454763
Testing set size: 113691


In [6]:
# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limiting to top 5000 features for performance

# Fit and transform the training data, transform the test data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Training TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Testing TF-IDF matrix shape: {X_test_tfidf.shape}")




Training TF-IDF matrix shape: (454763, 5000)
Testing TF-IDF matrix shape: (113691, 5000)


###  **Model Training:**
Let’s train a simple classifier like **Logistic Regression** to predict the sentiment based on the TF-IDF features.

In [None]:
# Initialize the logistic regression model
model = LogisticRegression()

# Train the model on the TF-IDF vectors and labels
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model accuracy: 86.69%

Classification Report:
               precision    recall  f1-score   support

    Negative       0.73      0.67      0.70     16181
     Neutral       0.51      0.19      0.27      8485
    Positive       0.90      0.97      0.93     89025

    accuracy                           0.87    113691
   macro avg       0.71      0.61      0.64    113691
weighted avg       0.85      0.87      0.85    113691



The **model accuracy** of ***86.69%*** is a decent starting point, but there is room for improvement, especially in distinguishing between the **Neutral** and **Negative** classes.

The **classification report** reveals the following:

- **Positive reviews** are predicted with strong performance, achieving a high precision of 0.90 and recall of 0.97, which shows the model is highly effective at identifying positive sentiment.
- **Negative reviews** are reasonably well predicted, with a precision of 0.73 and recall of 0.67, though these values suggest some room for improvement.
- **Neutral reviews**, however, show the weakest performance, with a precision of 0.51 and a recall of just 0.19, indicating that the model struggles to accurately classify neutral reviews.


The model has difficulty distinguishing between **Neutral** and **Negative** reviews, which is likely due to class imbalance or other challenges in the data, such as noise or insufficient features for proper classification.

In [9]:
# Let's try to Increase the iterations
model = LogisticRegression(max_iter=200, n_jobs=-1)

# Train the model on the TF-IDF vectors and labels
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model accuracy: 86.69%

Classification Report:
               precision    recall  f1-score   support

    Negative       0.73      0.67      0.70     16181
     Neutral       0.51      0.19      0.27      8485
    Positive       0.90      0.97      0.93     89025

    accuracy                           0.87    113691
   macro avg       0.71      0.61      0.64    113691
weighted avg       0.85      0.87      0.85    113691



In [10]:
# Try a different solver
model = LogisticRegression(solver='saga', n_jobs=-1)

# Train the model on the TF-IDF vectors and labels
model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model accuracy: 86.70%

Classification Report:
               precision    recall  f1-score   support

    Negative       0.74      0.67      0.70     16181
     Neutral       0.51      0.19      0.27      8485
    Positive       0.90      0.97      0.93     89025

    accuracy                           0.87    113691
   macro avg       0.72      0.61      0.64    113691
weighted avg       0.85      0.87      0.85    113691



In [None]:
# Label encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Define models with improved parameters
models = {
    "Random Forest": (RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, class_weight='balanced', n_jobs=-1), {}),  # Adjusted parameters
}

# Train and evaluate each model
for model_name, (model, params) in models.items():
    print(f"Training {model_name}...")
    start_time = time.time()

    if params:
        grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1)
        grid_search.fit(X_train_tfidf, y_train_encoded)
        model = grid_search.best_estimator_
    else:
        model.fit(X_train_tfidf, y_train_encoded)

    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    
    # Decode predictions back to original labels
    y_pred_decoded = label_encoder.inverse_transform(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred_decoded)
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    
    # Print classification report
    print(f"Classification Report for {model_name}:\n", classification_report(y_test, y_pred_decoded))
    print(f"Time taken by {model_name}: {time.time() - start_time:.2f} seconds\n")


Training Logistic Regression...
Logistic Regression Accuracy: 86.68%
Classification Report for Logistic Regression:
               precision    recall  f1-score   support

    Negative       0.73      0.68      0.70     16181
     Neutral       0.49      0.21      0.30      8485
    Positive       0.90      0.96      0.93     89025

    accuracy                           0.87    113691
   macro avg       0.71      0.62      0.64    113691
weighted avg       0.85      0.87      0.85    113691

Time taken by Logistic Regression: 109.33 seconds

Training Random Forest...
Random Forest Accuracy: 89.45%
Classification Report for Random Forest:
               precision    recall  f1-score   support

    Negative       0.92      0.61      0.73     16181
     Neutral       0.98      0.39      0.56      8485
    Positive       0.89      0.99      0.94     89025

    accuracy                           0.89    113691
   macro avg       0.93      0.66      0.74    113691
weighted avg       0.90   

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 80.38%
Classification Report for XGBoost:
               precision    recall  f1-score   support

    Negative       0.87      0.15      0.26     16181
     Neutral       0.71      0.01      0.03      8485
    Positive       0.80      1.00      0.89     89025

    accuracy                           0.80    113691
   macro avg       0.79      0.39      0.39    113691
weighted avg       0.80      0.80      0.73    113691

Time taken by XGBoost: 138.30 seconds

Training SVM...


KeyboardInterrupt: 

### Logistic Regression
- Accuracy: **86.68%** is a solid baseline, though there's room for improvement, particularly in distinguishing between **Neutral** and **Negative** reviews.
- The **classification report** reveals:
  - **Positive reviews** are predicted with high precision (0.90) and recall (0.96), indicating strong performance in identifying positive sentiment.
  - **Negative reviews** show decent performance with precision (0.73) and recall (0.68), though there's still some room for improvement.
  - **Neutral reviews** have notably lower performance, with precision (0.49) and recall (0.21), suggesting the model struggles to properly classify neutral reviews.

This indicates that while the model performs well on Positive reviews, it faces difficulties with **Neutral** and **Negative** reviews, likely due to class imbalance.

---

### Random Forest
- Accuracy: **89.45%**, showing an improvement over Logistic Regression.
- The **classification report** shows:
  - **Positive reviews** are predicted well with precision (0.89) and recall (0.99), indicating excellent performance.
  - **Negative reviews** show decent performance with precision (0.92) and recall (0.61), though recall could be further improved.
  - **Neutral reviews** are still challenging, with precision (0.98) but a low recall (0.39), indicating the model tends to over-predict Neutral reviews, but misses many actual Neutral cases.

The Random Forest model shows better overall performance than Logistic Regression, but still faces challenges with Neutral reviews, likely due to class imbalance.

---

### XGBoost
- Accuracy: **80.38%**, lower than both Logistic Regression and Random Forest.
- The **classification report** reveals:
  - **Positive reviews** are predicted with high precision (0.80) and recall (1.00), but performance on Negative and Neutral reviews is poor.
  - **Negative reviews** have poor recall (0.15) and precision (0.87), indicating significant difficulty in distinguishing them from other classes.
  - **Neutral reviews** show extremely poor recall (0.01), suggesting the model is almost unable to identify Neutral reviews at all.

The XGBoost model has the lowest performance, particularly on **Neutral** reviews, highlighting the challenges of dealing with class imbalance and noisy features.


In [None]:
# Define the fixed parameters and variations
n_estimators = 200
class_weight = 'balanced'

# Define the hyperparameter values
max_depths = [None, 5]        
min_samples_splits = [2, 10]  

results = []

# Iterate through combinations of hyperparameters
for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        print(f"Training Random Forest with max_depth={max_depth}, min_samples_split={min_samples_split}...")
        start_time = time.time()
        
        # Create and train the model
        rf_model = RandomForestClassifier(n_estimators=n_estimators, 
                                           max_depth=max_depth, 
                                           min_samples_split=min_samples_split,
                                           class_weight=class_weight,
                                           n_jobs=-1)
        rf_model.fit(X_train_tfidf, y_train_encoded)

        # Make predictions
        y_pred_rf = rf_model.predict(X_test_tfidf)

        # Decode predictions back to original labels
        y_pred_rf_decoded = label_encoder.inverse_transform(y_pred_rf)

        # Evaluate the model
        accuracy_rf = accuracy_score(y_test, y_pred_rf_decoded)
        report = classification_report(y_test, y_pred_rf_decoded)

        results.append({
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'accuracy': accuracy_rf,
            'report': report
        })

        print(f"Random Forest Accuracy: {accuracy_rf:.4f}")
        print(report)
        print(f"Time taken: {time.time() - start_time:.2f} seconds\n")

# Print all results
print("Results:")
for result in results:
    print(f"Max Depth: {result['max_depth']}, Min Samples Split: {result['min_samples_split']}, "
          f"Accuracy: {result['accuracy']:.4f}")


Random Forest Accuracy:  0.8947937831490619
Classification Report for Random Forest:
               precision    recall  f1-score   support

    Negative       0.92      0.61      0.74     16181
     Neutral       0.97      0.39      0.56      8485
    Positive       0.89      0.99      0.94     89025

    accuracy                           0.89    113691
   macro avg       0.93      0.66      0.74    113691
weighted avg       0.90      0.89      0.88    113691



In [None]:
# Define hyperparameter combinations
combinations = [
    {'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt'},
    {'max_depth': None, 'min_samples_split': 10, 'max_features': 'log2'}
]

# Store results and models
results = []
models = []

# Train and evaluate each combination
for params in combinations:
    rf_model = RandomForestClassifier(
        n_estimators=200, 
        class_weight='balanced', 
        n_jobs=-1, 
        **params
    )
    
    start_time = time.time()
    rf_model.fit(X_train_tfidf, y_train_encoded)
    y_pred_rf = rf_model.predict(X_test_tfidf)

    # Decode predictions back to original labels
    y_pred_rf_decoded = label_encoder.inverse_transform(y_pred_rf)

    # Evaluate the model
    accuracy_rf = accuracy_score(y_test, y_pred_rf_decoded)
    report = classification_report(y_test, y_pred_rf_decoded)

    # Store the results
    results.append({
        'params': params,
        'accuracy': accuracy_rf,
        'report': report,
        'time': time.time() - start_time
    })
    models.append(rf_model)  # Store the trained model

# Print results
for i, result in enumerate(results):
    print(f"Combination {i + 1}: Max Depth: {result['params']['max_depth']}, "
          f"Min Samples Split: {result['params']['min_samples_split']}, "
          f"Max Features: {result['params']['max_features']}")
    print(f"Accuracy: {result['accuracy'] * 100:.2f}%")
    print("Classification Report:\n", result['report'])
    print(f"Time taken: {result['time']:.2f} seconds\n")





# Save the best model based on accuracy
best_model_index = max(range(len(results)), key=lambda i: results[i]['accuracy'])
best_model = models[best_model_index]


# Save the best model
for i, result in enumerate(results):
    if result['params'] == {'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt'}:
        joblib.dump(models[i], 'rf_model_sqrt.joblib')
        print("Model saved as 'rf_model_sqrt.joblib'")

joblib.dump(tfidf, 'tfidf_vectorizer.joblib')  # Save the vectorizer




Combination 1: Max Depth: None, Min Samples Split: 10, Max Features: sqrt
Accuracy: 90.06%
Classification Report:
               precision    recall  f1-score   support

    Negative       0.88      0.67      0.76     16181
     Neutral       0.94      0.40      0.57      8485
    Positive       0.90      0.99      0.94     89025

    accuracy                           0.90    113691
   macro avg       0.91      0.69      0.76    113691
weighted avg       0.90      0.90      0.89    113691

Time taken: 2288.56 seconds

Combination 2: Max Depth: None, Min Samples Split: 10, Max Features: log2
Accuracy: 89.54%
Classification Report:
               precision    recall  f1-score   support

    Negative       0.93      0.61      0.74     16181
     Neutral       0.98      0.39      0.55      8485
    Positive       0.89      1.00      0.94     89025

    accuracy                           0.90    113691
   macro avg       0.93      0.66      0.74    113691
weighted avg       0.90      0.90 

### Random Forest Classifier (TF-IDF)
Using a Random Forest Classifier, we achieved **90.06% accuracy**. While the performance is solid, it does not fully capture the context and nuances of customer feedback, motivating our shift to fine-tuning **DistilBERT**.


---
---

### Fine-Tuning DistilBERT:

I initially attempted to fine-tune **DistilBERT** for the task to explore how leveraging a state-of-the-art transformer model could improve accuracy.I tried to fine-tune the model; However, due to insufficient GPU resources, the training process was **prohibitively slow**, which preventing full fine-tuning of the model.


---
---

**DistilBERT embeddings:**

To still leverage the power of DistilBERT without fully fine-tuning, I used **DistilBERT embeddings** as feature vectors, and then applied **Random Forest** for classification. This approach resulted in an accuracy of **88%**, slightly lower than the Random Forest model alone, which achieved **90%** accuracy.


In [19]:
# Load Tokenizer and Model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Prepare the dataset
df_hf = df_cleaned[["cleaned_text", "Sentiment"]].rename(columns={"cleaned_text": "text", "Sentiment": "label"})
df_hf["label"] = df_hf["label"].map({"negative": 0, "neutral": 1, "positive": 2})


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Remove rows with missing labels
df_hf = df_hf.dropna(subset=["label"])

# Check for missing or invalid labels
if df_hf["label"].isnull().any():
    raise ValueError("Labels contain missing values. Please clean your dataset.")

dataset = Dataset.from_pandas(df_hf)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Rename "label" to "labels" (required by Hugging Face Trainer)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Debugging: Print a sample to inspect inputs
print("Sample tokenized data:", tokenized_datasets[0])

# Split into train/test
train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Use Data Collator for Padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir="./results",  
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)


In [None]:
# Save the trained model
trainer.save_model("./saved_model")


---

**DistilBERT embeddings:**


In [None]:
# Tokenize the text
def tokenize_text(text):
    return tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')

# Tokenize the training and test data
X_train_tokens = [tokenize_text(text) for text in X_train]
X_test_tokens = [tokenize_text(text) for text in X_test]

# Check one example tokenization output
print(X_train_tokens[0])


{'input_ids': tensor([[ 101, 2066, 2116, 2500, 2056, 7107, 4157, 6659, 7107, 4157, 2123, 2102,
         4965,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,

In [11]:
# Label encoding for sentiments
sentiment_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y_train_encoded = y_train.map(sentiment_mapping)
y_test_encoded = y_test.map(sentiment_mapping)

# Verify the mapping
print(y_train_encoded.head())


251349    0
523262    2
224543    2
291632    2
37385     2
Name: Sentiment, dtype: int64


In [None]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Convert tokenized inputs into PyTorch tensors
def create_dataset(tokens, labels):
    input_ids = torch.cat([token['input_ids'] for token in tokens], dim=0)
    attention_masks = torch.cat([token['attention_mask'] for token in tokens], dim=0)
    labels = torch.tensor(labels.values)
    
    dataset = TensorDataset(input_ids, attention_masks, labels)
    return dataset

# Create the DataLoader objects for training and testing
train_dataset = create_dataset(X_train_tokens, y_train_encoded)
test_dataset = create_dataset(X_test_tokens, y_test_encoded)

# Create DataLoader
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)


In [None]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop
epochs = 3
accumulation_steps = 4  

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_preds = 0
    total_preds = 0
    optimizer.zero_grad()

    for i, batch in enumerate(train_loader):
        input_ids, attention_masks, labels = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Calculate accuracy
        preds = torch.argmax(logits, dim=-1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

        total_loss += loss.item()

        if i % 100 == 0:  
            print(f"Epoch {epoch+1}, Batch {i}/{len(train_loader)}: Loss = {loss.item():.4f}")

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_preds / total_preds * 100
    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {accuracy:.2f}%")

Epoch 1, Batch 0/28423: Loss = 1.0795
Epoch 1, Batch 100/28423: Loss = 1.0977
Epoch 1, Batch 200/28423: Loss = 0.6802
Epoch 1, Batch 300/28423: Loss = 0.8028
Epoch 1, Batch 400/28423: Loss = 0.8547
Epoch 1, Batch 500/28423: Loss = 0.8346
Epoch 1, Batch 600/28423: Loss = 0.8413
Epoch 1, Batch 700/28423: Loss = 0.7786
Epoch 1, Batch 800/28423: Loss = 0.6608
Epoch 1, Batch 900/28423: Loss = 0.7753
Epoch 1, Batch 1000/28423: Loss = 0.7750
Epoch 1, Batch 1100/28423: Loss = 0.8219


In [None]:
# Evaluate on the test set
model.eval()
correct_preds = 0
total_preds = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_masks, labels = batch
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        logits = outputs.logits
        
        # Calculate accuracy
        preds = torch.argmax(logits, dim=-1)
        correct_preds += (preds == labels).sum().item()
        total_preds += labels.size(0)

# Calculate test accuracy
test_accuracy = correct_preds / total_preds * 100
print(f"Test Accuracy: {test_accuracy:.2f}%")


In [None]:
# Load DistilBERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Function to extract embeddings in a memory-efficient way
def extract_embeddings_memory_efficient(texts, tokenizer, model, batch_size=8, save_path="train_embeddings_chunk"):
    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[i:i + batch_size]
            encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']
            outputs = model(input_ids, attention_mask=attention_mask)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # CLS token

            # Append each batch to a file
            with open(f"{save_path}_{i // batch_size}.npy", "wb") as f:
                np.save(f, batch_embeddings)

# Combine saved chunks into a single array
def combine_embedding_chunks(save_path, total_chunks):
    embeddings = []
    for chunk_idx in range(total_chunks):
        chunk_path = f"{save_path}_{chunk_idx}.npy"
        with open(chunk_path, "rb") as f:
            chunk_data = np.load(f)
            embeddings.append(chunk_data)
    return np.vstack(embeddings)

# Generate embeddings for training data
if os.path.exists("train_embeddings_combined.npy"):
    X_train_embeddings = np.load("train_embeddings_combined.npy")
else:
    print("Generating embeddings for training data...")
    extract_embeddings_memory_efficient(X_train.tolist(), tokenizer, distilbert, batch_size=8, save_path="train_embeddings_chunk")
    X_train_embeddings = combine_embedding_chunks("train_embeddings_chunk", len(X_train) // 8)
    np.save("train_embeddings_combined.npy", X_train_embeddings)

# Generate embeddings for testing data
if os.path.exists("test_embeddings_combined.npy"):
    X_test_embeddings = np.load("test_embeddings_combined.npy")
else:
    print("Generating embeddings for testing data...")
    extract_embeddings_memory_efficient(X_test.tolist(), tokenizer, distilbert, batch_size=8, save_path="test_embeddings_chunk")
    X_test_embeddings = combine_embedding_chunks("test_embeddings_chunk", len(X_test) // 8)
    np.save("test_embeddings_combined.npy", X_test_embeddings)


Generating embeddings for training data...


100%|██████████| 56846/56846 [4:51:09<00:00,  3.25it/s]  


Generating embeddings for testing data...


100%|██████████| 14212/14212 [1:17:44<00:00,  3.05it/s]


In [None]:
# Synchronize the size of y_train with X_train_embeddings
if len(X_train_embeddings) < len(y_train):
    y_train = y_train[:len(X_train_embeddings)]


if len(X_test_embeddings) < len(y_test):
    y_test = y_test[:len(X_test_embeddings)]

# Verify the sizes are now aligned
print(f"Synchronized sizes: X_train_embeddings: {len(X_train_embeddings)}, y_train: {len(y_train)}")

# Define hyperparameter combinations
combinations = [
    {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt', 'class_weight': 'balanced'}
    ]

# Store results for each combination
results = []

# Train and evaluate each combination
for params in combinations:
    rf = RandomForestClassifier(
        random_state=42, 
        **params
    )
    print(f"Training with parameters: {params}")
    rf.fit(X_train_embeddings, y_train)

    # Predict on the test set
    y_pred = rf.predict(X_test_embeddings)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Store results
    results.append({
        'params': params,
        'accuracy': accuracy,
        'report': report
    })

# Print results
for i, result in enumerate(results):
    print(f"Combination {i + 1}: {result['params']}")
    print(f"Accuracy: {result['accuracy']:.2f}")
    print("Classification Report:")
    print(result['report'])
    print("\n")



Synchronized sizes: X_train_embeddings: 454760, y_train: 454760
Training with parameters: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt', 'class_weight': 'balanced'}
Combination 1: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'max_features': 'sqrt', 'class_weight': 'balanced'}
Accuracy: 0.88
Classification Report:
              precision    recall  f1-score   support

    Negative       0.93      0.49      0.64     16181
     Neutral       0.99      0.38      0.55      8485
    Positive       0.87      1.00      0.93     89022

    accuracy                           0.88    113688
   macro avg       0.93      0.62      0.71    113688
weighted avg       0.89      0.88      0.86    113688



