In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import tensorflow as tf
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.optim import AdamW
import importlib.metadata

# Show which versions of the deep-learning stacks are installed; useful when
# reproducing or debugging a run.
print("TensorFlow version:", tf.__version__)
print("Transformers version:", importlib.metadata.version("transformers"))
print("PyTorch version:", torch.__version__)

# Fetch the NLTK resources quietly. The file imports word_tokenize, stopwords
# and WordNetLemmatizer; presumably 'punkt'/'punkt_tab' back the tokenizer,
# 'stopwords' the stop-word list and 'wordnet' the lemmatizer -- verify against
# the installed NLTK version.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_nltk_resource, quiet=True)

class SentimentAnalysisPipeline:
    """Compare four sentiment classifiers on a labelled review dataset.

    The pipeline cleans raw review text, builds model-specific features
    (TF-IDF vectors, padded integer sequences, BERT token ids) and trains and
    evaluates Logistic Regression, Multinomial Naive Bayes, an LSTM and a
    fine-tuned ``bert-base-uncased`` classifier. Every ``train_*`` method
    returns a dict with the keys ``'accuracy'``, ``'f1_score'`` (weighted) and
    ``'confusion_matrix'``.
    """

    # Integer class ids in the order used for confusion-matrix rows/columns
    # (0 = negative, 1 = neutral, 2 = positive, see load_and_preprocess_data).
    _CLASS_IDS = (0, 1, 2)

    def __init__(self):
        """Create the text-processing helpers shared by all models.

        Loading the BERT tokenizer may download files on first use.
        """
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.tokenizer = Tokenizer(num_words=5000)
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def clean_text(self, text):
        """Clean and preprocess text data.

        Lower-cases the text, strips every character that is not a letter or
        whitespace, tokenizes, drops English stop words and lemmatizes the
        remaining tokens.

        Args:
            text: Raw review. Non-string values are converted with ``str``.
                Missing values (``None``/NaN) yield an empty string instead of
                the literal token ``"nan"``.

        Returns:
            The cleaned tokens joined by single spaces.
        """
        # A missing review must not turn into the word "nan" and end up in the
        # vocabulary of every model.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''

        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return ' '.join(
            self.lemmatizer.lemmatize(token)
            for token in word_tokenize(text)
            if token not in self.stop_words
        )

    def load_and_preprocess_data(self, dataset_path):
        """Load and preprocess the dataset.

        Args:
            dataset_path: Path (or any source accepted by ``pandas.read_csv``)
                of a CSV file with ``review_text`` and ``sentiment`` columns.

        Returns:
            The loaded DataFrame with two added columns: ``cleaned_review``
            (output of :meth:`clean_text`) and ``label`` (2 = positive,
            1 = neutral, 0 = negative).

        Raises:
            ValueError: If a required column is missing or a sentiment value
                is not one of 'positive', 'neutral' or 'negative' (matched
                case-insensitively, ignoring surrounding whitespace).
        """
        df = pd.read_csv(dataset_path)
        print("DataFrame columns:", df.columns)

        if 'review_text' not in df.columns or 'sentiment' not in df.columns:
            raise ValueError("Dataset must contain 'review_text' and 'sentiment' columns")

        # Validate the labels *before* the comparatively expensive text
        # cleaning so that a bad file fails fast.
        sentiment_map = {'positive': 2, 'neutral': 1, 'negative': 0}
        normalized = df['sentiment'].astype(str).str.strip().str.lower()
        labels = normalized.map(sentiment_map)
        if labels.isna().any():
            raise ValueError("Invalid sentiment labels found. Expected 'positive', 'neutral', or 'negative'.")

        df['cleaned_review'] = df['review_text'].apply(self.clean_text)
        df['label'] = labels

        return df

    def prepare_data_for_traditional_models(self, df):
        """Prepare data for Logistic Regression and Naive Bayes.

        The rows are split 80/20 first and the TF-IDF vectorizer is fitted on
        the training rows only; fitting it on the full dataset would leak
        test-set vocabulary and document frequencies into training.

        Args:
            df: DataFrame with ``cleaned_review`` and ``label`` columns.

        Returns:
            ``[X_train, X_test, y_train, y_test]`` where the ``X`` entries are
            dense TF-IDF arrays.
        """
        texts = df['cleaned_review'].tolist()
        y = df['label'].values
        train_texts, test_texts, y_train, y_test = train_test_split(
            texts, y, test_size=0.2, random_state=42)

        X_train = self.vectorizer.fit_transform(train_texts).toarray()
        X_test = self.vectorizer.transform(test_texts).toarray()
        return [X_train, X_test, y_train, y_test]

    def prepare_data_for_lstm(self, df, max_sequence_length=100):
        """Prepare data for LSTM model.

        As with the TF-IDF features, the word index is learned from the
        training rows only to avoid leaking test-set vocabulary.

        Args:
            df: DataFrame with ``cleaned_review`` and ``label`` columns.
            max_sequence_length: Length every sequence is padded/truncated to.

        Returns:
            ``[X_train, X_test, y_train, y_test]`` where the ``X`` entries are
            padded integer sequences.
        """
        texts = df['cleaned_review'].tolist()
        y = df['label'].values
        train_texts, test_texts, y_train, y_test = train_test_split(
            texts, y, test_size=0.2, random_state=42)

        self.tokenizer.fit_on_texts(train_texts)
        X_train = pad_sequences(
            self.tokenizer.texts_to_sequences(train_texts), maxlen=max_sequence_length)
        X_test = pad_sequences(
            self.tokenizer.texts_to_sequences(test_texts), maxlen=max_sequence_length)
        return [X_train, X_test, y_train, y_test]

    def prepare_data_for_bert(self, df, max_length=128):
        """Prepare data for BERT model.

        Args:
            df: DataFrame with ``cleaned_review`` and ``label`` columns.
            max_length: Token length every review is padded/truncated to.

        Returns:
            ``(train_dataset, test_dataset)``: an 80/20 random split (seeded
            with 42) of a ``TensorDataset`` holding input ids, attention masks
            and labels.
        """
        # NOTE(review): BERT receives the stop-word-stripped, lemmatized text;
        # feeding the raw review may work better -- confirm before changing.
        # One batched tokenizer call replaces the per-review `encode_plus`
        # loop (which the transformers docs mark as deprecated in favour of
        # calling the tokenizer directly).
        encoded = self.bert_tokenizer(
            df['cleaned_review'].astype(str).tolist(),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors
        )
        input_ids = encoded['input_ids']
        attention_masks = encoded['attention_mask']
        labels = torch.tensor(df['label'].values, dtype=torch.long)

        dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
        train_size = int(0.8 * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
        )

        return train_dataset, test_dataset

    @staticmethod
    def _evaluate(y_true, y_pred):
        """Build the metrics dict shared by every ``train_*`` method."""
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_score': f1_score(y_true, y_pred, average='weighted'),
            # Fix the label set so the matrix is always 3x3, even when a class
            # is absent from the test split.
            'confusion_matrix': confusion_matrix(
                y_true, y_pred, labels=list(SentimentAnalysisPipeline._CLASS_IDS)),
        }

    def train_logistic_regression(self, X_train, X_test, y_train, y_test):
        """Train and evaluate Logistic Regression model.

        Returns:
            Dict with ``'accuracy'``, ``'f1_score'`` and ``'confusion_matrix'``
            computed on the test split.
        """
        # `multi_class` is deprecated in recent scikit-learn releases; the
        # default solver already fits a multinomial model for 3 classes.
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        return self._evaluate(y_test, model.predict(X_test))

    def train_naive_bayes(self, X_train, X_test, y_train, y_test):
        """Train and evaluate Naive Bayes model.

        Returns:
            Dict with ``'accuracy'``, ``'f1_score'`` and ``'confusion_matrix'``
            computed on the test split.
        """
        model = MultinomialNB()
        model.fit(X_train, y_train)
        return self._evaluate(y_test, model.predict(X_test))

    def train_lstm(self, X_train, X_test, y_train, y_test, vocab_size=5000,
                  max_sequence_length=100, epochs=5, batch_size=32):
        """Train and evaluate LSTM model.

        Args:
            X_train, X_test: Padded integer sequences.
            y_train, y_test: Integer class labels.
            vocab_size: Input dimension of the embedding layer.
            max_sequence_length: Unused; kept for backward compatibility.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.

        Returns:
            Dict with ``'accuracy'``, ``'f1_score'`` and ``'confusion_matrix'``
            computed on the test split.
        """
        model = Sequential([
            Embedding(vocab_size, 100),
            LSTM(128, return_sequences=False),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dropout(0.2),
            Dense(3, activation='softmax')
        ])

        model.compile(optimizer='adam',
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])

        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                 validation_split=0.2, verbose=0)

        # The softmax output has one column per class; take the most likely.
        y_pred = np.argmax(model.predict(X_test, verbose=0), axis=1)
        return self._evaluate(y_test, y_pred)

    def train_bert(self, train_dataset, test_dataset, max_length=128,
                   epochs=3, batch_size=8, learning_rate=2e-5):
        """Train and evaluate BERT model.

        Args:
            train_dataset, test_dataset: Datasets yielding
                ``(input_ids, attention_mask, label)`` tensors.
            max_length: Unused; kept for backward compatibility.
            epochs: Number of fine-tuning epochs.
            batch_size: Mini-batch size for training and evaluation.
            learning_rate: AdamW learning rate.

        Returns:
            Dict with ``'accuracy'``, ``'f1_score'`` and ``'confusion_matrix'``
            computed on ``test_dataset``.
        """
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=3
        ).to(self.device)

        optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

        # Training loop
        model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch in train_loader:
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
                optimizer.zero_grad()
                # Passing `labels` makes the model return the loss directly.
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()
            print(f"Epoch {epoch+1}, Average Loss: {total_loss / len(train_loader):.4f}")

        # Evaluation
        model.eval()
        y_true = []
        y_pred = []
        with torch.no_grad():
            for batch in test_loader:
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1)
                y_true.extend(labels.cpu().tolist())
                y_pred.extend(preds.cpu().tolist())

        return self._evaluate(np.array(y_true), np.array(y_pred))

    def plot_confusion_matrix(self, cm, model_name, save_path=None):
        """Plot confusion matrix.

        Args:
            cm: Integer confusion matrix (rows = true, columns = predicted).
            model_name: Name shown in the plot title.
            save_path: If given, the figure is written there and closed;
                otherwise it is shown interactively.
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        if save_path:
            plt.savefig(save_path)
            plt.close()
        else:
            plt.show()

def main(dataset_path='bike_rental_reviews.csv'):
    """Train all four models on one dataset and report their metrics.

    Prints accuracy and weighted F1 for each model and saves one confusion
    matrix image per model (``lr_cm.png``, ``nb_cm.png``, ``lstm_cm.png``,
    ``bert_cm.png``) in the current working directory.

    Args:
        dataset_path: CSV file with ``review_text`` and ``sentiment`` columns.
            Defaults to the path used by the original notebook.
    """
    # Initialize pipeline
    pipeline = SentimentAnalysisPipeline()

    # Load and preprocess data
    df = pipeline.load_and_preprocess_data(dataset_path)

    # Train and evaluate traditional models
    X_train_trad, X_test_trad, y_train_trad, y_test_trad = \
        pipeline.prepare_data_for_traditional_models(df)

    lr_results = pipeline.train_logistic_regression(
        X_train_trad, X_test_trad, y_train_trad, y_test_trad)
    nb_results = pipeline.train_naive_bayes(
        X_train_trad, X_test_trad, y_train_trad, y_test_trad)

    # Train and evaluate LSTM
    X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = \
        pipeline.prepare_data_for_lstm(df)
    lstm_results = pipeline.train_lstm(
        X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm)

    # Train and evaluate BERT
    train_dataset, test_dataset = pipeline.prepare_data_for_bert(df)
    bert_results = pipeline.train_bert(train_dataset, test_dataset)

    # (display name, confusion-matrix file, metrics) for each trained model,
    # in reporting order.
    reports = [
        ('Logistic Regression', 'lr_cm.png', lr_results),
        ('Naive Bayes', 'nb_cm.png', nb_results),
        ('LSTM', 'lstm_cm.png', lstm_results),
        ('BERT', 'bert_cm.png', bert_results),
    ]

    # Print results
    for model_name, _, metrics in reports:
        print(f"{model_name} Results:",
              f"Accuracy: {metrics['accuracy']:.4f}, F1-Score: {metrics['f1_score']:.4f}")

    # Plot confusion matrices
    for model_name, image_path, metrics in reports:
        pipeline.plot_confusion_matrix(metrics['confusion_matrix'],
                                       model_name, image_path)


# Run the full comparison only when executed as a script / notebook cell.
if __name__ == '__main__':
    main()

TensorFlow version: 2.19.0
Transformers version: 4.55.2
PyTorch version: 2.8.0+cu126
DataFrame columns: Index(['review_text', 'sentiment'], dtype='object')




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Average Loss: 0.0084
Epoch 2, Average Loss: 0.0026
Epoch 3, Average Loss: 0.0000
Logistic Regression Results: Accuracy: 1.0000, F1-Score: 1.0000
Naive Bayes Results: Accuracy: 1.0000, F1-Score: 1.0000
LSTM Results: Accuracy: 1.0000, F1-Score: 1.0000
BERT Results: Accuracy: 1.0000, F1-Score: 1.0000


# Sentiment Analysis Model Evaluation Report
This report evaluates four models—Logistic Regression, Naive Bayes, LSTM, and BERT—trained on the bike_rental_reviews.csv dataset for sentiment analysis, using accuracy, F1-score, and confusion matrices. The dataset includes review_text and sentiment columns, with labels mapped as positive (2), neutral (1), and negative (0). Models were evaluated on a 20% test split in Google Colab (TensorFlow 2.19.0, transformers 4.55.2, PyTorch 2.8.0+cu126).

1. Logistic Regression:


* Accuracy: 1.0000
* F1-Score: 1.0000 (weighted)
* Confusion Matrix: Saved as lr_cm.png. Perfect classification suggests overfitting or data issues (e.g., small dataset, leakage).

2. Naive Bayes:
* Accuracy: 1.0000
* F1-Score: 1.0000 (weighted)
* Confusion Matrix: Saved as nb_cm.png. Perfect predictions indicate potential dataset or preprocessing problems.

3. LSTM:

* Accuracy: 1.0000
* F1-Score: 1.0000 (weighted)
* Confusion Matrix: Saved as lstm_cm.png. Perfect performance points to data leakage or a dataset of insufficient complexity (e.g., reviews that are trivially separable).

4. BERT:
* Accuracy: 1.0000
* F1-Score: 1.0000 (weighted)
* Confusion Matrix: Saved as bert_cm.png. Perfect scores and 0.0000 loss by Epoch 3 suggest overfitting or label issues.

# Observations:

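1. Perfect scores (1.0000) across all four models, from Naive Bayes to BERT, are unrealistic for real-world reviews. They indicate a problem with the data or the evaluation setup (a small dataset, data leakage, imbalanced labels, or preprocessing errors) rather than genuinely perfect models.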

2. Execution took >50 minutes, driven by BERT training (bert-base-uncased).
