In [None]:
import os
import pickle
import re
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
warnings.filterwarnings('ignore')

# Load Data
# The dataset root defaults to the original local folder but can be redirected
# by setting the SENTIMENT_DATA_DIR environment variable, so the notebook is
# not tied to one machine's directory layout.
DATA_DIR = Path(os.environ.get(
    "SENTIMENT_DATA_DIR",
    r"C:/Desktop/ML bootcamp/multi-class-sentiment-analysis/data",
))

# Both files are tab-separated with a header row.
train_df = pd.read_csv(DATA_DIR / "train" / "train.tsv", sep='\t', header=0)
test_df = pd.read_csv(DATA_DIR / "test" / "test.tsv", sep='\t', header=0)

print(f"Training data: {train_df.shape}")
print(f"Test data: {test_df.shape}")

In [None]:
# EDA - Sentiment Distribution
sentiment_labels = ['Negative', 'Somewhat Negative', 'Neutral', 'Somewhat Positive', 'Positive']
bar_colors = ['red', 'orange', 'yellow', 'lightgreen', 'green']

# Reindex over all five class ids so a class that is absent from the data still
# yields a zero-height bar; without this the bar call fails whenever fewer than
# five distinct labels are present.
sentiment_counts = (
    train_df['Sentiment']
    .value_counts()
    .reindex(range(len(sentiment_labels)), fill_value=0)
)

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(range(len(sentiment_labels)), sentiment_counts.values, color=bar_colors)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment Class')
ax.set_ylabel('Count')
ax.set_xticks(range(len(sentiment_labels)))
ax.set_xticklabels(sentiment_labels)
fig.savefig("sentiment_distribution.png")
plt.show()

In [None]:
# We balance the classes so the model learns positive and negative features equally.
def balance_data(df, target_size=20000, n_classes=5, random_state=42):
    """Return a shuffled copy of ``df`` with the same number of rows per class.

    Each class ``0 .. n_classes - 1`` of the ``'Sentiment'`` column is sampled
    with replacement to exactly ``target_size`` rows, so larger classes are
    cut down and smaller ones are repeated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Sentiment'`` column with integer class ids.
    target_size : int, default 20000
        Number of rows drawn for every class.
    n_classes : int, default 5
        Number of class ids to balance, starting at 0.
    random_state : int, default 42
        Seed used for the per-class draws and for the final shuffle, making
        the result reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        ``n_classes * target_size`` rows in shuffled order with a fresh
        ``0..N-1`` index.

    Raises
    ------
    ValueError
        Propagated from pandas when a class has no rows to sample from.
    """
    # Collect the per-class samples first and concatenate once, instead of
    # growing a DataFrame inside the loop.
    resampled_classes = [
        df[df['Sentiment'] == label].sample(
            n=target_size, replace=True, random_state=random_state
        )
        for label in range(n_classes)
    ]
    balanced_df = pd.concat(resampled_classes)

    # Seeded shuffle: the class blocks are interleaved the same way every run.
    return balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Build the class-balanced frame and show how many rows each class now has.
train_df_balanced = balance_data(train_df)

balanced_class_counts = train_df_balanced['Sentiment'].value_counts().sort_index()
print("Balanced Sentiment distribution:")
print(balanced_class_counts)

In [None]:
# Text Preprocessing
def clean_text(text):
    """Normalise a phrase for tokenisation.

    Lower-cases the text, strips every character that is neither a word
    character nor whitespace, removes digit runs and collapses all whitespace
    to single spaces.

    Parameters
    ----------
    text : str or any
        Raw phrase. Non-string values (for example ``None`` or the float NaN
        that pandas uses for missing cells) are treated as an empty phrase
        instead of raising ``AttributeError``.

    Returns
    -------
    str
        The cleaned phrase; ``''`` when nothing is left or the input was not
        a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation / symbols
    text = re.sub(r'\d+', '', text)      # drop digit runs
    # split() with no argument also trims the ends and merges repeated whitespace.
    return ' '.join(text.split())

# Initialize preprocessor
max_features = 20000
max_len = 100

# Train-Validation Split
# Split the raw rows BEFORE any oversampling: balancing first would place copies
# of the same row on both sides of the split and inflate validation scores.
train_part_df, val_part_df = train_test_split(
    train_df, test_size=0.2, random_state=42, stratify=train_df['Sentiment']
)

# Only the training portion is class-balanced; validation keeps the original
# class distribution so its metrics reflect the real data.
train_part_balanced = balance_data(train_part_df)

# Fit the vocabulary on training phrases only, so words that occur solely in
# the validation rows are mapped to the OOV token just like unseen test words.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(train_part_df['Phrase'].apply(clean_text))


def encode_phrases(phrases):
    """Clean, tokenise and post-pad/truncate a Series of phrases to ``max_len`` ids."""
    sequences = tokenizer.texts_to_sequences(phrases.apply(clean_text))
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


# Full (unbalanced, unsplit) encoding, kept under the original names for any
# later cell that still refers to them.
X_train = encode_phrases(train_df['Phrase'])
y_train = train_df['Sentiment'].values

X_train_split = encode_phrases(train_part_balanced['Phrase'])
y_train_split = train_part_balanced['Sentiment'].values
X_val_split = encode_phrases(val_part_df['Phrase'])
y_val_split = val_part_df['Sentiment'].values

In [None]:
# Model Architecture
def create_model(vocab_size, max_len=100, embedding_dim=100, num_classes=5):
    """Build and compile the bidirectional-LSTM sentiment classifier.

    Architecture: embedding -> BiLSTM(128, per-step outputs) -> global max
    pooling over time -> Dense(128, relu) -> Dropout(0.5) -> softmax.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table; must be larger than the
        highest token id fed to the model.
    max_len : int, default 100
        Length of the padded input sequences.
    embedding_dim : int, default 100
        Width of each embedding vector.
    num_classes : int, default 5
        Number of output classes.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam, sparse categorical cross-entropy (labels are
        expected as integer class ids) and an accuracy metric.
    """
    inputs = Input(shape=(max_len,))
    # `input_length` is deliberately not passed: the Input layer above already
    # fixes the sequence length, and the argument is deprecated in recent Keras.
    embedded = Embedding(vocab_size, embedding_dim)(inputs)
    # return_sequences=True keeps one vector per time step for the pooling layer.
    sequence_features = Bidirectional(LSTM(128, return_sequences=True, dropout=0.3))(embedded)
    pooled = tf.keras.layers.GlobalMaxPooling1D()(sequence_features)
    hidden = Dense(128, activation='relu')(pooled)
    regularised = Dropout(0.5)(hidden)
    outputs = Dense(num_classes, activation='softmax')(regularised)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Train Multiple Models
# Seed NumPy and TensorFlow so weight initialisation, dropout masks and batch
# shuffling start from the same state on every run.
np.random.seed(42)
tf.random.set_seed(42)

models = {}
histories = {}
callbacks = [
    # Stop once validation loss has not improved for 5 epochs and roll back to
    # the best weights seen.
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    # Multiply the learning rate by 0.2 after 2 epochs without improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2)
]

# BiLSTM
# The tokenizer is capped at `max_features` ids, so embedding rows beyond that
# cap could never be looked up; size the table by whichever bound is smaller.
model1 = create_model(min(max_features, len(tokenizer.word_index) + 1))
history1 = model1.fit(
    X_train_split, y_train_split,
    validation_data=(X_val_split, y_val_split),
    epochs=30, batch_size=64, callbacks=callbacks, verbose=1,
)
models['BiLSTM'] = model1
histories['BiLSTM'] = history1

# CNN-LSTM
def create_cnn_lstm(vocab_size, max_len=100, embedding_dim=100, num_classes=5):
    """Build and compile the Conv1D + LSTM sentiment classifier.

    Architecture: embedding -> Conv1D(128 filters, width 5, relu) ->
    MaxPooling1D(5) -> LSTM(64) -> softmax.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table; must be larger than the
        highest token id fed to the model.
    max_len : int, default 100
        Length of the padded input sequences.
    embedding_dim : int, default 100
        Width of each embedding vector.
    num_classes : int, default 5
        Number of output classes.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with Adam, sparse categorical cross-entropy (labels are
        expected as integer class ids) and an accuracy metric.
    """
    inputs = Input(shape=(max_len,))
    # `input_length` is deliberately not passed: the Input layer above already
    # fixes the sequence length, and the argument is deprecated in recent Keras.
    embedded = Embedding(vocab_size, embedding_dim)(inputs)
    local_features = tf.keras.layers.Conv1D(128, 5, activation='relu')(embedded)
    # Pooling shortens the sequence the LSTM has to step through.
    downsampled = tf.keras.layers.MaxPooling1D(5)(local_features)
    summary = LSTM(64)(downsampled)
    outputs = Dense(num_classes, activation='softmax')(summary)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# The tokenizer is capped at `max_features` ids, so embedding rows beyond that
# cap could never be looked up; size the table by whichever bound is smaller.
model2 = create_cnn_lstm(min(max_features, len(tokenizer.word_index) + 1))
history2 = model2.fit(
    X_train_split, y_train_split,
    validation_data=(X_val_split, y_val_split),
    epochs=30, batch_size=64, callbacks=callbacks, verbose=1,
)
models['CNN_LSTM'] = model2
histories['CNN_LSTM'] = history2

In [None]:
# Model Evaluation
# Class names and ids are the same for every model, so define them once.
target_names = ['Negative', 'Somewhat Neg', 'Neutral', 'Somewhat Pos', 'Positive']
class_labels = list(range(len(target_names)))

results = {}  # model name -> validation accuracy
for name, model in models.items():
    print(f"\n{'='*20} Evaluating Model: {name} {'='*20}")

    # Predicted class = index of the highest softmax probability.
    y_pred_probs = model.predict(X_val_split)
    y_pred = np.argmax(y_pred_probs, axis=1)

    accuracy = accuracy_score(y_val_split, y_pred)
    results[name] = accuracy
    print(f"Overall Accuracy: {accuracy:.4f}")

    print("\nDetailed Sentiment Analysis Report:")
    # Passing `labels` pins all five classes: without it, a class missing from
    # both the truth and the predictions makes `target_names` the wrong length.
    print(classification_report(y_val_split, y_pred,
                                labels=class_labels, target_names=target_names))

    # Confusion Matrix Visualization
    # Same `labels` so the matrix is always 5x5 and lines up with the tick names.
    cm = confusion_matrix(y_val_split, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=target_names, yticklabels=target_names, ax=ax)
    ax.set_title(f'Confusion Matrix - {name}')
    ax.set_ylabel('Actual Sentiment')
    ax.set_xlabel('Predicted Sentiment')
    plt.show()

In [None]:
# Save Best Model
# `best_model` holds the NAME (key into `models`) of the model with the highest
# validation accuracy recorded in `results`.
best_model = max(results, key=results.get)

# open() does not create missing parent directories, so make sure the output
# folder exists up front rather than depending on the save call to create it.
os.makedirs('models', exist_ok=True)

models[best_model].save('models/best_model.h5')

# The fitted tokenizer is stored next to the model so that new text can be
# encoded with the same vocabulary at inference time.
with open('models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)