In [None]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK tokenizer data
# NOTE(review): `word_tokenize` is imported above but is not called in any of
# the cells shown here (tokenisation is done by the regex inside the custom
# vectorizer). If nothing else relies on it, this download can be removed so
# the notebook does not need network access on a fresh kernel.
nltk.download('punkt_tab', quiet=True)

True

In [2]:
class TF_IDF_with_N_Grams_Vectorizer:
    """Small dense TF-IDF vectorizer over word n-grams.

    Terms are lower-cased regex tokens joined with a single space into
    n-grams. The vocabulary keeps the ``max_features`` terms with the highest
    document frequency; ties are broken alphabetically so the column layout is
    identical on every run (important when a model trained on these columns is
    saved and reloaded in a later session).

    Weights are ``tf * idf`` with
    ``tf = count / (number of n-grams in the document)`` and
    ``idf = ln((1 + n_docs) / (1 + df)) + 1``. Rows are not length-normalised.

    Parameters
    ----------
    max_features : int or None
        Maximum vocabulary size (``None`` keeps every term).
    token_pattern : str
        Regular expression whose matches are the tokens.
    ngram_range : tuple(int, int)
        Inclusive ``(min_n, max_n)`` range of n-gram lengths.

    Attributes
    ----------
    vocabulary_ : dict
        Maps term -> column index (filled by ``fit``).
    idf_ : dict
        Maps term -> inverse document frequency (filled by ``fit``).
    """

    def __init__(self, max_features=5000, token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2)):
        self.max_features = max_features
        self.token_pattern = re.compile(token_pattern)
        self.ngram_range = ngram_range
        self.vocabulary_ = {}
        self.idf_ = {}

    def _tokenize(self, text):
        """Return the lower-cased regex tokens of ``text``."""
        return self.token_pattern.findall(text.lower())

    def _generate_ngrams(self, tokens):
        """Return every n-gram of ``tokens`` for each n in ``ngram_range``.

        N-grams are emitted shortest length first, left to right.
        """
        min_n, max_n = self.ngram_range
        return [
            ' '.join(tokens[i:i + n])
            for n in range(min_n, max_n + 1)
            for i in range(len(tokens) - n + 1)
        ]

    def fit(self, raw_documents):
        """Learn the vocabulary and IDF weights from ``raw_documents``.

        Returns ``self`` so calls can be chained.
        """
        doc_count = len(raw_documents)

        # Document frequency: each term counts at most once per document.
        doc_freq = Counter()
        for doc in raw_documents:
            doc_freq.update(set(self._generate_ngrams(self._tokenize(doc))))

        # Sort by descending document frequency, then alphabetically. The
        # secondary key matters: without it, ties at the max_features cut-off
        # are resolved by set iteration order, which depends on per-process
        # string hashing and therefore changes between kernel restarts.
        ranked = sorted(doc_freq.items(), key=lambda item: (-item[1], item[0]))
        ranked = ranked[:self.max_features]

        self.vocabulary_ = {term: idx for idx, (term, _) in enumerate(ranked)}
        self.idf_ = {
            term: np.log((1 + doc_count) / (1 + freq)) + 1.0
            for term, freq in ranked
        }

        return self

    def transform(self, raw_documents):
        """Return a dense ``(n_docs, n_features)`` float32 TF-IDF matrix.

        Terms outside the fitted vocabulary are ignored, but they still count
        towards the document length used as the TF denominator.
        """
        n_docs = len(raw_documents)
        n_features = len(self.vocabulary_)
        X = np.zeros((n_docs, n_features), dtype=np.float32)

        for doc_idx, doc in enumerate(raw_documents):
            ngrams = self._generate_ngrams(self._tokenize(doc))
            if not ngrams:
                # Empty document: the row stays all zeros.
                continue
            total = len(ngrams)

            for term, count in Counter(ngrams).items():
                col = self.vocabulary_.get(term)
                if col is not None:
                    X[doc_idx, col] = (count / total) * self.idf_[term]

        return X

    def fit_transform(self, raw_documents):
        """Fit on ``raw_documents`` and return their TF-IDF matrix."""
        self.fit(raw_documents)
        return self.transform(raw_documents)

In [3]:
# Load the cleaned tweets. A tweet whose text became empty during cleaning is
# read back from CSV as NaN, which would crash the vectorizer on `.lower()`,
# so rows missing either the text or the label are dropped here. The whole
# load is one chained expression so that re-running the cell is idempotent.
df = (
    pd.read_csv("../../../data/clean_Tweets.csv")
    .dropna(subset=["text", "airline_sentiment"])
    .reset_index(drop=True)
)

X = df["text"].astype(str)
y = df["airline_sentiment"]

In [4]:
# Encode the string labels as integers. The encoder is fitted on the original
# column rather than on `y` itself: `y = fit_transform(y)` is not idempotent,
# because a second run of the cell would fit on the already-encoded integers
# and `label_encoder.classes_` would no longer hold the sentiment names that
# the evaluation and plotting cells rely on.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["airline_sentiment"])  # negative=0, neutral=1, positive=2

In [None]:
# Build uni- and bi-gram TF-IDF features, keeping at most 20000 terms.
vectorizer = TF_IDF_with_N_Grams_Vectorizer(ngram_range=(1, 2), max_features=20000)
vectorizer.fit(X)
X_vectorized = vectorizer.transform(X)
print("TF-IDF Matrix Shape:", X_vectorized.shape)

TF-IDF Matrix Shape: (14427, 5000)


In [6]:
# Balance the classes by randomly duplicating minority-class rows.
# NOTE(review): this runs on the full data set, and the train/validation split
# is only made afterwards (random_split in the next cell). Duplicated rows can
# therefore end up on both sides of the split, so validation metrics are likely
# optimistic. Consider splitting first and oversampling the training part only.
sampler = RandomOverSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X_vectorized, y)

In [7]:
# Reshape TF-IDF for GRU (simulate sequence): each row of n_features values is
# cut into `seq_length` consecutive chunks of `feature_dim` values.
seq_length = 50
n_features = X_resampled.shape[1]
feature_dim = n_features // seq_length  # e.g., 20000 / 50 = 400
if feature_dim == 0:
    raise ValueError(
        f"Need at least {seq_length} TF-IDF features to build a sequence, got {n_features}"
    )

# If n_features is not a multiple of seq_length, a plain reshape either fails
# or (when the total element count happens to divide) silently produces a
# different number of rows than there are labels. Drop the trailing remainder
# columns instead; they are the terms with the lowest document frequency.
usable_features = seq_length * feature_dim
X_reshaped = X_resampled[:, :usable_features].reshape(-1, seq_length, feature_dim)
assert X_reshaped.shape[0] == len(y_resampled), "features and labels are misaligned"

# Convert to PyTorch tensors. `as_tensor` avoids an extra copy of the large
# feature matrix and accepts an existing tensor, so re-running this cell is safe.
X_reshaped = torch.as_tensor(X_reshaped, dtype=torch.float32)
y_resampled = torch.as_tensor(y_resampled, dtype=torch.long)


# Dataset and DataLoader
class TweetDataset(Dataset):
    """Pairs each (seq_length, feature_dim) feature tensor with its label."""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


dataset = TweetDataset(X_reshaped, y_resampled)
batch_size = 32
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seeded generator so the 80/20 split is the same on every run.
# NOTE(review): the data were oversampled before this split, so duplicated
# rows can appear in both subsets and inflate validation scores.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [8]:
# GRU Model
class SentimentGRU(nn.Module):
    """Single-layer GRU encoder followed by a two-layer classification head.

    Args:
        input_dim: size of each time step's feature vector.
        hidden_dim: size of the GRU hidden state.
        output_dim: number of output logits (classes).
        dropout: dropout probability applied between the two linear layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.5):
        super().__init__()
        # Recurrent encoder; inputs are laid out as (batch, seq, feature).
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True)
        # Classification head: hidden_dim -> 64 -> output_dim.
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return class logits computed from the GRU's final hidden state."""
        # The second return value of the GRU is the final hidden state with a
        # leading layer axis of size 1, which is dropped here.
        final_state = self.gru(x)[1].squeeze(0)
        activated = self.relu(self.fc1(final_state))
        return self.fc2(self.dropout(activated))

In [9]:
# Initialize model
# Seed PyTorch's global RNG first so that the initial weights (and the batch
# shuffling that later draws from the same RNG) are reproducible across runs.
torch.manual_seed(42)

input_dim = feature_dim  # number of TF-IDF columns per simulated time step
hidden_dim = 128
output_dim = 3  # one logit per sentiment class
model = SentimentGRU(input_dim, hidden_dim, output_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [10]:
# Loss function and optimiser
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Epoch budget and early-stopping state: training stops once the validation
# loss has failed to improve for `patience` consecutive epochs.
num_epochs, patience = 20, 3
best_val_loss = float("inf")
patience_counter = 0

# Per-epoch history, used later for the training curves.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

In [11]:
def _run_epoch(loader, training):
    """Run one pass over `loader`; return (mean per-batch loss, accuracy).

    When `training` is true the model is put in train mode and the optimiser
    is stepped after every batch; otherwise the model is in eval mode and no
    gradients are tracked.
    """
    model.train(training)
    loss_sum, n_correct, n_seen = 0, 0, 0
    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            predicted = torch.max(outputs, 1)[1]
            n_seen += y_batch.size(0)
            n_correct += (predicted == y_batch).sum().item()
    # The loss is averaged over batches (not samples), as recorded in the history.
    return loss_sum / len(loader), n_correct / n_seen


for epoch in range(num_epochs):
    train_loss, train_accuracy = _run_epoch(train_loader, training=True)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy)

    val_loss, val_accuracy = _run_epoch(val_loader, training=False)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}")

    # Checkpoint on improvement; otherwise count towards early stopping.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), "best_gru_model.pt")
        continue

    patience_counter += 1
    if patience_counter >= patience:
        print("Early stopping triggered")
        break

Epoch 1, Train Loss: 1.0991, Val Loss: 1.0987, Train Acc: 0.3398, Val Acc: 0.3322
Epoch 2, Train Loss: 1.0979, Val Loss: 1.0960, Train Acc: 0.3393, Val Acc: 0.3544
Epoch 3, Train Loss: 1.0887, Val Loss: 1.0907, Train Acc: 0.3561, Val Acc: 0.3618
Epoch 4, Train Loss: 1.0770, Val Loss: 1.0805, Train Acc: 0.3721, Val Acc: 0.3717
Epoch 5, Train Loss: 1.0624, Val Loss: 1.0720, Train Acc: 0.3878, Val Acc: 0.3820
Epoch 6, Train Loss: 1.0489, Val Loss: 1.0721, Train Acc: 0.4080, Val Acc: 0.4053
Epoch 7, Train Loss: 1.0328, Val Loss: 1.0683, Train Acc: 0.4214, Val Acc: 0.3923
Epoch 8, Train Loss: 1.0118, Val Loss: 1.0276, Train Acc: 0.4536, Val Acc: 0.4580
Epoch 9, Train Loss: 0.9611, Val Loss: 0.9182, Train Acc: 0.5244, Val Acc: 0.5679
Epoch 10, Train Loss: 0.8332, Val Loss: 0.7971, Train Acc: 0.6201, Val Acc: 0.6536
Epoch 11, Train Loss: 0.7526, Val Loss: 0.7476, Train Acc: 0.6772, Val Acc: 0.6736
Epoch 12, Train Loss: 0.7059, Val Loss: 0.7137, Train Acc: 0.7071, Val Acc: 0.6971
Epoch 13, Tra

KeyboardInterrupt: 

In [None]:
# Evaluate
# Restore the best checkpoint written by the training loop. `map_location`
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU machine can still be loaded when only a CPU is available.
model.load_state_dict(torch.load("best_gru_model.pt", map_location=device))
model.eval()

def evaluate_model(loader, model, device, label_encoder):
    """Predict every batch in `loader` and summarise the results.

    Args:
        loader: iterable yielding (features, labels) tensor batches.
        model: network returning one row of class logits per sample.
        device: device the batches are moved to before the forward pass.
        label_encoder: fitted encoder whose `classes_` name the classes.

    Returns:
        Tuple `(y_true, y_pred, report, cm)`: the true and predicted label
        lists, scikit-learn's classification report as a dict, and the
        confusion matrix.
    """
    y_true = []
    y_pred = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(y_batch.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # Pass the full label set explicitly. Otherwise scikit-learn infers the
    # labels from the data, so a class that is absent from both y_true and
    # y_pred would make `target_names` mismatch and shrink the confusion
    # matrix below the size the plotting code expects.
    labels = list(range(len(label_encoder.classes_)))
    report = classification_report(
        y_true,
        y_pred,
        labels=labels,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0,
    )
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return y_true, y_pred, report, cm

def _print_split_metrics(header, report_header, accuracy, precision, recall, f1, y_true, y_pred):
    """Print the headline macro metrics, then the full per-class report."""
    print(header)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision (macro): {precision:.4f}")
    print(f"Recall (macro): {recall:.4f}")
    print(f"F1-Score (macro): {f1:.4f}")
    print(report_header)
    print(classification_report(y_true, y_pred, target_names=label_encoder.classes_, zero_division=0))


# Training set evaluation
train_true, train_pred, train_report, _ = evaluate_model(train_loader, model, device, label_encoder)
train_macro = train_report['macro avg']
train_accuracy = train_report['accuracy']
train_precision, train_recall, train_f1 = (
    train_macro['precision'],
    train_macro['recall'],
    train_macro['f1-score'],
)

# Validation set evaluation (the confusion matrix is kept for plotting)
val_true, val_pred, val_report, val_cm = evaluate_model(val_loader, model, device, label_encoder)
val_macro = val_report['macro avg']
val_accuracy = val_report['accuracy']
val_precision, val_recall, val_f1 = (
    val_macro['precision'],
    val_macro['recall'],
    val_macro['f1-score'],
)

# Print metrics
_print_split_metrics(
    "\nTraining Set Metrics:",
    "\nTraining Classification Report:",
    train_accuracy, train_precision, train_recall, train_f1,
    train_true, train_pred,
)
_print_split_metrics(
    "\nValidation Set Metrics:",
    "\nValidation Classification Report:",
    val_accuracy, val_precision, val_recall, val_f1,
    val_true, val_pred,
)

In [None]:
# Confusion matrix for validation set
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    val_cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
ax_cm.set_title("Validation Confusion Matrix")
plt.show()

# Training history plots: loss on the left, accuracy on the right
fig_hist, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 4))

ax_loss.plot(train_losses, label="Train Loss")
ax_loss.plot(val_losses, label="Val Loss")
ax_loss.set_title("Loss")
ax_loss.set_xlabel("Epoch")
ax_loss.set_ylabel("Loss")
ax_loss.legend()

ax_acc.plot(train_accuracies, label="Train Accuracy")
ax_acc.plot(val_accuracies, label="Val Accuracy")
ax_acc.set_title("Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.set_ylabel("Accuracy")
ax_acc.legend()

fig_hist.tight_layout()
plt.show()

In [None]:
# Example prediction
new_tweet = "love new seats virgin america"
new_vectorized = vectorizer.transform([new_tweet])

# The model consumes seq_length * feature_dim values per sample. Slice to that
# width before reshaping so the reshape cannot fail when the vocabulary size
# is not an exact multiple of seq_length (the slice is a no-op when it is).
model_width = seq_length * feature_dim
new_reshaped = new_vectorized[:, :model_width].reshape(1, seq_length, feature_dim)
new_tensor = torch.tensor(new_reshaped, dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    output = model(new_tensor)
    _, predicted = torch.max(output, 1)

# Map the predicted class index back to its sentiment name.
predicted_label = label_encoder.inverse_transform([predicted.item()])[0]
print(f"\nExample Tweet: {new_tweet}")
print(f"Predicted Sentiment: {predicted_label}")