In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import torch
# from transformers import BertTokenizer, BertModel

import re
from textblob import Word
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from autocorrect import Speller

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import cycle

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# ---------------------------------------------
# run the following only once to download the nltk data
# ---------------------------------------------
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')

In [None]:
# ---------------------------------------------
# preprocess text function
# ---------------------------------------------
# Airline/tweet shorthand mapped to its expansion (matched against lower-cased tokens).
_ABBREVIATIONS = {
    "u": "you", "bked": "booked", "thx": "thanks", "plz": "please",
    "sfo": "san francisco airport", "lax": "los angeles airport",
    "nyc": "new york city", "bos": "boston", "las": "las vegas",
    "dal": "dallas", "dca": "washington, d.c.", "lg": "likely good"
}

# English contractions mapped to their expanded form (matched against lower-cased tokens).
_CONTRACTIONS = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
    "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'll": "it will", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "that'd": "that would", "that's": "that is", "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who's": "who is", "who've": "who have", "won't": "will not",
    "would've": "would have", "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
    "you're": "you are", "you've": "you have"
}

# Typographic apostrophes are folded to ASCII so "can’t" matches the "can't" key.
_APOSTROPHES = str.maketrans({"\u2018": "'", "\u2019": "'"})

# Splits a token into (leading punctuation, core, trailing punctuation); apostrophes stay in the core.
_TOKEN_EDGES = re.compile(r"^([^\w']*)(.*?)([^\w']*)$", re.DOTALL)

# Lazily filled cache for the objects that are expensive to build on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return (spell checker, stop-word set), building them only on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["spell"] = Speller(lang='en')
        _PREPROCESS_CACHE["stopwords"] = frozenset(stopwords.words("english"))
    return _PREPROCESS_CACHE["spell"], _PREPROCESS_CACHE["stopwords"]


def _expand_token(token, mapping):
    """Replace `token` by `mapping[token]`, also when punctuation is glued to its edges."""
    if token in mapping:
        return mapping[token]
    lead, core, trail = _TOKEN_EDGES.match(token).groups()
    if core in mapping:
        # Keep the surrounding punctuation so a trailing emoticon/emoji is still seen later.
        return lead + mapping[core] + trail
    return token


def _replace_symbols(text, table):
    """Replace every key of `table` found in `text` by its description, lower-cased."""
    for symbol, description in table.items():
        if symbol in text:
            words = description.replace(":", "").replace(",", "").replace("_", " ")
            text = text.replace(symbol, " " + words + " ").lower()
    return text


def preprocess_text(text):
    """Normalise one tweet and return its tokens.

    Steps, in order: lower-case; drop URLs and @mentions; expand abbreviations
    and contractions; turn emoji/emoticons into words; strip punctuation;
    remove English stop words; spell-correct; lemmatize; tokenize.

    Args:
        text: the raw tweet as a string.

    Returns:
        The list of tokens produced by ``word_tokenize`` on the cleaned text.
    """
    spell, english_stopwords = _preprocess_resources()

    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    text = text.translate(_APOSTROPHES)

    # Abbreviations first, then contractions, each on a fresh whitespace split.
    for mapping in (_ABBREVIATIONS, _CONTRACTIONS):
        text = " ".join(_expand_token(word, mapping) for word in text.split())

    # NOTE(review): the text is already lower-cased here, so any emoticon key that
    # contains capital letters (e.g. ":D", if EMOTICONS_EMO has it) cannot match.
    # Matching before lower-casing would instead hit substrings of ALL-CAPS words,
    # so the original order is kept — confirm which behaviour is wanted.
    text = _replace_symbols(text, UNICODE_EMOJI)
    text = _replace_symbols(text, EMOTICONS_EMO)

    text = re.sub(r'[^\w\s]', '', text)
    text = " ".join(x for x in text.split() if x.lower() not in english_stopwords)
    text = ' '.join(spell(word) for word in text.split())
    text = " ".join(Word(word).lemmatize() for word in text.split())
    tokens = word_tokenize(text)

    return tokens


# ---------------------------------------------
# evaluate model function
# ---------------------------------------------
def _classification_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) as fractions; the last three are support-weighted."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


def _class_probabilities(model, X):
    """Return one score column per model class, using a softmax over decision values if needed."""
    if hasattr(model, 'predict_proba'):
        return np.asarray(model.predict_proba(X))

    scores = np.asarray(model.decision_function(X), dtype=float)
    if scores.ndim == 1:
        # Binary models return a single margin; mirror it so each class has a column.
        scores = np.column_stack([-scores, scores])
    # Subtracting the row maximum leaves the softmax unchanged but prevents overflow in exp().
    scores = scores - scores.max(axis=1, keepdims=True)
    exp_scores = np.exp(scores)
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)


def evaluate_model(model, X_train, X_test, y_train, y_test, labels=('positive', 'negative', 'neutral')):
    """Print and plot train/test metrics, the test confusion matrix and per-class ROC curves.

    Args:
        model: a fitted classifier exposing ``predict`` and either ``predict_proba``
            or ``decision_function``.
        X_train, X_test: feature matrices for the training and test splits.
        y_train, y_test: true labels for the training and test splits.
        labels: class labels, in the order they should appear in the confusion
            matrix and in the ROC legend.

    Returns:
        None. Results are printed and shown as matplotlib figures.
    """
    labels = list(labels)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_scores = _classification_scores(y_train, y_train_pred)
    test_scores = _classification_scores(y_test, y_test_pred)

    printed_names = ('Accuracy', 'Precision', 'Recall', 'F1-Score')

    print("\nMetrics Differences (Training - Testing):")
    for name, train_value, test_value in zip(printed_names, train_scores, test_scores):
        print(f"{name} Diff: {(train_value - test_value)*100:.2f}%")

    for heading, scores in (("Training", train_scores), ("Testing", test_scores)):
        print(f"\n{heading} Metrics:")
        for name, value in zip(printed_names, scores):
            print(f"{name}: {value*100:.2f}%")

    # Side-by-side bar charts of the four metrics, in percent.
    chart_labels = ['Accuracy', 'Precision', 'Recall', 'F1-score']
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    panels = zip(axes, ('Training Metrics', 'Testing Metrics'), (train_scores, test_scores))
    for ax, title, scores in panels:
        percentages = [score * 100 for score in scores]
        ax.bar(chart_labels, percentages, color=['blue', 'green', 'red', 'purple'])
        ax.set_title(title)
        ax.set_ylim(0, 100)
        ax.set_ylabel('Score (%)')
        ax.grid(True, alpha=0.3)
        for i, v in enumerate(percentages):
            ax.text(i, v + 1, f'{v:.2f}%', ha='center')

    plt.tight_layout()
    plt.show()

    cm = confusion_matrix(y_test, y_test_pred, labels=labels)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    y_score = _class_probabilities(model, X_test)

    # Score columns follow the model's own class order (model.classes_), which is
    # not the display order in `labels`, so each column is looked up by label.
    model_classes = list(getattr(model, 'classes_', np.unique(y_train)))
    y_test_arr = np.asarray(y_test)

    plt.figure(figsize=(6, 4))
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for label, color in zip(labels, colors):
        if label not in model_classes:
            # The model was never trained on this label, so it has no score column.
            continue
        column = model_classes.index(label)
        is_label = (y_test_arr == label).astype(int)
        fpr, tpr, _ = roc_curve(is_label, y_score[:, column])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color, lw=2, label=f'ROC curve for {label} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Multi-Class Sentiment Analysis')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Read the cleaned tweets, then pull out the tweet text (features) and the sentiment label (target).
df = pd.read_csv("../../../data/clean_Tweets.csv")
X, y = df["text"], df["airline_sentiment"]

In [None]:
# -------------------------------------------------
# GloVe PyTorch
# -------------------------------------------------
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)
# max_words = 10000
# max_len = 50
# tokenizer = word_tokenize
# def build_vocab(texts, max_words):
#     word_freq = {}
#     for text in texts:
#         for word in tokenizer(text):
#             word_freq[word] = word_freq.get(word, 0) + 1
#     vocab = {word: idx + 1 for idx, (word, _) in enumerate(sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:max_words-1])}
#     vocab["<OOV>"] = 0
#     return vocab
# vocab = build_vocab(X, max_words)
# X_sequences = [[vocab.get(word, vocab["<OOV>"]) for word in tokenizer(text)][:max_len] for text in X]
# X_padded = [seq + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in X_sequences]
# X_padded = np.array(X_padded)
# embedding_dim = 100
# embeddings_index = {}
# with open("../../../data/glove.6B.100d.txt", encoding="utf-8") as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype="float32")
#         embeddings_index[word] = coefs
# embedding_matrix = np.zeros((max_words, embedding_dim))
# for word, idx in vocab.items():
#     if idx < max_words:
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[idx] = embedding_vector
# embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)


# -------------------------------------------------
# GloVe TensorFlow
# -------------------------------------------------
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.preprocessing import LabelEncoder
# data = pd.read_csv("clean_Tweets.csv")
# X = data["text"]
# y = data["airline_sentiment"]
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)
# max_words = 10000
# max_len = 50
# tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# tokenizer.fit_on_texts(X)
# X_sequences = tokenizer.texts_to_sequences(X)
# X_padded = tokenizer.texts_to_sequences(X)
# X_padded = pad_sequences(X_sequences, maxlen=max_len, padding="post", truncating="post")
# embedding_dim = 100
# embeddings_index = {}
# with open("glove.6B.100d.txt", encoding="utf-8") as f:
#     for line in f:
#         values = line.split()
#         word = values[0]
#         coefs = np.asarray(values[1:], dtype="float32")
#         embeddings_index[word] = coefs
# embedding_matrix = np.zeros((max_words, embedding_dim))
# for word, i in tokenizer.word_index.items():
#     if i < max_words:
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector


# -------------------------------------------------
# TF-IDF Vectorizer
# -------------------------------------------------
# vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# X_vectorized = vectorizer.fit_transform(X)


# -------------------------------------------------
# Word2Vec
# -------------------------------------------------
# w2v_model = Word2Vec(sentences=[text.split() for text in X], vector_size=100, window=5, min_count=1, workers=4)
# def text_to_vec(text, model):
#     words = text.split()
#     word_vecs = [model.wv[word] for word in words if word in model.wv]
#     if len(word_vecs) == 0:
#         return np.zeros(model.vector_size)
#     return np.mean(word_vecs, axis=0)
# X_vectorized = np.array([text_to_vec(text, w2v_model) for text in X])


# -------------------------------------------------
# BERT Embeddings
# -------------------------------------------------
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')
# def text_to_bert_vec(text, tokenizer, model):
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
# X_vectorized = np.array([text_to_bert_vec(text, tokenizer, model) for text in X])
# pd.DataFrame(X_vectorized).to_csv('../../../vector_data/X_bert_embeddings.csv', index=False)


# -------------------------------------------------
# N-gram Features
# -------------------------------------------------
# bigram_vectorizer = CountVectorizer(
#     ngram_range=(2,2),
#     max_features=20000,
#     stop_words='english'
# )
# trigram_vectorizer = CountVectorizer(
#     ngram_range=(3,3),
#     max_features=20000,
#     stop_words='english'
# )
# X_bigram = bigram_vectorizer.fit_transform(X)
# X_trigram = trigram_vectorizer.fit_transform(X)
# X_vectorized = np.hstack((X_bigram.toarray(), X_trigram.toarray()))
# pd.DataFrame(X_vectorized).to_csv('../../../vector_data/X_ngram_features.csv', index=False)

In [None]:
# -------------------------------------------------
# TF-IDF with N-grams Vectorizer
# -------------------------------------------------
# TF-IDF converts each text into a numeric feature vector of term-frequency x inverse-document-frequency scores
# max_features: cap the vocabulary at the N highest-ranked terms (e.g. 20000 or 10000 in the reference calls below)
# token_pattern=r"(?u)\b\w+\b": a token is any run of word characters (letters, digits, underscore), so one-character words are kept
# ngram_range=(1,2): build features from single words and pairs of consecutive words; (1,3) also adds triplets
# vectorizer = TfidfVectorizer(max_features=20000, token_pattern=r"(?u)\b\w+\b", ngram_range=(1,2))

# vectorizer = TfidfVectorizer(
#     max_features=10000,
#     token_pattern=r"(?u)\b\w+\b",
#     ngram_range=(1,3)
# )
# X_vectorized = vectorizer.fit_transform(X)

from collections import defaultdict, Counter

class TF_IDF_with_N_Grams_Vectorizer:
    """Minimal TF-IDF vectorizer over word n-grams that returns a dense matrix.

    The vocabulary keeps the ``max_features`` n-grams with the highest document
    frequency; ties are broken alphabetically so the column order is the same
    on every run. Each cell holds ``(count / n-grams in the document) * idf``
    with ``idf = ln((1 + N) / (1 + df)) + 1``.
    """

    def __init__(self, max_features=20000, token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 3)):
        """Store the settings; ``vocabulary_`` and ``idf_`` stay empty until ``fit``."""
        self.max_features = max_features
        self.token_pattern = re.compile(token_pattern)
        self.ngram_range = ngram_range
        self.vocabulary_ = {}  # term -> column index
        self.idf_ = {}         # term -> inverse document frequency

    def _tokenize(self, text):
        """Lower-case `text` and return every match of the token pattern."""
        return self.token_pattern.findall(text.lower())

    def _generate_ngrams(self, tokens):
        """Return all space-joined n-grams of `tokens` for each n in ``ngram_range``."""
        min_n, max_n = self.ngram_range
        return [
            ' '.join(tokens[i:i + n])
            for n in range(min_n, max_n + 1)
            for i in range(len(tokens) - n + 1)
        ]

    def fit(self, raw_documents):
        """Learn the vocabulary and IDF weights from an iterable of strings; return self."""
        # Materialise once so generators work and the document count is known.
        documents = list(raw_documents)
        doc_count = len(documents)

        doc_freq = Counter()
        for doc in documents:
            # A set counts each term at most once per document.
            doc_freq.update(set(self._generate_ngrams(self._tokenize(doc))))

        # Rank by document frequency, then alphabetically. Without the second key
        # the order of equally frequent terms depends on set iteration order,
        # which varies between interpreter runs.
        ranked = sorted(doc_freq.items(), key=lambda item: (-item[1], item[0]))
        ranked = ranked[:self.max_features]

        self.vocabulary_ = {term: idx for idx, (term, _) in enumerate(ranked)}
        self.idf_ = {
            term: np.log((1 + doc_count) / (1 + freq)) + 1.0
            for term, freq in ranked
        }

        return self

    def transform(self, raw_documents):
        """Return a dense float32 array of shape (n_documents, n_vocabulary_terms)."""
        documents = list(raw_documents)
        X = np.zeros((len(documents), len(self.vocabulary_)), dtype=np.float32)

        for doc_idx, doc in enumerate(documents):
            ngrams = self._generate_ngrams(self._tokenize(doc))
            if not ngrams:
                # No tokens: the row stays all zeros.
                continue
            total = len(ngrams)
            for term, count in Counter(ngrams).items():
                column = self.vocabulary_.get(term)
                if column is not None:
                    X[doc_idx, column] = (count / total) * self.idf_[term]

        return X

    def fit_transform(self, raw_documents):
        """Fit on `raw_documents` and return their TF-IDF matrix."""
        # Materialise first so a one-shot iterable is not exhausted by fit().
        documents = list(raw_documents)
        self.fit(documents)
        return self.transform(documents)


# vectorizer = TF_IDF_with_N_Grams_Vectorizer(max_features=20000, ngram_range=(1, 2))
vectorizer = TF_IDF_with_N_Grams_Vectorizer(max_features=5000, ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)

# Print a summary rather than the whole vocabulary dict, which has thousands of entries.
print("Vocabulary size:", len(vectorizer.vocabulary_))
print("Sample terms:", list(vectorizer.vocabulary_)[:10])
print("TF-IDF matrix shape:", X_vectorized.shape)

# Class balancing is done in the following cell; repeating it here only
# duplicated that work and held a second copy of the dense matrix in memory.

In [None]:
# Load BERT embeddings from CSV
# X = pd.read_csv('../../../vector_data/X_bert_embeddings.csv')
# X = pd.read_csv('../../../vector_data/X_ngram_features.csv')

# handling data imbalance
# Split BEFORE resampling: oversampling the full dataset first puts copies of the
# same tweet in both the training and the test set, which inflates test metrics.
# stratify=y keeps the original class proportions in the held-out 10 %.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.1, random_state=42, stratify=y
)

# Balance the training portion only; the test set keeps its original distribution.
# sampler = SMOTE(random_state=42)
# sampler = RandomUnderSampler(random_state=42)
sampler = RandomOverSampler(random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

In [None]:
# Number of training samples per sentiment class (y_train is wrapped in a Series
# so this works whether it is a Series or a plain array).
sentiment_counts = pd.Series(y_train).value_counts()

plt.figure(figsize=(6, 4))
sentiment_counts.plot.bar()
plt.title('Distribution of Airline Sentiments')
plt.xlabel('Airline Sentiment')
plt.ylabel('Number of Tweets')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# logistic_regression_classifier = LogisticRegression(random_state=42, max_iter=2000)
# logistic_regression_classifier.fit(X_train, y_train)
# evaluate_model(logistic_regression_classifier, X_train, X_test, y_train, y_test)

In [None]:
# Fit a multinomial Naive Bayes model on the training features, then report its metrics.
naive_bayes_classifier = MultinomialNB().fit(X_train, y_train)
evaluate_model(naive_bayes_classifier, X_train, X_test, y_train, y_test)

In [None]:
# Random Forest with Grid Search
# Candidate settings: forest size and maximum tree depth (None = unlimited).
rf_param_grid = dict(n_estimators=[100, 200], max_depth=[10, 20, None])
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation over the grid, scored by macro F1, using all CPU cores.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
best_rf = rf_grid.best_estimator_
print("\nBest Random Forest Parameters:", rf_grid.best_params_)

# Evaluate Random Forest
evaluate_model(best_rf, X_train, X_test, y_train, y_test)

In [None]:
# Fit a linear-kernel SVM on the training features, then report its metrics.
support_vector_machine_classifier = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
evaluate_model(support_vector_machine_classifier, X_train, X_test, y_train, y_test)

In [None]:
# # Example text for prediction
# example_text = "This flight was terrible, I will never fly with them again!"
# example_text = "I'm so happy with the service, I will definitely fly with them again!"
# example_text = "I'm am going to have my lunch now"

# # Vectorize the example text using the same vectorizer
# example_vectorized = vectorizer.transform([example_text])

# # Make prediction
# # Logistic Regression predictions
# lr_prediction = logistic_regression_classifier.predict(example_vectorized)
# lr_probability = logistic_regression_classifier.predict_proba(example_vectorized)

# # SVM predictions 
# svm_prediction = support_vector_machine_classifier.predict(example_vectorized)

# # Naive Bayes predictions
# nb_prediction = naive_bayes_classifier.predict(example_vectorized)
# nb_probability = naive_bayes_classifier.predict_proba(example_vectorized)

# print(f"\nExample text: {example_text}")
# print("\nLogistic Regression:")
# print(f"Predicted sentiment: {lr_prediction[0]}")
# print(f"Prediction probabilities: {lr_probability[0]}")

# print("\nSupport Vector Machine:")
# print(f"Predicted sentiment: {svm_prediction[0]}")

# print("\nNaive Bayes:")
# print(f"Predicted sentiment: {nb_prediction[0]}")
# print(f"Prediction probabilities: {nb_probability[0]}")