In [5]:
import re
from itertools import cycle

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, f1_score, recall_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import torch
from transformers import BertTokenizer, BertModel
import hf_xet

from textblob import Word
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from autocorrect import Speller
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# ---------------------------------------------
# run the following only once to download the nltk data
# ---------------------------------------------
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt_tab')

In [4]:
# ---------------------------------------------
# preprocess text function
# ---------------------------------------------
# Shorthand seen in airline tweets, expanded word-by-word before cleaning.
_ABBREVIATIONS = {
    "u": "you", "bked": "booked", "thx": "thanks", "plz": "please",
    "sfo": "san francisco airport", "lax": "los angeles airport",
    "nyc": "new york city", "bos": "boston", "las": "las vegas",
    "dal": "dallas", "dca": "washington, d.c.", "lg": "likely good"
}

# English contractions, keyed with a straight apostrophe (').
_CONTRACTIONS = {
    "ain't": "am not", "aren't": "are not", "can't": "cannot", "can't've": "cannot have",
    "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
    "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
    "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is",
    "i'd": "i would", "i'll": "i will", "i'm": "i am", "i've": "i have", "isn't": "is not",
    "it'd": "it would", "it'll": "it will", "it's": "it is", "let's": "let us",
    "ma'am": "madam", "might've": "might have", "mightn't": "might not", "must've": "must have",
    "mustn't": "must not", "needn't": "need not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "that'd": "that would", "that's": "that is", "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who's": "who is", "who've": "who have", "won't": "will not",
    "would've": "would have", "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
    "you're": "you are", "you've": "you have"
}

# Filled on first use so the speller and stopword set are built only once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(speller, stopword_set)``, building both on the first call."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failure leaves the cache empty.
        speller = Speller(lang='en')
        stopword_set = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES.update(speller=speller, stopwords=stopword_set)
    return _PREPROCESS_RESOURCES["speller"], _PREPROCESS_RESOURCES["stopwords"]


def _expand_words(text, mapping):
    """Replace every whitespace-separated word that is a key of ``mapping``."""
    return " ".join(mapping.get(word, word) for word in text.split())


def _describe_symbols(text, table):
    """Replace each key of ``table`` found in ``text`` by its textual description."""
    for symbol, description in table.items():
        if symbol in text:
            label = description.replace(":", "").replace(",", "").replace("_", " ")
            text = text.replace(symbol, " " + label + " ").lower()
    return text


def preprocess_text(text):
    """Clean a tweet and return it as a list of tokens.

    Steps, in order: lowercase; normalise the typographic apostrophe to a
    straight one; drop URLs and @mentions; expand abbreviations and
    contractions; replace emoji and emoticons by their descriptions; strip
    punctuation; remove English stopwords; spell-correct; lemmatize; tokenize.

    Args:
        text: The raw tweet. ``None``, a float NaN, or a blank string are
            treated as "no text".

    Returns:
        list: The tokens produced by ``word_tokenize`` on the cleaned text,
        or ``[]`` when there is no text.
    """
    # Missing values (None / NaN) have no .lower(); treat them as empty input.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not text.strip():
        return []

    spell, english_stopwords = _get_preprocess_resources()

    text = text.lower()
    # The contraction table uses a straight apostrophe; without this,
    # words typed with a curly one ("don’t") would never be expanded.
    text = text.replace("\u2019", "'")
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\S+', '', text)

    text = _expand_words(text, _ABBREVIATIONS)
    text = _expand_words(text, _CONTRACTIONS)

    text = _describe_symbols(text, UNICODE_EMOJI)
    text = _describe_symbols(text, EMOTICONS_EMO)

    text = re.sub(r'[^\w\s]', '', text)
    text = " ".join(x for x in text.split() if x.lower() not in english_stopwords)
    text = ' '.join(spell(word) for word in text.split())
    text = " ".join(Word(word).lemmatize() for word in text.split())
    tokens = word_tokenize(text)

    return tokens


# ---------------------------------------------
# evaluate model function
# ---------------------------------------------
# Display order used for the confusion matrix rows and columns.
_CONFUSION_LABELS = ['positive', 'negative', 'neutral']
_BAR_COLORS = ['blue', 'green', 'red', 'purple']


def _weighted_scores(y_true, y_pred):
    """Return accuracy and weighted precision/recall/F1, each as a percentage."""
    return {
        'Accuracy': accuracy_score(y_true, y_pred) * 100,
        'Precision': precision_score(y_true, y_pred, average='weighted') * 100,
        'Recall': recall_score(y_true, y_pred, average='weighted') * 100,
        'F1-score': f1_score(y_true, y_pred, average='weighted') * 100,
    }


def _print_scores(split_name, scores):
    """Print one split's scores with two decimals."""
    print(f"\n{split_name} Metrics:")
    print(f"Accuracy: {scores['Accuracy']:.2f}%")
    print(f"Precision: {scores['Precision']:.2f}%")
    print(f"Recall: {scores['Recall']:.2f}%")
    print(f"F1-Score: {scores['F1-score']:.2f}%")


def _plot_score_bars(ax, title, scores):
    """Draw one bar per metric on ``ax`` and annotate each bar with its value."""
    labels = list(scores.keys())
    values = list(scores.values())
    ax.bar(labels, values, color=_BAR_COLORS)
    ax.set_title(title)
    ax.set_ylim(0, 100)
    ax.set_ylabel('Score (%)')
    ax.grid(True, alpha=0.3)
    for position, value in enumerate(values):
        ax.text(position, value + 1, f'{value:.2f}%', ha='center')


def _plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix as a heatmap in the fixed label order."""
    cm = confusion_matrix(y_true, y_pred, labels=_CONFUSION_LABELS)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=_CONFUSION_LABELS, yticklabels=_CONFUSION_LABELS)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


def _class_scores(model, X, n_classes):
    """Return an (n_samples, n_classes) score array, or None if unavailable."""
    if hasattr(model, "predict_proba"):
        return np.asarray(model.predict_proba(X))
    if hasattr(model, "decision_function"):
        # Only one-vs-rest scores have one column per class.
        if getattr(model, "decision_function_shape", "ovr") == "ovr":
            scores = np.asarray(model.decision_function(X))
            if scores.ndim == 2 and scores.shape[1] == n_classes:
                return scores
    return None


def _plot_roc_curves(model, X_test, y_test):
    """Plot a one-vs-rest ROC curve for each class the model knows."""
    # Score columns follow model.classes_, so curves are labelled from it
    # instead of from a hard-coded list in a different order.
    classes = getattr(model, "classes_", None)
    if classes is None:
        classes = np.unique(np.asarray(y_test))
    classes = np.asarray(classes)

    y_score = _class_scores(model, X_test, len(classes))
    if y_score is None:
        print("ROC curves skipped: model exposes no per-class scores "
              "(predict_proba or one-vs-rest decision_function).")
        return

    y_true = np.asarray(y_test)
    plt.figure(figsize=(6, 4))
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for column, (label, color) in enumerate(zip(classes, colors)):
        is_label = (y_true == label)
        # A ROC curve is undefined when the class is absent or is the only one.
        if is_label.all() or not is_label.any():
            continue
        fpr, tpr, _ = roc_curve(is_label.astype(int), y_score[:, column])
        plt.plot(fpr, tpr, color=color, lw=2,
                 label=f'ROC curve for {label} (AUC = {auc(fpr, tpr):.2f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves for Multi-Class Sentiment Analysis')
    plt.legend(loc="lower right")
    plt.show()


def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Print and plot train/test metrics for a fitted classifier.

    Prints accuracy and weighted precision/recall/F1 for both splits and the
    test classification report, then shows three figures: metric bar charts,
    the test confusion matrix, and one-vs-rest ROC curves.

    Args:
        model: A fitted classifier exposing ``predict``. ROC curves need
            ``predict_proba`` or a one-vs-rest ``decision_function``; if
            neither is available the ROC figure is skipped with a message.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are printed or displayed.
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    metrics = {
        'Training': _weighted_scores(y_train, y_train_pred),
        'Testing': _weighted_scores(y_test, y_test_pred),
    }
    print(metrics)
    for split_name, scores in metrics.items():
        _print_scores(split_name, scores)

    print(classification_report(y_test, y_test_pred))

    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for ax, (split_name, scores) in zip(axes, metrics.items()):
        _plot_score_bars(ax, f'{split_name} Metrics', scores)
    plt.tight_layout()
    plt.show()

    _plot_confusion_matrix(y_test, y_test_pred)
    _plot_roc_curves(model, X_test, y_test)

In [6]:
# Read the tweets CSV and hold out 10% of the rows as the test set.
df = pd.read_csv('../../../data/clean_Tweets.csv')

X, y = df['text'], df['airline_sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=19,
)

In [None]:
# vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)


# w2v_model = Word2Vec(sentences=[text.split() for text in X_train], vector_size=100, window=5, min_count=1, workers=4)
# def text_to_vec(text, model):
#     words = text.split()
#     word_vecs = [model.wv[word] for word in words if word in model.wv]
#     if len(word_vecs) == 0:
#         return np.zeros(model.vector_size)
#     return np.mean(word_vecs, axis=0)
# X_train_w2v = np.array([text_to_vec(text, w2v_model) for text in X_train])
# X_test_w2v = np.array([text_to_vec(text, w2v_model) for text in X_test])

In [None]:
# Load the pretrained BERT tokenizer and encoder.
# NOTE(review): `use_xet` is not a documented `from_pretrained` argument, and
# unrecognised keyword arguments appear to be forwarded to the model
# constructor, so it is no longer passed. Confirm against the installed
# transformers version.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def text_to_bert_vec(text, tokenizer, model):
    """Embed text as the mean of the encoder's last hidden states.

    Args:
        text: A string, or a list of strings to embed as one padded batch.
        tokenizer: Callable returning a mapping of PyTorch tensors; called
            with truncation and padding enabled and ``max_length=128``.
        model: Callable whose result exposes ``last_hidden_state`` with the
            token axis at dimension 1.

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions squeezed
        out (a 1-D vector for a single text).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # Run the inputs on the same device as the model when it reports one.
    device = getattr(model, "device", None)
    inputs = {
        name: (tensor.to(device) if device is not None else tensor)
        for name, tensor in encoded.items()
    }

    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    mask = inputs.get("attention_mask")
    if mask is None:
        pooled = hidden.mean(dim=1)
    else:
        # Average over real tokens only, so padding in a batch does not
        # dilute the embedding of the shorter texts.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    # .cpu() makes the NumPy conversion work for models running on a GPU.
    return pooled.squeeze().cpu().numpy()

# Embed every training and test text with BERT, one text at a time.
# Expect this cell to be slow on large datasets or modest hardware.
def _embed_all(texts):
    """Stack the BERT vector of each text into a single array."""
    return np.array([text_to_bert_vec(text, tokenizer, model) for text in texts])


X_train_bert = _embed_all(X_train)
X_test_bert = _embed_all(X_test)

In [6]:
# Balance the training classes by oversampling with SMOTE.
# Only the training embeddings are resampled.
smote = SMOTE(random_state=19)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_bert,
    y_train,
)

# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)
# X_train_resampled, y_train_resampled = rus.fit_resample(X_train_bert, y_train)
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`

In [None]:
# Bar chart of how many training samples each sentiment has after resampling.
sentiment_counts = y_train_resampled.value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
sentiment_counts.plot(kind='bar', ax=ax, rot=0)
ax.set_xlabel('Airline Sentiment')
ax.set_ylabel('Number of Tweets')
ax.set_title('Distribution of Airline Sentiments')
fig.tight_layout()
plt.show()

In [None]:
# BERT embeddings are real-valued and include negative components, which
# MultinomialNB rejects at fit time. The Naive Bayes baseline therefore uses
# GaussianNB, which models continuous features.
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train_resampled, y_train_resampled)

logistic_regression_classifier = LogisticRegression(random_state=42, max_iter=1000)
logistic_regression_classifier.fit(X_train_resampled, y_train_resampled)

support_vector_machine_classifier = SVC(kernel='linear', random_state=42)
support_vector_machine_classifier.fit(X_train_resampled, y_train_resampled)



In [None]:
# Logistic regression trained on the resampled BERT embeddings.
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)
lr_classifier.fit(
    X_train_resampled,
    y_train_resampled,
)

In [None]:
# Example tweets to classify; a prediction is printed for each one.
example_texts = [
    "great flight amazing service",
    "worst flight ever",
    "I am going to my school",
]

for text in example_texts:
    # Kept for inspection only: the classifier input is the BERT embedding of
    # the text as given, matching how the training texts were embedded.
    text_tokenized = preprocess_text(text)

    # The classifiers were trained on BERT embeddings, so the example must be
    # embedded the same way; predict() expects a 2-D (1, n_features) array.
    text_bert = text_to_bert_vec(text, tokenizer, model).reshape(1, -1)
    prediction = lr_classifier.predict(text_bert)

    print(f"\nExample Tweet: '{text}'")
    print(f"Predicted Sentiment: {prediction[0]}")