In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# Fetch the NLTK resources this script relies on (no-op when already cached).
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load the dataset, keep only the tweet text and its label, and expose the
# label column under the shorter name 'sentiment'.
data = (
    pd.read_csv('/Users/nishaantsoni/Downloads/archive/Tweets.csv')  # Replace with your dataset path
    .loc[:, ['text', 'airline_sentiment']]
    .rename(columns={'airline_sentiment': 'sentiment'})
)
# Preprocess Text
# Built once at definition time: the stop-word set is the same for every
# document, so there is no reason to rebuild it on each preprocess_text call.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one raw document into a space-joined string of content words.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``; tokens
    that are not purely alphabetic (``str.isalpha``) or that are English stop
    words are dropped.

    Args:
        text: Raw document text. Non-string values (for example NaN from a
            missing CSV cell) are treated as an empty document.

    Returns:
        The surviving tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on ``.lower()``.
        return ''
    tokens = word_tokenize(text.lower())
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in _ENGLISH_STOP_WORDS
    )

data['clean_text'] = data['text'].apply(preprocess_text)
y = data['sentiment']

# Split Data
# Split the cleaned text *before* vectorizing so that the TF-IDF vocabulary and
# IDF weights are learned from the training rows only. Fitting the vectorizer
# on the full corpus would leak test-set statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_text'], y, test_size=0.2, random_state=42
)

# Feature Extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)  # fit on train only
X_test = tfidf_vectorizer.transform(text_test)        # reuse train vocabulary

# Full-corpus feature matrix, kept so the name `X` is still defined. It is
# built with the train-fitted vectorizer and must not be used for fitting.
X = tfidf_vectorizer.transform(data['clean_text'])

# Hyperparameter search spaces, keyed by model display name. Each inner mapping
# goes from an estimator parameter name to the list of candidate values.
param_grids = {
    'Logistic Regression': dict(
        solver=['liblinear', 'lbfgs'],
        C=list(range(7, 11)),
    ),
    'Support Vector Machine': dict(
        kernel=['poly', 'rbf'],
        C=[3],
        gamma=['scale', 'auto'],
    ),
    'Random Forest': dict(
        n_estimators=[195, 200, 205],
        max_depth=[None, 1],
        min_samples_split=[2],
    ),
}

# Define Models
# Untuned estimators, keyed by the display names used in `param_grids`.
# Every estimator is seeded with the same value as the train/test split so
# repeated runs give the same fitted models: the random forest's bootstrap and
# feature sampling and the SVC probability calibration are otherwise random.
base_models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, random_state=42),
    # probability=True is required for soft voting (predict_proba).
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# Train and Evaluate Models
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     print(f"--- {model_name} Evaluation ---")
#     print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
#     print(f"Precision: {precision_score(y_test, y_pred, average='macro', zero_division=0):.4f}")
#     print(f"Recall:    {recall_score(y_test, y_pred, average='macro', zero_division=0):.4f}")
#     print(f"F1 Score:  {f1_score(y_test, y_pred, average='macro', zero_division=0):.4f}")
#     print(classification_report(y_test, y_pred, zero_division=0))
#     print("----------------------------\n")

# Tune every base model once, report its held-out performance, and keep the
# best estimator for the ensemble. Doing both in one pass avoids running the
# same 5-fold grid search twice per model, and guarantees that the metrics
# printed here describe exactly the estimator stored in `tuned_models`.
tuned_models = {}
for model_name, model in base_models.items():
    print(f"Tuning {model_name}...")
    param_grid = param_grids[model_name]

    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # best_estimator_ is already refit on the whole training set by GridSearchCV.
    best_model = grid_search.best_estimator_
    tuned_models[model_name] = best_model
    y_pred = best_model.predict(X_test)

    print(f"--- {model_name} ---")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    # zero_division=0 matches the classification_report call below, so a class
    # that is never predicted scores 0 without emitting a warning.
    print(f"Precision: {precision_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred, average='weighted', zero_division=0):.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    print("----------------------------\n")

# Combine the tuned models into a soft-voting ensemble (class probabilities of
# the members are averaged). Each pair is (short alias, key in tuned_models).
ensemble_members = (
    ('lr', 'Logistic Regression'),
    ('svm', 'Support Vector Machine'),
    ('rf', 'Random Forest'),
)
ensemble = VotingClassifier(
    estimators=[(alias, tuned_models[key]) for alias, key in ensemble_members],
    voting='soft',
)

# Fit on the training split, then predict the held-out split.
print("Training Ensemble Model...")
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

# Compute the summary metrics first, then print the report.
ensemble_accuracy = accuracy_score(y_test, y_pred)
ensemble_precision = precision_score(y_test, y_pred, average='weighted')
ensemble_recall = recall_score(y_test, y_pred, average='weighted')
ensemble_f1 = f1_score(y_test, y_pred, average='weighted')

print("--- Ensemble Model Evaluation ---")
print(f"Accuracy:  {ensemble_accuracy:.4f}")
print(f"Precision: {ensemble_precision:.4f}")
print(f"Recall:    {ensemble_recall:.4f}")
print(f"F1 Score:  {ensemble_f1:.4f}")
print(classification_report(y_test, y_pred))
print("----------------------------\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nishaantsoni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nishaantsoni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nishaantsoni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Tuning Logistic Regression...
Logistic Regression best parameters: {'C': 8, 'solver': 'liblinear'}
----------------------------

Tuning Support Vector Machine...
Support Vector Machine best parameters: {'C': 3, 'gamma': 'scale', 'kernel': 'rbf'}
----------------------------

Tuning Random Forest...
Random Forest best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 195}
----------------------------

Training Ensemble Model...
--- Ensemble Model Evaluation ---
Accuracy:  0.8029
Precision: 0.7933
Recall:    0.8029
F1 Score:  0.7917
              precision    recall  f1-score   support

    negative       0.83      0.93      0.88      1889
     neutral       0.69      0.48      0.57       580
    positive       0.77      0.67      0.72       459

    accuracy                           0.80      2928
   macro avg       0.76      0.69      0.72      2928
weighted avg       0.79      0.80      0.79      2928

----------------------------

