In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# This script performs the following tasks:
# 1. Reads a cleaned CSV file containing tweet data and weather information.
# 2. Converts the tweet timestamp to a datetime format for easier manipulation.
# 3. Creates categorical features for weather severity (snowfall, wind speed, precipitation) and converts them into one-hot encoded columns.
# 4. Generates interaction terms, such as wind chill, to capture combined effects of temperature and wind speed.
# 5. Extracts temporal features from the tweet timestamp, including hour of the day, day of the week, and whether it is a weekend.
# 6. Leaves the enhanced dataset in the in-memory DataFrame `df` for the following cells (this cell does not write it to disk).

# Load the pre-processed tweets + weather table and parse the tweet timestamp so the
# `.dt` accessor can be used for the temporal features below.
df = pd.read_csv('Input/df_pre-processed.csv')

df['tweet_created'] = pd.to_datetime(df['tweet_created'])

# Two-level severity flags: (source column, threshold, new column, label above, label otherwise).
# NaN measurements compare as False and therefore land in the "otherwise" level.
severity_specs = [
    ('snowfall', 1.0, 'snowfall_severity', 'heavy_snowfall', 'light_snowfall'),
    ('wind_speed', 30.0, 'wind_severity', 'high_wind', 'low_wind'),
    ('precipitation', 2.0, 'precipitation_severity', 'heavy_rain', 'light_rain'),
]
for source_col, threshold, severity_col, high_label, low_label in severity_specs:
    # Declaring both categories explicitly keeps the dummy encoding stable: with plain strings,
    # a dataset where every row falls in a single level would produce one dummy column that
    # `drop_first=True` then removes, and the expected `<severity_col>_<low_label>` column
    # would silently disappear.
    df[severity_col] = pd.Categorical(
        np.where(df[source_col] > threshold, high_label, low_label),
        categories=[high_label, low_label],
    )

# drop_first removes the first declared level (the "high"/"heavy" one), leaving exactly
# snowfall_severity_light_snowfall, wind_severity_low_wind and precipitation_severity_light_rain.
df = pd.get_dummies(df, columns=['snowfall_severity', 'wind_severity', 'precipitation_severity'], drop_first=True)


# Temperature x wind-speed interaction term.
# NOTE(review): the coefficients look like the metric wind-chill index, which presumably expects
# temperature in degrees Celsius and wind speed in km/h -- confirm the units of both columns.
# A negative wind_speed would yield NaN here because of the fractional power.
df['wind_chill'] = 13.12 + 0.6215 * df['temperature'] - 11.37 * (df['wind_speed'] ** 0.16) + 0.3965 * df['temperature'] * (df['wind_speed'] ** 0.16)


# Temporal features derived from the tweet timestamp.
df['hour_of_day'] = df['tweet_created'].dt.hour
df['day_of_week'] = df['tweet_created'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = np.where(df['day_of_week'].isin([5, 6]), 1, 0)  # 1 for Saturday/Sunday, else 0

In [2]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# This script performs the following tasks:
# 1. Loads a pre-trained BERT model and tokenizer for generating text embeddings.
# 2. Defines a custom PyTorch Dataset class to handle text data for BERT processing.
# 3. Implements a function to generate BERT embeddings for a list of texts in batches, using mean pooling to aggregate token-level embeddings.
# 4. Computes BERT embeddings for the 'cleaned_text' column of a DataFrame and stores them as a new column.
# 5. Ensures the embeddings are in a proper 2D numpy array format for further use in machine learning pipelines.


# Tokenizer and encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
# .eval() switches the module to inference mode and returns the module itself,
# so it can be chained directly onto the load call.
model = BertModel.from_pretrained(bert_checkpoint).eval()


class TextDataset(Dataset):
    """Map-style dataset that exposes an indexable collection of texts item by item."""

    def __init__(self, texts):
        # The collection is kept by reference (no copy is made).
        self.texts = texts

    def __len__(self):
        """Return how many texts the dataset holds."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the text stored at position ``idx`` unchanged."""
        text = self.texts[idx]
        return text


def get_bert_embeddings(texts, batch_size=32, max_length=128):
    """Embed each text as the mean of its BERT token vectors.

    Uses the module-level ``tokenizer`` and ``model``. Texts are processed in order, in
    batches, without gradient tracking.

    Args:
        texts: Indexable collection of strings (anything ``TextDataset`` accepts).
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Maximum number of tokens kept per text; longer texts are truncated.

    Returns:
        A 2D numpy array with one row per input text and one column per hidden unit of
        the encoder. For empty input the array has zero rows.
    """
    dataset = TextDataset(texts)
    if len(dataset) == 0:
        # np.vstack raises on an empty list, so return a well-formed empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    all_embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
            outputs = model(**inputs)

            # Average only over real tokens. Shorter texts are padded up to the longest text
            # in the batch; a plain mean over the sequence axis would mix the padding
            # positions' vectors into the result and make a text's embedding depend on which
            # other texts happen to share its batch.
            hidden = outputs.last_hidden_state
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_sum = (hidden * mask).sum(dim=1)
            token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
            embeddings = (token_sum / token_count).cpu().numpy()
            all_embeddings.append(embeddings)

    # Stack the per-batch matrices into a single (n_texts, hidden_size) array.
    return np.vstack(all_embeddings)


# Empty strings in a CSV are read back by pandas as NaN (a float), which is not valid
# tokenizer input; map missing values to '' and make sure every entry is a str.
texts = df['cleaned_text'].fillna('').astype(str).tolist()
bert_embeddings = get_bert_embeddings(texts)


# One embedding row per DataFrame row, as a 2D array, is required for the assignment below
# to line up with the original rows.
assert bert_embeddings.ndim == 2
assert bert_embeddings.shape[0] == len(df)


# Store each row's embedding vector as a single object-valued cell.
df['bert_embedding'] = list(bert_embeddings)

In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

# This script performs the following tasks:
# 1. Encodes the target sentiment labels using LabelEncoder for machine learning compatibility.
# 2. Combines multiple feature sets, including weather features, temporal features, severity features, and BERT embeddings, into a single feature matrix.
# 3. Ensures proper formatting and stacking of all feature sets, including converting BERT embeddings into a 2D numpy array.
# 4. Trains an XGBoost classifier using 10-fold stratified cross-validation to evaluate model performance.
# 5. Computes evaluation metrics (AUC-ROC, Precision, Recall, and F1 Score) for each class and fold.
# 6. Aggregates and saves the results, including average and standard deviation metrics, into a CSV file for further analysis.


# Encode the sentiment labels as consecutive integers so they can be used both as the
# training target and as column indices into predict_proba's output.
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['airline_sentiment'])


# --- Assemble the feature matrix -------------------------------------------------------------
X_weather = df[['snowfall', 'wind_speed', 'precipitation', 'humidity', 'temperature']].values
X_temporal = df[['hour_of_day', 'day_of_week', 'is_weekend']].values

# Indicator columns expected from the one-hot encoding of the severity features.
severity_columns = [
    'snowfall_severity_light_snowfall',
    'wind_severity_low_wind',
    'precipitation_severity_light_rain'
]

# Fail early with a clear message if the feature-engineering step did not produce them.
missing_columns = [col for col in severity_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"The following columns are missing: {missing_columns}")

X_severity = df[severity_columns].values

# The embeddings are stored one vector per cell; turn them back into a 2D matrix.
X_bert = np.array(df['bert_embedding'].tolist())
if X_bert.ndim == 1:
    # np.array produced an object array of vectors rather than a matrix; stack the rows.
    X_bert = np.vstack(X_bert)

X_combined = np.hstack((X_weather, X_temporal, X_severity, X_bert))
y = df['sentiment_encoded'].values


# --- Model and cross-validation scheme -------------------------------------------------------
# NOTE(review): this rebinds the notebook-level name `model`, which get_bert_embeddings reads
# as the BERT encoder. After this cell has run, re-run the BERT loading cell before calling
# get_bert_embeddings again. The name is kept in case later cells rely on it.
model = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

class_indices = np.unique(y)
all_results = []

for fold, (train_idx, test_idx) in enumerate(cv.split(X_combined, y), 1):
    X_train, X_test = X_combined[train_idx], X_combined[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model.fit(X_train, y_train)

    # Both prediction arrays are the same for every class, so compute them once per fold
    # rather than once per class.
    y_pred_proba = model.predict_proba(X_test)
    y_pred_class = model.predict(X_test)

    for class_idx in class_indices:
        # One-vs-rest AUC: this class's probability column against a binary "is this class" target.
        auc_roc = roc_auc_score((y_test == class_idx).astype(int), y_pred_proba[:, class_idx])

        # labels=[class_idx] with average=None returns a one-element array for this class.
        precision = precision_score(y_test, y_pred_class, labels=[class_idx], average=None)[0]
        recall = recall_score(y_test, y_pred_class, labels=[class_idx], average=None)[0]
        f1 = f1_score(y_test, y_pred_class, labels=[class_idx], average=None)[0]

        all_results.append({
            'Fold': fold,
            'Class': class_idx,
            'AUC_ROC': auc_roc,
            'Precision': precision,
            'Recall': recall,
            'F1_Score': f1
        })


# --- Aggregate and save ----------------------------------------------------------------------
results_df = pd.DataFrame(all_results)

# Per-class mean of every metric across folds.
average_results = results_df.groupby('Class').agg({
    'AUC_ROC': 'mean',
    'Precision': 'mean',
    'Recall': 'mean',
    'F1_Score': 'mean'
}).reset_index()
average_results['Fold'] = 'Average'

# Per-class standard deviation of every metric across folds.
std_results = results_df.groupby('Class').agg({
    'AUC_ROC': 'std',
    'Precision': 'std',
    'Recall': 'std',
    'F1_Score': 'std'
}).reset_index()
std_results['Fold'] = 'Std Dev'

# Per-fold rows first, then the 'Average' rows, then the 'Std Dev' rows.
final_results_df = pd.concat([results_df, average_results, std_results], ignore_index=True)

# to_csv does not create missing directories; make sure the target folder exists so the
# cross-validation results are not lost at the very last step.
output_path = 'Output/additional_feature_all_class_cross_validation.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_results_df.to_csv(output_path, index=False)