In [36]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

## Data

In [89]:
# Load the raw dataset from CSV into a DataFrame.
# No preprocessing happens in this cell; `data` is the frame exactly as read.
import pandas as pd

data = pd.read_csv('data.csv')

In [56]:
# Inspect the column names of the loaded frame.
data.columns

Index(['timestamp', 'id', 'likes', 'query', 'replies', 'retweets', 'text',
       'user', 'outage', 'outage_state'],
      dtype='object')

In [57]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,timestamp,id,likes,query,replies,retweets,text,user,outage,outage_state
0,2012-11-01 23:50:22,264152432282578945,1,EversourceMA OR EversourceNH OR VelcoVT OR nat...,1.0,3,"Tom May, CEO of Northeast Utilities, the paren...",EversourceMA,1,WV OH PA NJ CT MA NY DE MD IN KY MI
1,2012-11-01 23:45:13,264151136792109056,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,0,@NYGovCuomo @lipanews @nationalgridus @nyseand...,readyforthenet,1,WV OH PA NJ CT MA NY DE MD IN KY MI
2,2012-11-01 23:34:44,264148498352590849,1,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,1,Some amazing video from the Wareham microburst...,EversourceMA,1,WV OH PA NJ CT MA NY DE MD IN KY MI
3,2012-11-01 23:34:20,264148399190851584,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,0,@nationalgridus Call me if you need some help ...,sparky1000,1,WV OH PA NJ CT MA NY DE MD IN KY MI
4,2012-11-01 23:31:56,264147793147490304,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,1.0,8,Current PSNH statewide w/o power: 885. We're d...,EversourceNH,1,WV OH PA NJ CT MA NY DE MD IN KY MI


In [49]:
# Total number of rows in the loaded frame.
len(data)

38069

In [58]:
# Row count per value of the `outage` label column (class balance check).
data[["outage"]].value_counts()

outage
1         20431
0         17638
dtype: int64

In [88]:
# Sampling configuration: per-class sample sizes and the seed shared by every
# draw, kept in one place so the split can be resized or re-seeded consistently.
N_TRAIN_PER_CLASS = 500
N_TEST_PER_CLASS = 2000
SAMPLING_SEED = 42

# Separate samples for each class
outage_samples = data[data['outage'] == 1]
no_outage_samples = data[data['outage'] == 0]

# Randomly sample N_TRAIN_PER_CLASS rows from each class for training data
outage_training_samples = outage_samples.sample(n=N_TRAIN_PER_CLASS, random_state=SAMPLING_SEED)
no_outage_training_samples = no_outage_samples.sample(n=N_TRAIN_PER_CLASS, random_state=SAMPLING_SEED)

# Concatenate the samples from both classes for training data
training_data = pd.concat([outage_training_samples, no_outage_training_samples])

# Get the remaining samples for testing data: drop every row already used for
# training so the test draw cannot pick a training row.
outage_remaining_samples = outage_samples.drop(index=outage_training_samples.index)
no_outage_remaining_samples = no_outage_samples.drop(index=no_outage_training_samples.index)

# Randomly sample N_TEST_PER_CLASS rows from each class for testing data
outage_testing_samples = outage_remaining_samples.sample(n=N_TEST_PER_CLASS, random_state=SAMPLING_SEED)
no_outage_testing_samples = no_outage_remaining_samples.sample(n=N_TEST_PER_CLASS, random_state=SAMPLING_SEED)

# Concatenate the samples from both classes for testing data
testing_data = pd.concat([outage_testing_samples, no_outage_testing_samples])

# Guard against train/test leakage: no row label may appear in both splits.
assert training_data.index.intersection(testing_data.index).empty, \
    "training and testing samples overlap"

# Verify the distribution
print(training_data['outage'].value_counts())
print(testing_data['outage'].value_counts())


0    500
1    500
Name: outage, dtype: int64
0    2000
1    2000
Name: outage, dtype: int64


In [90]:
# Mean character length of `text` per `outage` class, for each split.
# The lengths are computed once per split and then averaged within each class.
avg_length_train = training_data['text'].str.len().groupby(training_data['outage']).mean()
avg_length_test = testing_data['text'].str.len().groupby(testing_data['outage']).mean()


In [94]:
# Per-class mean text length (characters) for the training split, followed by
# the unweighted mean of those per-class means.
print(avg_length_train)
print(avg_length_train.mean())

outage
0    101.496
1     99.430
Name: text, dtype: float64
100.463


In [96]:
# Per-class mean text length (characters) for the testing split, followed by
# the unweighted mean of those per-class means.
print(avg_length_test)
print(avg_length_test.mean())

outage
0    100.8235
1     99.2205
Name: text, dtype: float64
100.02199999999999


## Baselines

In [99]:
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def run_nlp_model(model_name, train_data, test_data):
    """
    Train a TF-IDF baseline classifier and print its test-set metrics.

    The model name is validated before any data is touched, so an unsupported
    name fails immediately instead of after the TF-IDF features have been
    built (or being masked by an unrelated data error).

    Workflow: fit a TfidfVectorizer on the training text, transform the test
    text with the same vocabulary, fit the selected classifier with its
    default settings, predict on the test features, and print accuracy,
    precision, recall and F1. Nothing is returned; results are printed only.

    Args:
    - model_name (str): Name of the model ('xgboost', 'svm', or 'logistic').
    - train_data (DataFrame): Training data with 'text' and 'outage' columns.
    - test_data (DataFrame): Testing data with 'text' and 'outage' columns.

    Raises:
    - ValueError: If model_name is not one of the supported names.
    """
    # Fail fast on a bad model name, before doing any feature extraction.
    supported_models = ('xgboost', 'svm', 'logistic')
    if model_name not in supported_models:
        raise ValueError("Invalid model name. Choose from 'xgboost', 'svm', or 'logistic'.")

    # Model selection (all classifiers use their library defaults)
    if model_name == 'xgboost':
        model = XGBClassifier()
    elif model_name == 'svm':
        model = SVC()
    else:  # 'logistic' is the only remaining validated name
        model = LogisticRegression()

    # Extract labels from the 'outage' column for training and testing data
    train_labels = train_data['outage']
    test_labels = test_data['outage']

    # Preprocessing: the vocabulary and IDF weights are learned from the
    # training text only; the test text is transformed with that fitted state.
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(train_data['text'])
    test_features = vectorizer.transform(test_data['text'])

    # Training
    model.fit(train_features, train_labels)

    # Testing
    predictions = model.predict(test_features)

    # Evaluation
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)

    # Print the evaluation metrics
    print("Model:", model_name)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)


### SVM

In [101]:
# SVM baseline: default SVC trained on TF-IDF features of the training sample
# and scored on the testing sample.
run_nlp_model('svm', training_data, testing_data)

Model: svm
Accuracy: 0.6615
Precision: 0.671443736730361
Recall: 0.6325
F1-score: 0.6513903192584963


### Logistic Regression

In [102]:
# Logistic Regression baseline: default LogisticRegression trained on TF-IDF
# features of the training sample and scored on the testing sample.
run_nlp_model('logistic', training_data, testing_data)

Model: logistic
Accuracy: 0.6535
Precision: 0.652281746031746
Recall: 0.6575
F1-score: 0.6548804780876494


### XGBoost

In [103]:
# XGBoost baseline: default XGBClassifier trained on TF-IDF features of the
# training sample and scored on the testing sample.
run_nlp_model('xgboost', training_data, testing_data)


Model: xgboost
Accuracy: 0.6015
Precision: 0.5881841876629018
Recall: 0.677
F1-score: 0.6294746629474663
