In [36]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

## Data

In [89]:
# Load the dataset from the CSV file in the working directory.
# pandas is already imported as `pd` in the notebook's imports cell, so it is
# not imported again here; no preprocessing happens in this cell.
DATA_PATH = 'data.csv'

data = pd.read_csv(DATA_PATH)

In [56]:
# List the column labels of the loaded dataset.
data.columns

Index(['timestamp', 'id', 'likes', 'query', 'replies', 'retweets', 'text',
       'user', 'outage', 'outage_state'],
      dtype='object')

In [57]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,timestamp,id,likes,query,replies,retweets,text,user,outage,outage_state
0,2012-11-01 23:50:22,264152432282578945,1,EversourceMA OR EversourceNH OR VelcoVT OR nat...,1.0,3,"Tom May, CEO of Northeast Utilities, the paren...",EversourceMA,1,WV OH PA NJ CT MA NY DE MD IN KY MI
1,2012-11-01 23:45:13,264151136792109056,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,0,@NYGovCuomo @lipanews @nationalgridus @nyseand...,readyforthenet,1,WV OH PA NJ CT MA NY DE MD IN KY MI
2,2012-11-01 23:34:44,264148498352590849,1,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,1,Some amazing video from the Wareham microburst...,EversourceMA,1,WV OH PA NJ CT MA NY DE MD IN KY MI
3,2012-11-01 23:34:20,264148399190851584,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,0.0,0,@nationalgridus Call me if you need some help ...,sparky1000,1,WV OH PA NJ CT MA NY DE MD IN KY MI
4,2012-11-01 23:31:56,264147793147490304,0,EversourceMA OR EversourceNH OR VelcoVT OR nat...,1.0,8,Current PSNH statewide w/o power: 885. We're d...,EversourceNH,1,WV OH PA NJ CT MA NY DE MD IN KY MI


In [49]:
# Number of rows in the dataset.
data.shape[0]

38069

In [58]:
# Count how many rows carry each value of the `outage` column.
data[["outage"]].value_counts()

outage
1         20431
0         17638
dtype: int64

In [88]:
# Sampling configuration: rows drawn per class for each split, and the seed
# passed to every `.sample` call so the split is reproducible.
TRAIN_SAMPLES_PER_CLASS = 500
TEST_SAMPLES_PER_CLASS = 2000
RANDOM_STATE = 42


def split_class_samples(class_samples, n_train, n_test, random_state):
    """Draw disjoint train and test samples from the rows of one class.

    Parameters
    ----------
    class_samples : pandas.DataFrame
        Rows belonging to a single class.
    n_train : int
        Number of rows to sample for training.
    n_test : int
        Number of rows to sample for testing, drawn only from the rows
        that were not selected for training.
    random_state : int
        Seed passed to both `DataFrame.sample` calls.

    Returns
    -------
    tuple of pandas.DataFrame
        `(train, remaining, test)`: the training sample, `class_samples`
        without the training rows, and the test sample drawn from
        `remaining`.

    Raises
    ------
    ValueError
        Raised by `DataFrame.sample` when more rows are requested than are
        available (sampling is done without replacement).
    """
    train = class_samples.sample(n=n_train, random_state=random_state)
    # Exclude the training rows by index label so the test sample cannot
    # overlap with them.
    remaining = class_samples[~class_samples.index.isin(train.index)]
    test = remaining.sample(n=n_test, random_state=random_state)
    return train, remaining, test


# Separate samples for each class
outage_samples = data[data['outage'] == 1]
no_outage_samples = data[data['outage'] == 0]

# Sample training rows first, then testing rows from what is left, per class
(
    outage_training_samples,
    outage_remaining_samples,
    outage_testing_samples,
) = split_class_samples(
    outage_samples, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS, RANDOM_STATE
)
(
    no_outage_training_samples,
    no_outage_remaining_samples,
    no_outage_testing_samples,
) = split_class_samples(
    no_outage_samples, TRAIN_SAMPLES_PER_CLASS, TEST_SAMPLES_PER_CLASS, RANDOM_STATE
)

# Concatenate the samples from both classes (outage rows first, as before)
training_data = pd.concat([outage_training_samples, no_outage_training_samples])
testing_data = pd.concat([outage_testing_samples, no_outage_testing_samples])

# Guard against train/test leakage: no index label may appear in both splits.
# NOTE(review): this checks row identity only; rows with identical `text`
# could still land in both splits -- TODO confirm whether that matters here.
assert training_data.index.intersection(testing_data.index).empty, \
    "training and testing data share rows"

# Verify the distribution
print(training_data['outage'].value_counts())
print(testing_data['outage'].value_counts())


0    500
1    500
Name: outage, dtype: int64
0    2000
1    2000
Name: outage, dtype: int64


In [90]:
def _mean_text_length_by_class(frame):
    """Return the mean length of the `text` column for each `outage` value."""
    text_lengths = frame['text'].str.len()
    return text_lengths.groupby(frame['outage']).mean()


# Per-class mean text length for the training and testing splits.
avg_length_train = _mean_text_length_by_class(training_data)
avg_length_test = _mean_text_length_by_class(testing_data)


In [94]:
# Show the per-class mean text length for the training split, followed by the
# unweighted mean of those per-class means.
print(avg_length_train, avg_length_train.mean(), sep="\n")

outage
0    101.496
1     99.430
Name: text, dtype: float64
100.463


In [92]:
# Show the per-class mean text length for the testing split, followed by the
# unweighted mean of those per-class means.
print(avg_length_test, avg_length_test.mean(), sep="\n")

outage
0    100.8235
1     99.2205
Name: text, dtype: float64