# Imports

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4                             import BeautifulSoup
from nltk.corpus                     import stopwords
from nltk.tokenize                   import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.metrics                 import confusion_matrix,classification_report
from sklearn.metrics                 import roc_auc_score, roc_curve
from sklearn.model_selection         import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline                import Pipeline,FeatureUnion

warnings.filterwarnings("ignore")

In [3]:
# Load the cleaned tweets into the working DataFrame.
tweets_path = '../datasets/clean_tweets.csv'
df = pd.read_csv(tweets_path)

# Preprocessing

In [5]:
# Count missing values per column before any rows are dropped.
df.isna().sum()

username         0
text             0
date             0
clean_text       0
target           0
imbalance_fix    0
targets          0
lat              0
long             0
dtype: int64

In [6]:
# Drop every row that has at least one missing value. Rebinding `df` (instead
# of mutating it with inplace=True) keeps this cell safe to re-run.
df = df.dropna(how='any')

In [7]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

username         0
text             0
date             0
clean_text       0
target           0
imbalance_fix    0
targets          0
lat              0
long             0
dtype: int64

In [8]:
# (rows, columns) of the frame after dropping incomplete rows.
n_rows, n_cols = df.shape
n_rows, n_cols

(11453, 9)

In [9]:
# Labels come from the `target` column, features from the cleaned tweet text.
y = df['target']
X = df['clean_text']

# Stratified 70/30 hold-out split with a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [10]:
#Keep a vectorized DF for the records
# Unigram counts learned from the training split only; max_df/min_df are
# document-frequency proportions used to prune the vocabulary.
cvec = CountVectorizer(max_df=0.9, min_df=0.0012, ngram_range=(1, 1))
counts = cvec.fit_transform(X_train)

# `get_feature_names` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old accessor on releases that predate it.
if hasattr(cvec, 'get_feature_names_out'):
    feature_names = cvec.get_feature_names_out()
else:
    feature_names = cvec.get_feature_names()

# Dense document-term table: one row per training tweet, one column per term.
DF = pd.DataFrame(counts.toarray(), columns=feature_names)

In [11]:
# (training documents, vocabulary terms) of the count matrix.
n_docs, n_terms = DF.shape
n_docs, n_terms

(8017, 1460)

In [12]:
# Persist the document-term counts for later reference.
countvec_path = '../datasets/countvec_words.csv'
DF.to_csv(countvec_path)

# Model Set Up

In [None]:
# Define Pipeline
# The solver is pinned to liblinear because the grid below searches both 'l1'
# and 'l2' penalties. The default solver in scikit-learn >= 0.22 (lbfgs) does
# not support 'l1', so every l1 candidate would otherwise fail to fit.
pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),                   # step 1: text -> token counts
    ('model', LogisticRegression(solver='liblinear')),   # step 2: classifier
])

# Construct Grid Parameters
# Keys follow the "<step name>__<parameter>" convention of Pipeline.
hyperparams = {
    'vectorizer__ngram_range': [(1, 1), (2, 2), (1, 2)],
    'vectorizer__max_df': [0.9, 0.8, 1.0],
    'vectorizer__min_df': [0.0012],

    'model__penalty': ['l1', 'l2'],
    'model__C': [0.1, 1, 3, 10],
    'model__max_iter': [50, 100, 150],
}

# Perform Grid Search
# The pipeline takes the place of a bare estimator, so the vectorizer is
# re-fitted inside each of the 3 cross-validation folds.
gs = GridSearchCV(pipe,
                  param_grid=hyperparams,
                  cv=3,
                  scoring='accuracy')
results = gs.fit(X_train, y_train)

# Get best params
results.best_estimator_

In [None]:
# Pull the fitted steps of the winning pipeline by name and keep their
# parameters for the record.
best_pipe = results.best_estimator_
vectorizer_params = best_pipe.named_steps['vectorizer'].get_params()
model_params = best_pipe.named_steps['model'].get_params()

# Get score
# best_score_ is the mean cross-validated accuracy of the best candidate on
# the training data, not accuracy on the full training set, so it is labelled
# as a CV score.
train_score = results.best_score_
print('Best CV accuracy (training data): {:.4f}'.format(train_score))
test_score = results.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score))
results.best_params_

In [None]:
# Final model with hard-coded hyperparameters, so the grid search does not
# have to be re-run.
# - solver='liblinear': required for penalty='l1'; the default solver in
#   scikit-learn >= 0.22 (lbfgs) raises on an l1 penalty.
# - min_df=0.0012: matches the only value the grid search evaluated and the
#   vocabulary saved earlier (this cell previously used 0.001).
pipe = Pipeline(steps=[
    ('cv', CountVectorizer(max_df=0.9, min_df=0.0012, ngram_range=(1, 1))),
    ('model', LogisticRegression(C=0.1, max_iter=50, penalty='l1', solver='liblinear')),
])

# Model Evaluation

In [None]:
#Get predictions and probabilities
# Train the final pipeline on the training split, then score the held-out
# tweets twice: class probabilities first, hard labels second.
results = pipe.fit(X_train, y_train)
pred_proba = pipe.predict_proba(X_test)
pred = pipe.predict(X_test)

In [None]:
#Confusion Matrix
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Heat-map of the confusion matrix. Cells hold integer counts, so they are
# annotated with fmt="d" (".1f" rendered them as e.g. "123.0").
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, ax=ax, fmt="d", cmap='YlOrBr', cbar=False)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# NOTE(review): assumes the first class is "no outage" and the second is
# "outage" -- confirm against the encoding of the `target` column.
class_names = ['No Power Outage', 'Power Outage']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

In [None]:
#Classification Report
# Report precision/recall at a custom decision threshold: a tweet is labelled
# positive only when the probability in column 1 of pred_proba exceeds `thresh`.
thresh = .7

# Cast the boolean mask to 0/1 so the predictions use the same label type as
# y_test. NOTE(review): assumes `target` is encoded as integers 0/1 -- confirm.
pred_thresh = (pred_proba[:, 1] > thresh).astype(int)
print(classification_report(y_test, pred_thresh))

In [14]:
#ROC Curve
# "No skill" baseline: the same constant score for every test example.
ns_probs = [0] * len(y_test)

# Scores for the fitted pipeline: column 1 of predict_proba.
pipe_probs = pipe.predict_proba(X_test)[:, 1]
ns_auc = roc_auc_score(y_test, ns_probs)
pipe_auc = roc_auc_score(y_test, pipe_probs)

# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (pipe_auc))

# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
pipe_fpr, pipe_tpr, _ = roc_curve(y_test, pipe_probs)

# plot the roc curve for the model on its own axes
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(ns_fpr, ns_tpr, linestyle='--', label='Base')
roc_ax.plot(pipe_fpr, pipe_tpr, marker='.', label='LR')

# axis labels
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')

# show the legend
roc_ax.legend()
roc_ax.set_title('ROC/AUC')

# show the plot
plt.show()

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.

In [304]:
#Saving predictions
pred_train=pd.DataFrame(pipe.predict(X_train))
pred_train['origin']='train'
pred_test=pd.DataFrame(pipe.predict(X_test))
pred_test['origin']='test'

In [305]:
pred_test['index']=indices

In [306]:
pred_test.to_csv('./preds_index.csv')

In [307]:
all_preds=pd.concat([pred_train,pred_test], axis=0,ignore_index=True)

In [308]:
all_preds.rename(columns={0:'preds','origin':'origin'},inplace=True)

In [311]:
pd.concat([df,all_preds]);

In [310]:
df.to_csv('./tweets_and_preds.csv')