In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4                             import BeautifulSoup  
from nltk.corpus                     import stopwords
from nltk.stem                       import WordNetLemmatizer
from nltk.tokenize                   import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.metrics                 import confusion_matrix,classification_report,roc_curve,roc_auc_score
from sklearn.model_selection         import cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline                import Pipeline,FeatureUnion

In [2]:
# Read the cleaned tweets, then drop the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV -- verify).
df = pd.read_csv('../datasets/clean_tweets.csv')
df = df.drop(columns='Unnamed: 0')

In [3]:
# Count missing values per column.
df.isna().sum()

username        0
text            0
label           0
dates           0
clean_text    230
target          0
lat             0
long            0
dtype: int64

In [4]:
# Discard every row that has a missing value in any column
# (the null count above shows only `clean_text` has any).
df = df.dropna(axis=0, how='any')

In [5]:
# Preview the first five rows after dropping incomplete records.
df.head(n=5)

Unnamed: 0,username,text,label,dates,clean_text,target,lat,long
0,Amithridya1001,No update or action on supply issue at near 2 ...,outage,2019-07-23,update action supply issue near still even cus...,1,42.351035,-71.323284
1,ellensweeps,I live in what is considered the largest conce...,outage,2019-07-23,live considered largest concentrated outage ma...,1,42.375944,-71.767229
2,CoxHelp,"No, there is not an outage and your modem is r...",outage,2019-07-23,outage modem receiving signal us reset modem f...,1,42.361009,-71.490488
3,Steven_McKie,No big deal @Xfinity I didn’t have to work tod...,outage,2019-07-23,big deal xfinity work today anything outage ni...,1,42.332771,-71.387141
4,Larapic,So we filled the Adpt with post-its today and ...,outage,2019-07-23,filled adpt post today second outage ended dis...,1,42.348956,-70.991783


In [6]:
# Model input is the cleaned tweet text; the label is the `target` column.
X = df['clean_text']
y = df['target']

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Grid-search a bag-of-words + logistic-regression pipeline with 3-fold
# cross-validation, scored on accuracy.

# Define Pipeline.
# solver='liblinear' is set explicitly because it supports both the 'l1' and
# 'l2' penalties searched below; relying on the library default breaks the 'l1'
# candidates when that default is a solver without L1 support.
# random_state pins liblinear's internal shuffling so reruns give the same fit.
pipe = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),                                   # step 1: text -> token counts
    ('model', LogisticRegression(solver='liblinear', random_state=42)),  # step 2: classifier
])

# Construct Grid Parameters.
# Keys use the '<step name>__<parameter>' convention to reach into the pipeline.
hyperparams = {
    'vectorizer__ngram_range': [(1, 1), (2, 2), (1, 2)],
    # max_df must be a float to be read as a proportion of documents.
    # 1.0 means "no upper cut-off"; the integer 1 would mean "keep only terms
    # that appear in at most one document".
    'vectorizer__max_df': [0.9, 0.8, 1.0],
    'vectorizer__min_df': [0.001],

    'model__penalty': ['l1', 'l2'],
    'model__C': [0.1, 1, 3, 10],
    'model__max_iter': [50, 100, 150],
}

# Perform Grid Search.
gs = GridSearchCV(pipe,  # the pipeline takes the place of a bare estimator
                  param_grid=hyperparams,
                  cv=3,
                  scoring='accuracy')

# Silence warnings only while fitting. catch_warnings() restores the previous
# filter configuration on exit instead of overwriting it.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    results = gs.fit(X_train, y_train)

# Display the refit best pipeline (last expression so the notebook renders it).
results.best_estimator_

In [None]:
# Summarise the grid search: fitted step parameters, scores and winning grid point.

# Look the steps up by the names given in the grid-search pipeline rather than
# by position, so this keeps working if steps are reordered or inserted.
vectorizer_params = results.best_estimator_.named_steps['vectorizer'].get_params()
model_params = results.best_estimator_.named_steps['model'].get_params()

# Get score.
# best_score_ is the mean cross-validated score of the best parameter setting,
# not accuracy measured on the full training set, so it is labelled as CV.
train_score = results.best_score_
print('Best CV accuracy: {:.4f}'.format(train_score))
test_score = results.score(X_test, y_test)
print('Best TEST set accuracy: {:.4f}'.format(test_score))
results.best_params_

In [None]:
# Final model with hard-coded hyperparameters
# (presumably the grid-search winner -- confirm against results.best_params_).
# penalty='l1' needs a solver that supports L1 regularisation, so liblinear is
# requested explicitly; random_state makes the liblinear fit repeatable.
pipe = Pipeline(steps=[
    ('cv', CountVectorizer(max_df=0.9, min_df=0.001, ngram_range=(1, 1))),
    ('model', LogisticRegression(C=0.1,
                                 max_iter=50,
                                 penalty='l1',
                                 solver='liblinear',
                                 random_state=42)),
])

In [None]:
# Fit the final pipeline on the training split.
# NOTE(review): this rebinds `results`, which until now held the fitted
# GridSearchCV object; after this cell it refers to the fitted pipeline.
results = pipe.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the held-out split.
pred = pipe.predict(X_test)
pred_proba = pipe.predict_proba(X_test)

In [None]:
# Confusion matrix of the test-set predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Heatmap of the test-set confusion matrix.
fig, ax = plt.subplots()

# Cells hold integer counts, so format them as integers rather than floats.
sns.heatmap(cm, annot=True, ax=ax, fmt='d', cmap='Greys', cbar=False)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# Tick labels follow the sorted `target` values (0, then 1). The preview rows
# pair target == 1 with label 'outage'; what target == 0 stands for is not
# visible here, so it is labelled by value only -- TODO confirm and rename.
class_names = ['target = 0', 'target = 1 (outage)']
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names);

In [None]:
# Per-class precision / recall / F1 at an explicit decision threshold.
# classification_report is already imported in the notebook's import cell.
thresh = .5

# Compare the second predict_proba column against the threshold, then cast the
# boolean mask to 0/1 so predicted labels share the integer type of y_test.
print(classification_report(y_test, (pred_proba[:, 1] > thresh).astype(int)))

In [None]:
# Compare the fitted pipeline with a no-skill baseline: print ROC AUC for both
# and draw their ROC curves on one figure.

# Baseline that assigns the same score (0) to every test row.
ns_probs = [0 for _ in range(len(y_test))]

# Scores for the pipeline: the second column of predict_proba.
pipe_probs = pipe.predict_proba(X_test)
pipe_probs = pipe_probs[:, 1]
ns_auc = roc_auc_score(y_test, ns_probs)
pipe_auc = roc_auc_score(y_test, pipe_probs)

# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (pipe_auc))

# calculate roc curves (the returned thresholds are not needed here)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
pipe_fpr, pipe_tpr, _ = roc_curve(y_test, pipe_probs)

# plot both curves on an explicit Axes so the figure is self-contained
fig, ax = plt.subplots()
ax.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
ax.plot(pipe_fpr, pipe_tpr, marker='.', label='Logistic')

# axis labels and title
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')

# show the legend
ax.legend()

# show the plot
plt.show()