## EDA

##### Imports

In [1]:
# general imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# imports for tweet cleaning
import regex as re
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# imports for modeling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

import warnings

In [2]:
# Load the training data from a path relative to the notebook's working directory.
df = pd.read_csv('./data/train.csv')

In [3]:
# Class balance: proportion of rows carrying each target label.
df['target'].value_counts(normalize = True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [4]:
# Preview the first rows of the raw training frame.
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Labels to predict.
y = df['target']

# Tweet text with URLs replaced by a single space.
# - raw string: '\S' is not a valid Python string escape, so the non-raw form warns.
# - regex = True: pandas 2.0 changed the default to False, which would treat the
#   pattern as a literal and leave every URL in place.
# - 'www\.': the dot is escaped so that words such as "awwwww" are not mistaken for URLs.
X = df['text'].str.replace(r'http\S+|www\.\S+', ' ', case = False, regex = True)

### Tweet Cleaning and Preprocessing

In [6]:
def tweet_cleaning(raw, lemmat = False, stem = False):
    """Turn a raw tweet into a space-separated string of cleaned, lower-case tokens.

    Steps: replace every non-letter with a space, lower-case, split on
    whitespace, drop English stopwords, then optionally lemmatize or stem.

    Parameters
    ----------
    raw : str
        The tweet text to clean.
    lemmat : bool, default False
        Lemmatize each remaining token with ``WordNetLemmatizer``.
    stem : bool, default False
        Stem each remaining token with ``PorterStemmer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.

    Notes
    -----
    ``lemmat`` and ``stem`` are mutually exclusive: if both are set (or both
    are unset) the tokens are returned without lemmatizing or stemming.
    """
    # 1. Remove non-letters.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw)

    # 2. Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # 3. Build the stopword lookup: apostrophes are stripped from the NLTK list,
    #    since step 1 has already removed them from the tweet. A set gives
    #    constant-time membership tests instead of a list scan per token.
    stops = set(" ".join(stopwords.words('english')).replace("'", "").split())

    # 4. Remove stopwords.
    meaningful_words = [w for w in words if w not in stops]

    # Lemmatizing corpus
    if lemmat and not stem:
        lemmatizer = WordNetLemmatizer()
        return " ".join(lemmatizer.lemmatize(word) for word in meaningful_words)

    # Stemming corpus
    if stem and not lemmat:
        p_stemmer = PorterStemmer()
        return " ".join(p_stemmer.stem(word) for word in meaningful_words)

    # Simple cleaning of corpus
    return " ".join(meaningful_words)

In [7]:
def _clean_lemmatized(text):
    """Trim leading whitespace from one tweet, then clean it with lemmatization on."""
    return tweet_cleaning(text.lstrip(), lemmat = True)

# Replace each tweet with its cleaned, lemmatized form.
X = X.apply(_clean_lemmatized)

In [8]:
# Spot-check the first few cleaned tweets.
X.head()

0           deed reason earthquake may allah forgive u
1                forest fire near la ronge sask canada
2    resident asked shelter place notified officer ...
3    people receive wildfire evacuation order calif...
4    got sent photo ruby alaska smoke wildfire pour...
Name: text, dtype: object

In [9]:
# Hold out a test set (default split size), stratified on the label so both
# parts keep the same class balance; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state = 42
)

# Unfitted vectorizers with default settings.
tfidf = TfidfVectorizer()
cvec = CountVectorizer()

In [10]:
def _feature_names(vectorizer):
    """Return the fitted vectorizer's vocabulary terms for use as column labels."""
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement when present and fall back on older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _dense_frame(vectorizer, docs, fit = False):
    """Vectorize `docs` and return the result as a dense DataFrame, one column per term.

    With fit = True the vectorizer learns its vocabulary from `docs` first;
    otherwise the already-fitted vocabulary is reused.
    """
    matrix = vectorizer.fit_transform(docs) if fit else vectorizer.transform(docs)
    return pd.DataFrame(matrix.toarray(), columns = _feature_names(vectorizer))


# Fit each vectorizer on the training text only, then reuse it for the test text.
X_train_cvec = _dense_frame(cvec, X_train, fit = True)
X_test_cvec = _dense_frame(cvec, X_test)
X_train_tfidf = _dense_frame(tfidf, X_train, fit = True)
X_test_tfidf = _dense_frame(tfidf, X_test)

In [11]:
# Display the TF-IDF training frame.
X_train_tfidf

Unnamed: 0,aa,aaaaaaallll,aaaaaand,aaarrrgghhh,aace,aal,aan,aannnnd,aar,aaronthefm,...,zojadelin,zombie,zone,zoom,zouma,zourryart,zrnf,zumiez,zurich,zxathetis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Display the TF-IDF test frame.
X_test_tfidf

Unnamed: 0,aa,aaaaaaallll,aaaaaand,aaarrrgghhh,aace,aal,aan,aannnnd,aar,aaronthefm,...,zojadelin,zombie,zone,zoom,zouma,zourryart,zrnf,zumiez,zurich,zxathetis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Baseline accuracy: the share of the most frequent class, i.e. the score of
# always predicting the majority label. Using .max() rather than indexing by
# label 0 keeps this correct whichever class happens to be the majority.
baseline = round(df['target'].value_counts(normalize = True).max(), 2)
baseline

0.57

In [14]:
# Instantiating Pipelines for potential models.
# Every stochastic estimator gets the same fixed seed so that re-running the
# notebook reproduces the same fitted models and scores.
pipe_rf_cvec = Pipeline([('cvec', CountVectorizer()),
                         ('rf', RandomForestClassifier(n_estimators = 100,
                                                       random_state = 42))])
pipe_rf_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                          ('rf', RandomForestClassifier(n_estimators = 100,
                                                        random_state = 42))])
pipe_ada_cvec = Pipeline([('cvec', CountVectorizer()),
                          ('ada', AdaBoostClassifier(n_estimators = 100,
                                                     random_state = 42))])
pipe_ada_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                           ('ada', AdaBoostClassifier(n_estimators = 100,
                                                      random_state = 42))])
pipe_svc_cvec = Pipeline([('cvec', CountVectorizer()),
                          ('svc', SVC(gamma = 'scale',
                                      random_state = 42))])
pipe_svc_tfidf = Pipeline([('tfidf', TfidfVectorizer()),
                           ('svc', SVC(gamma = 'scale',
                                       random_state = 42))])

# Instantiating vectorizer parameters: the grids search over the vectorizer's
# stop-word handling and n-gram range only; classifier settings stay fixed.
cvec_params = {'cvec__stop_words': [None, 'english'],
               'cvec__ngram_range': [(1,1), (1,2), (1,3)]}
tfidf_params = {'tfidf__stop_words': [None, 'english'],
                'tfidf__ngram_range': [(1,1), (1,2), (1,3)]}

# Random Forest GridSearches (5-fold cross-validation)
grid_rf_cvec = GridSearchCV(pipe_rf_cvec, cvec_params, cv = 5)
grid_rf_tfidf = GridSearchCV(pipe_rf_tfidf, tfidf_params, cv = 5)

# Adaboost GridSearches
grid_ada_cvec = GridSearchCV(pipe_ada_cvec, cvec_params, cv = 5)
grid_ada_tfidf = GridSearchCV(pipe_ada_tfidf, tfidf_params, cv = 5)

# SVC GridSearches
grid_svc_cvec = GridSearchCV(pipe_svc_cvec, cvec_params, cv = 5)
grid_svc_tfidf = GridSearchCV(pipe_svc_tfidf, tfidf_params, cv = 5)

In [15]:
# Grid searches to run, paired position-by-position with their display names.
models = [
    grid_rf_cvec, grid_rf_tfidf,
    grid_ada_cvec, grid_ada_tfidf,
    grid_svc_cvec, grid_svc_tfidf,
]
model_names = [
    'CountVectorized Random Forest', 'TFIDF Random Forest',
    'CountVectorized Adaboost', 'TFIDF Adaboost',
    'CountVectorized SVC', 'TFIDF SVC',
]

# Fit each grid search on the training text, then report the winning
# parameters and the best estimator's accuracy on the train and test splits.
for (model, model_name) in zip(models, model_names):
    model.fit(X_train, y_train)
    print(f'{model_name}')
    print(f'best params: {model.best_params_}')

    best = model.best_estimator_
    train_score = best.score(X_train, y_train)
    test_score = best.score(X_test, y_test)
    print(f'best estimator train score: {train_score}')
    print(f'best estimator test score: {test_score}')
    print()

# Majority-class reference point for the scores above.
print(f'baseline score:{baseline}')

CountVectorized Random Forest
best params: {'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}
best estimator train score: 0.9886144683832545
best estimator test score: 0.7925420168067226

TFIDF Random Forest
best params: {'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': None}
best estimator train score: 0.9889647924330005
best estimator test score: 0.7956932773109243

CountVectorized Adaboost
best params: {'cvec__ngram_range': (1, 2), 'cvec__stop_words': 'english'}
best estimator train score: 0.7880539499036608
best estimator test score: 0.7725840336134454

TFIDF Adaboost
best params: {'tfidf__ngram_range': (1, 3), 'tfidf__stop_words': None}
best estimator train score: 0.8013662637940094
best estimator test score: 0.7757352941176471

CountVectorized SVC
best params: {'cvec__ngram_range': (1, 1), 'cvec__stop_words': None}
best estimator train score: 0.9483272026624627
best estimator test score: 0.8046218487394958

TFIDF SVC
best params: {'tfidf__ngram_range': (1, 1), 'tfidf_