In [None]:
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_lg')
df = pd.read_csv('data/train.csv')
df = df.fillna('')

In [2]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
from sklearn.model_selection import train_test_split
df['text'] = df['text'].apply(lambda x: x.replace('%20', ' '))
df['text'] = df['text'].apply(lambda x: x.replace('@', ''))
train, val = train_test_split(df, stratify=df['target'])

test = pd.read_csv('data/test.csv')
test = test.fillna('')
test['text'] = test['text'].apply(lambda x: x.replace('%20', ' '))
test['text'] = test['text'].apply(lambda x: x.replace('@', ''))

In [4]:
cs = pd.crosstab(df['keyword'], df['target'], normalize='index')
cs['pct_diff'] = abs(cs[0].values - cs[1].values)

cs['pct_diff'].sort_values(ascending=False)[:20]

keyword
aftershock           1.000000
derailment           1.000000
debris               1.000000
wreckage             1.000000
body%20bags          0.951220
outbreak             0.950000
typhoon              0.947368
oil%20spill          0.947368
ruin                 0.945946
blazing              0.941176
suicide%20bombing    0.939394
body%20bag           0.939394
electrocute          0.937500
suicide%20bomber     0.935484
screaming            0.888889
traumatised          0.885714
panicking            0.878788
blew%20up            0.878788
blight               0.875000
bombing              0.862069
Name: pct_diff, dtype: float64

## Spacy NLP, Tokenize, and Vectorize Text Column

In [5]:
def text_to_doc(text):
    doc = nlp(text)
    return doc

In [6]:
def tokenize(doc):
    return [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]

In [7]:
def vectorize(doc):
    return doc.vector

In [8]:
train['docs'] = train['text'].apply(text_to_doc)
val['docs'] = val['text'].apply(text_to_doc)
test['docs'] = test['text'].apply(text_to_doc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
train['tokens'] = train['docs'].apply(tokenize)
val['tokens'] = val['docs'].apply(tokenize)
test['tokens'] = test['docs'].apply(tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
train['vectors'] = train['docs'].apply(vectorize)
val['vectors'] = val['docs'].apply(vectorize)
test['vectors'] = test['docs'].apply(vectorize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
train['tokens'][:10]

5323    [hashtagteaclub, place, ex, Pars, defender, An...
466     [volleyball, Attack, II, Volleyball, Training,...
2966    [Drowning, Girl, Caitlin, R., Kiernan, Centipe...
6658    [ALIPAPER, woman, get, problem, keepingthevigi...
2565    [let, bring, matter, hard, try, beconfident, l...
6243    [ÛÏLordBrathwaite, ahh, hate, snow, \n\n, lol...
335     [samihonkonen, time, 23, hour, late, series, W...
2405    [jozerphine, literally, look, yeah, derail, SM...
5313    [family, sue, Legionnaires, 40, family, affect...
3798    [toddler, Bedding, Firetruck, Bundle, Fire, Tr...
Name: tokens, dtype: object

## TFIDF Bag of Words Model

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [13]:
vect = TfidfVectorizer(ngram_range=(1,3), max_df=.9, min_df=2, max_features=1000)
clf = RandomForestClassifier(n_estimators=50, max_depth=20)

In [14]:
X_train = vect.fit_transform(train['text'])
X_val = vect.transform(val['text'])
X_test = vect.transform(val['text'])
clf.fit(X_train, train['target'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
clf.score(X_train, train['target'])

0.8043440182168505

In [16]:
# Baseline model
clf.score(X_val, val['target'])

0.7484243697478992

In [17]:
X_train.shape

(5709, 1000)

## Spacy Word Embedding Model

In [18]:
clf = RandomForestClassifier(n_estimators=50, max_depth=20)

In [19]:
X_train = [doc.vector for doc in train['docs']]
X_val = [doc.vector for doc in val['docs']]
X_test = [doc.vector for doc in test['docs']]

In [20]:
clf.fit(X_train, train['target'])    

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
print(clf.score(X_train, train['target']))
print(clf.score(X_val, val['target']))

0.9891399544578735
0.8046218487394958


In [22]:
from sklearn.model_selection import RandomizedSearchCV
clf = RandomForestClassifier()

params = {'n_estimators':range(50, 150),
         'max_depth':range(10,50)}

In [23]:
grid = RandomizedSearchCV(clf, params, verbose=1, n_jobs=-1, n_iter=50)
grid.fit(X_train, train['target'])

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  5.7min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
            

In [24]:
grid.best_score_

0.7896305664831446

In [25]:
grid.best_params_

{'n_estimators': 133, 'max_depth': 41}

In [26]:
print(grid.score(X_train, train['target']))
print(grid.score(X_val, val['target']))

0.9891399544578735
0.8135504201680672


In [27]:
y_pred = grid.predict(X_test)
pred_sub = pd.DataFrame({'id':test['id'], 'target':y_pred})

In [28]:
samp_sub = pd.read_csv('data/sample_submission.csv')
samp_sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [29]:
pred_sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [30]:
pred_sub.to_csv('word_embeddings_sub.csv', index=False)

## Neural Network Model

In [31]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, GlobalAveragePooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

In [32]:
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=5)
early_stop = EarlyStopping(patience=7)
model = Sequential()
model.add(Dense(16, activation='relu', input_dim=300))
model.add(Dropout(.5))
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [33]:
model.compile(optimizer='adamax', loss='binary_crossentropy', metrics=['accuracy'])

In [34]:
X_train = pd.DataFrame(X_train)
X_val = pd.DataFrame(X_val)
X_test = pd.DataFrame(X_test)

In [35]:
model.fit(X_train, train['target'], epochs=200, validation_data=(X_val, val['target']), callbacks=[reduce_lr, early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200


<keras.callbacks.History at 0x2135446fe48>

In [36]:
y_pred = model.predict(X_test)

In [37]:
y_pred = y_pred.round(0)

In [38]:
y_pred = y_pred.reshape((3263,))

In [39]:
pred_sub = pd.DataFrame({'id':test['id'], 'target':y_pred})
pred_sub['target'] = pred_sub['target'].apply(lambda x: int(x))
pred_sub.to_csv('simple_nn_sub.csv', index=False)

In [40]:
pred_sub

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


### Hyperparameter Tuning

In [41]:
from keras_tuner import RandomSearch
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    model = Sequential()
    model.add(Dense(hp.Choice('initial_units', [16, 32, 64, 128], default=16), activation='relu', input_dim=300))
    if hp.Boolean('add_dropout'):
        model.add(Dropout(hp.Float('dropout', .01, .8)))
    for i in range(1, hp.Int('num_layers', 1, 4)):
        model.add(Dense(hp.Choice(f'units_{i}', [16, 32, 64, 128]), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(hp.Float('learning_rate', .001, .1)), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [42]:
class MyTuner(RandomSearch):
  def run_trial(self, trial, *args, **kwargs):
    # You can add additional HyperParameters for preprocessing and custom training loops
    # via overriding `run_trial`
    kwargs['batch_size'] = trial.hyperparameters.Choice('batch_size', [32,64,128])
#     kwargs['epochs'] = trial.hyperparameters.Int('epochs', 30, 70)
    super(MyTuner, self).run_trial(trial, *args, **kwargs)

In [43]:
tuner = MyTuner(build_model, objective='val_accuracy', max_trials=200, executions_per_trial=3)

In [44]:
tuner.search(X_train, train['target'], epochs=100, validation_data=(X_val, val['target']), callbacks=[EarlyStopping(patience=5)])

Trial 200 Complete [00h 00m 07s]
val_accuracy: 0.8224789897600809

Best val_accuracy So Far: 0.8310574293136597
Total elapsed time: 00h 41m 28s
INFO:tensorflow:Oracle triggered exit


In [45]:
best_hp = tuner.get_best_hyperparameters(10)
[x.values for x in best_hp]

[{'initial_units': 128,
  'add_dropout': True,
  'num_layers': 4,
  'learning_rate': 0.0016811370972802673,
  'batch_size': 64,
  'units_1': 128,
  'dropout': 0.3105276090115756,
  'units_2': 32,
  'units_3': 128},
 {'initial_units': 16,
  'add_dropout': True,
  'num_layers': 4,
  'learning_rate': 0.0053621070764740845,
  'batch_size': 128,
  'units_1': 32,
  'dropout': 0.3832748803463333,
  'units_2': 16,
  'units_3': 128},
 {'initial_units': 16,
  'add_dropout': True,
  'num_layers': 2,
  'learning_rate': 0.008893345372387871,
  'batch_size': 128,
  'units_1': 128,
  'dropout': 0.37599956645412413,
  'units_2': 32,
  'units_3': 32},
 {'initial_units': 128,
  'add_dropout': False,
  'num_layers': 2,
  'learning_rate': 0.027094340070893928,
  'batch_size': 128,
  'units_1': 16,
  'dropout': 0.5365498664367073,
  'units_2': 16,
  'units_3': 32},
 {'initial_units': 32,
  'add_dropout': True,
  'num_layers': 1,
  'learning_rate': 0.00578589138656092,
  'batch_size': 128,
  'units_1': 64,


In [46]:
model = tuner.get_best_models(num_models=1)[0]
y_pred = model.predict(X_test)
y_pred = y_pred.round(0)
y_pred = y_pred.reshape((3263,))
pred_sub = pd.DataFrame({'id':test['id'], 'target':y_pred})
pred_sub['target'] = pred_sub['target'].apply(lambda x: int(x))
pred_sub.to_csv('hp_tuned_nn_sub.csv', index=False)

### Tensorflow Embedding Neural Network

In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=1000,oov_token='<OOV>')

In [48]:
tokenizer.fit_on_texts(train['tokens'])

In [49]:
X_train = tokenizer.texts_to_sequences(train['tokens'])
X_val = tokenizer.texts_to_sequences(val['tokens'])
X_test = tokenizer.texts_to_sequences(test['tokens'])

In [50]:
X_train = pad_sequences(X_train, padding='post', truncating='post')
X_val = pad_sequences(X_val, len(X_train[0]), padding='post', truncating='post')
X_test = pad_sequences(X_test, len(X_train[0]), padding='post', truncating='post')

In [51]:
model = Sequential()
model.add(Embedding(1000, 32))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(.2))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))

In [52]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [53]:
model.fit(X_train, train['target'], epochs=200, validation_data=(X_val, val['target']), callbacks=[EarlyStopping('val_accuracy', patience=10), reduce_lr])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


<keras.callbacks.History at 0x213510d5978>

In [54]:
y_pred = model.predict(X_test)
y_pred = y_pred.round(0)
y_pred = y_pred.reshape((3263,))

In [55]:
pred_sub = pd.DataFrame({'id':test['id'], 'target':y_pred})
pred_sub['target'] = pred_sub['target'].apply(lambda x: int(x))
pred_sub.to_csv('embedding_nn_sub.csv', index=False)