In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras import layers
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [84]:
# Load the labelled training set and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='csv/train.csv')
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [85]:
# Ids listed as carrying an erroneous label; every matching row gets target 0.
ids_with_target_error = [
    328, 443, 513, 2619, 3640, 3900, 4342, 5781,
    6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]
train.loc[
    train['id'].isin(ids_with_target_error),
    'target',
] = 0

In [86]:
# Load the unlabelled test set and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='csv/test.csv')
test.head(n=5)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# BERT

In [87]:
# Number of columns per BERT embedding row; used for the reshape of the
# loaded embeddings and as the input width of the network.
len_embedding = 768

In [6]:
# Read the saved BERT embeddings for the training set and shape them as
# (rows of `train`, len_embedding); the shape is displayed as a sanity check.
bert_train = np.reshape(
    np.loadtxt("train_embedding_file_bert.txt"),
    (len(train.index), len_embedding),
)
bert_train.shape

(7613, 768)

In [7]:
# Read the saved BERT embeddings for the test set and shape them as
# (rows of `test`, len_embedding); the shape is displayed as a sanity check.
bert_test = np.reshape(
    np.loadtxt("test_embedding_file_bert.txt"),
    (len(test.index), len_embedding),
)
bert_test.shape

(3263, 768)

In [88]:
# Features are the BERT embeddings, labels the (corrected) target column.
X = bert_train
y = train['target']

# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=13,
)

In [89]:
# Feed-forward binary classifier over the BERT embeddings:
# len_embedding -> 384 -> 192 -> 1 (sigmoid), with 25% dropout after each
# hidden layer.
model = Sequential()
# Only the first layer declares the input width.
model.add(layers.Dense(384, input_dim=len_embedding, activation='relu'))
model.add(Dropout(0.25))
# No input_dim here: this layer is fed the 384 activations of the previous
# layer, so Keras infers its input size (an input_dim on a non-first layer
# is ignored and only misleads the reader).
model.add(layers.Dense(192, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [90]:
# Adam with a small learning rate; binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_21 (Dense)             (None, 384)               295296    
_________________________________________________________________
dropout_14 (Dropout)         (None, 384)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 192)               73920     
_________________________________________________________________
dropout_15 (Dropout)         (None, 192)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 193       
Total params: 369,409
Trainable params: 369,409
Non-trainable params: 0
_________________________________________________________________


In [91]:
# Train for 10 epochs in mini-batches of 10, using the held-out split as
# validation data; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=10,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [92]:
# Score the BERT test embeddings with the trained network.
test_pred = model.predict(x=bert_test)
# Rounded, integer-typed copy of the scores.
test_pred_int = test_pred.round(decimals=0).astype('int')

In [93]:
# Attach the raw scores as column 'bert_net', then narrow `test` down to
# that single column and preview it.
test['bert_net'] = test_pred
test = test['bert_net'].to_frame(name='bert_net')
test.head(n=5)

Unnamed: 0,bert_net
0,0.776438
1,0.984704
2,0.429647
3,0.978157
4,0.943807


In [94]:
# Write the BERT-network test scores to disk (the DataFrame index is written
# as well, since `index` is left at its default).
test.to_csv('csv/test_red_bert_con_fix.csv')

In [95]:
# Score every row of the BERT training embeddings with the trained network.
train_pred = model.predict(x=bert_train)
# Rounded, integer-typed copy of the scores.
train_pred_int = train_pred.round(decimals=0).astype('int')

In [96]:
# Attach the raw scores as column 'bert_net', narrow `train` down to that
# single column, and write it to disk.
train['bert_net'] = train_pred
train = train['bert_net'].to_frame(name='bert_net')
train.to_csv('csv/train_red_bert_con_fix.csv')

# ELMo

In [97]:
# Number of columns per ELMo embedding row; used for the reshape of the
# loaded embeddings and as the input width of the network.
len_embedding = 1024

In [18]:
# Read the saved ELMo embeddings for the training set and shape them as
# (rows of `train`, len_embedding); the shape is displayed as a sanity check.
elmo_train = np.reshape(
    np.loadtxt("train_embedding_file_elmo.txt"),
    (len(train.index), len_embedding),
)
elmo_train.shape

(7613, 1024)

In [19]:
# Read the saved ELMo embeddings for the test set and shape them as
# (rows of `test`, len_embedding); the shape is displayed as a sanity check.
elmo_test = np.reshape(
    np.loadtxt("test_embedding_file_elmo.txt"),
    (len(test.index), len_embedding),
)
elmo_test.shape

(3263, 1024)

In [98]:
# Reload the raw frames: `train` and `test` were narrowed to a single
# prediction column in the BERT section.
train = pd.read_csv(filepath_or_buffer='csv/train.csv')

# Re-apply the label correction: listed ids get target 0.
ids_with_target_error = [
    328, 443, 513, 2619, 3640, 3900, 4342, 5781,
    6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]
train.loc[
    train['id'].isin(ids_with_target_error),
    'target',
] = 0

test = pd.read_csv(filepath_or_buffer='csv/test.csv')

# ELMo embeddings as features; hold out 10% with a fixed random_state.
X = elmo_train
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=13,
)

In [99]:
# Feed-forward binary classifier over the ELMo embeddings:
# len_embedding -> 512 -> 256 -> 1 (sigmoid), with 25% dropout after each
# hidden layer.
model = Sequential()
# Only the first layer declares the input width.
model.add(layers.Dense(512, input_dim=len_embedding, activation='relu'))
model.add(Dropout(0.25))
# No input_dim here: this layer is fed the 512 activations of the previous
# layer, so Keras infers its input size (an input_dim on a non-first layer
# is ignored and only misleads the reader).
model.add(layers.Dense(256, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [100]:
# Adam with a small learning rate; binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_16 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_17 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 257       
Total params: 656,385
Trainable params: 656,385
Non-trainable params: 0
_________________________________________________________________


In [101]:
# Train for 10 epochs in mini-batches of 10, using the held-out split as
# validation data; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=10,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [102]:
# Score the ELMo test embeddings with the trained network.
test_pred = model.predict(x=elmo_test)
# Rounded, integer-typed copy of the scores.
test_pred_int = test_pred.round(decimals=0).astype('int')

In [103]:
# Attach the raw scores as column 'elmo_net', then narrow `test` down to
# that single column and preview it.
test['elmo_net'] = test_pred
test = test['elmo_net'].to_frame(name='elmo_net')
test.head(n=5)

Unnamed: 0,elmo_net
0,0.796359
1,0.912771
2,0.721194
3,0.925166
4,0.921997


In [104]:
# Write the ELMo-network test scores to disk (the DataFrame index is written
# as well, since `index` is left at its default).
test.to_csv('csv/test_red_elmo_con_fix.csv')

In [105]:
# Score every row of the ELMo training embeddings with the trained network.
train_pred = model.predict(x=elmo_train)
# Rounded, integer-typed copy of the scores.
train_pred_int = train_pred.round(decimals=0).astype('int')

In [106]:
# Attach the raw scores as column 'elmo_net', narrow `train` down to that
# single column, and write it to disk.
train['elmo_net'] = train_pred
train = train['elmo_net'].to_frame(name='elmo_net')
train.to_csv('csv/train_red_elmo_con_fix.csv')

# W2V

In [107]:
# Number of columns per word2vec embedding row; used for the reshape of the
# loaded embeddings and as the input width of the network.
len_embedding = 300

In [30]:
# Read the saved word2vec embeddings for the training set and shape them as
# (rows of `train`, len_embedding); the shape is displayed as a sanity check.
w2v_train = np.reshape(
    np.loadtxt("train_embedding_file_w2v(norm).txt"),
    (len(train.index), len_embedding),
)
w2v_train.shape

(7613, 300)

In [31]:
# Read the saved word2vec embeddings for the test set and shape them as
# (rows of `test`, len_embedding); the shape is displayed as a sanity check.
w2v_test = np.reshape(
    np.loadtxt("test_embedding_file_w2v(norm).txt"),
    (len(test.index), len_embedding),
)
w2v_test.shape

(3263, 300)

In [108]:
# Reload the raw frames: `train` and `test` were narrowed to a single
# prediction column in the ELMo section.
train = pd.read_csv(filepath_or_buffer='csv/train.csv')

# Re-apply the label correction: listed ids get target 0.
ids_with_target_error = [
    328, 443, 513, 2619, 3640, 3900, 4342, 5781,
    6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]
train.loc[
    train['id'].isin(ids_with_target_error),
    'target',
] = 0

test = pd.read_csv(filepath_or_buffer='csv/test.csv')

# word2vec embeddings as features; hold out 10% with a fixed random_state.
X = w2v_train
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=13,
)

In [109]:
# Feed-forward binary classifier over the word2vec embeddings:
# len_embedding -> 150 -> 75 -> 1 (sigmoid), with 25% dropout after each
# hidden layer.
model = Sequential()
# Only the first layer declares the input width.
model.add(layers.Dense(150, input_dim=len_embedding, activation='relu'))
model.add(Dropout(0.25))
# No input_dim here: this layer is fed the 150 activations of the previous
# layer, so Keras infers its input size (an input_dim on a non-first layer
# is ignored and only misleads the reader).
model.add(layers.Dense(75, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [110]:
# Adam with a small learning rate; binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 150)               45150     
_________________________________________________________________
dropout_18 (Dropout)         (None, 150)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 75)                11325     
_________________________________________________________________
dropout_19 (Dropout)         (None, 75)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 76        
Total params: 56,551
Trainable params: 56,551
Non-trainable params: 0
_________________________________________________________________


In [111]:
# Train for 10 epochs in mini-batches of 10, using the held-out split as
# validation data; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=10,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [112]:
# Score the word2vec test embeddings with the trained network.
test_pred = model.predict(x=w2v_test)
# Rounded, integer-typed copy of the scores.
test_pred_int = test_pred.round(decimals=0).astype('int')

In [113]:
# Attach the raw scores as column 'w2v_net', then narrow `test` down to
# that single column and preview it.
test['w2v_net'] = test_pred
test = test['w2v_net'].to_frame(name='w2v_net')
test.head(n=5)

Unnamed: 0,w2v_net
0,0.898196
1,0.878895
2,0.775255
3,0.928634
4,0.98856


In [114]:
# Write the word2vec-network test scores to disk (the DataFrame index is
# written as well, since `index` is left at its default).
test.to_csv('csv/test_red_w2v_con_fix.csv')

In [115]:
# Score every row of the word2vec training embeddings with the trained network.
train_pred = model.predict(x=w2v_train)
# Rounded, integer-typed copy of the scores.
train_pred_int = train_pred.round(decimals=0).astype('int')

In [116]:
# Attach the raw scores as column 'w2v_net', narrow `train` down to that
# single column, and write it to disk.
train['w2v_net'] = train_pred
train = train['w2v_net'].to_frame(name='w2v_net')
train.to_csv('csv/train_red_w2v_con_fix.csv')

# GloVe

In [117]:
# Number of columns per GloVe embedding row; used for the reshape of the
# loaded embeddings and as the input width of the network.
len_embedding = 200

In [42]:
# Read the saved GloVe embeddings for the training set and shape them as
# (rows of `train`, len_embedding); the shape is displayed as a sanity check.
glove_train = np.reshape(
    np.loadtxt("train_embedding_file_glove.txt"),
    (len(train.index), len_embedding),
)
glove_train.shape

(7613, 200)

In [43]:
# Read the saved GloVe embeddings for the test set and shape them as
# (rows of `test`, len_embedding); the shape is displayed as a sanity check.
glove_test = np.reshape(
    np.loadtxt("test_embedding_file_glove.txt"),
    (len(test.index), len_embedding),
)
glove_test.shape

(3263, 200)

In [118]:
# Reload the raw frames: `train` and `test` were narrowed to a single
# prediction column in the W2V section.
train = pd.read_csv(filepath_or_buffer='csv/train.csv')

# Re-apply the label correction: listed ids get target 0.
ids_with_target_error = [
    328, 443, 513, 2619, 3640, 3900, 4342, 5781,
    6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226,
]
train.loc[
    train['id'].isin(ids_with_target_error),
    'target',
] = 0

test = pd.read_csv(filepath_or_buffer='csv/test.csv')

# GloVe embeddings as features; hold out 10% with a fixed random_state.
X = glove_train
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=13,
)

In [119]:
# Feed-forward binary classifier over the GloVe embeddings:
# len_embedding -> 100 -> 50 -> 1 (sigmoid), with 25% dropout after each
# hidden layer.
model = Sequential()
# Only the first layer declares the input width.
model.add(layers.Dense(100, input_dim=len_embedding, activation='relu'))
model.add(Dropout(0.25))
# No input_dim here: this layer is fed the 100 activations of the previous
# layer, so Keras infers its input size (an input_dim on a non-first layer
# is ignored and only misleads the reader).
model.add(layers.Dense(50, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [120]:
# Adam with a small learning rate; binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Print the layer-by-layer summary of the compiled model.
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_30 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_20 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_31 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_21 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 1)                 51        
Total params: 25,201
Trainable params: 25,201
Non-trainable params: 0
_________________________________________________________________


In [121]:
# Train for 10 epochs in mini-batches of 10, using the held-out split as
# validation data; the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=10,
    verbose=True,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [122]:
# Score the GloVe test embeddings with the trained network.
test_pred = model.predict(x=glove_test)
# Rounded, integer-typed copy of the scores.
test_pred_int = test_pred.round(decimals=0).astype('int')

In [123]:
# Attach the raw scores as column 'glove_net', then narrow `test` down to
# that single column and preview it.
test['glove_net'] = test_pred
test = test['glove_net'].to_frame(name='glove_net')
test.head(n=5)

Unnamed: 0,glove_net
0,0.680662
1,0.663334
2,0.536962
3,0.864167
4,0.96814


In [124]:
# Write the GloVe-network test scores to disk (the DataFrame index is written
# as well, since `index` is left at its default).
test.to_csv('csv/test_red_glove_con_fix.csv')

In [125]:
# Score every row of the GloVe training embeddings with the trained network.
train_pred = model.predict(x=glove_train)
# Rounded, integer-typed copy of the scores.
train_pred_int = train_pred.round(decimals=0).astype('int')

In [126]:
# Attach the raw scores as column 'glove_net', narrow `train` down to that
# single column, and write it to disk.
train['glove_net'] = train_pred
train = train['glove_net'].to_frame(name='glove_net')
train.to_csv('csv/train_red_glove_con_fix.csv')

In [53]:
# Training accuracy recorded for the last epoch of the most recent fit.
# Index -1 (rather than a literal 9) keeps this correct when `epochs`
# changes: fewer epochs would raise IndexError, more would report a
# non-final epoch.
history.history['accuracy'][-1]

0.7898117303848267