In [10]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [11]:
import numpy as np
import pandas as pd

In [12]:
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional



In [13]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('preprocessed_clean_train_set.csv')

In [16]:
df.head()

Unnamed: 0,keyword,location,text,target,total_words,char_count,sentence_count,avg_word_length,avg_sentence_lenght,tokenized_text,clean_text,clean_total_words,clean_char_count,clean_sentence_count,clean_avg_word_length,clean_avg_sentence_lenght
0,,,Our Deeds are the Reason of this #earthquake M...,1,13,57,1,4.384615,13.0,"['Our', 'Deeds', 'are', 'the', 'Reason', 'of',...",reason allah forgive,3,18,1,6.0,3.0
1,,,Forest fire near La Ronge Sask. Canada,1,7,32,2,4.571429,3.5,"['Forest', 'fire', 'near', 'La', 'Ronge', 'Sas...",forest fire near la range canada,6,27,1,4.5,6.0
2,,,All residents asked to 'shelter in place' are ...,1,22,112,2,5.090909,11.0,"['All', 'residents', 'asked', 'to', ""'"", 'shel...",resident ask shelter place officer evacuation ...,10,63,1,6.3,10.0
3,,,"13,000 people receive #wildfires evacuation or...",1,9,57,1,6.333333,9.0,"['13,000', 'people', 'receive', '#wildfires', ...",people evacuation order california,4,31,1,7.75,4.0
4,,,Just got sent this photo from Ruby #Alaska as ...,1,17,72,1,4.235294,17.0,"['Just', 'got', 'sent', 'this', 'photo', 'from...",get send photo smoke school,5,23,1,4.6,5.0


In [17]:
df = df.dropna(subset=['clean_text'])
X = df['clean_text']
y = df['target']

X_train,X_test,y_train, y_test = train_test_split(X,y,test_size=.2, random_state=40521)

In [18]:
tokenizer = Tokenizer(num_words = 1000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [19]:
sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences, padding="post", truncating='post')

In [20]:
sequences

[[19, 389],
 [1, 1, 741, 180],
 [365, 90, 504, 43, 151, 158, 11, 430, 109, 545, 135],
 [72],
 [50, 478, 1, 1, 1, 916, 1],
 [312, 70, 201, 9, 649, 1, 917, 30, 312, 70, 201, 9],
 [390, 63, 55, 190, 313, 842, 918, 390, 190],
 [53],
 [505, 15, 684, 575, 521, 75, 366, 450, 685, 686],
 [1, 479, 55, 221],
 [687, 19, 19, 51, 86],
 [789, 742],
 [33, 1, 480, 367, 688],
 [451, 201, 314, 3, 431, 128, 38],
 [110, 576, 1, 1, 250],
 [202, 506, 843, 315, 316],
 [76, 743],
 [235, 577, 1, 1, 191, 844, 235, 30],
 [278, 1],
 [11],
 [222, 140, 790, 3, 171, 578, 2, 3, 171, 129, 919],
 [90, 504, 43, 151, 158, 11, 430, 109, 545, 135],
 [744, 368, 650, 505, 651],
 [652, 522, 432, 615, 546, 1],
 [653, 1, 1, 265, 32, 340],
 [1, 689, 317, 294],
 [481, 654, 1, 181, 34, 1],
 [77, 745, 920],
 [122],
 [152, 655, 845, 655],
 [690, 1, 921, 1, 1, 1, 746, 16, 90, 1, 1, 921, 1],
 [91, 28, 295, 128, 38, 111],
 [192, 160, 78, 251, 172, 656, 41, 479, 27, 1, 656],
 [479, 507, 747, 1, 1, 747],
 [1, 53, 67, 846],
 [1, 296, 212,

In [21]:
word_index

{'<OOV>': 1,
 'like': 2,
 'fire': 3,
 'new': 4,
 'get': 5,
 'go': 6,
 'people': 7,
 'video': 8,
 'kill': 9,
 'crash': 10,
 'bomb': 11,
 'come': 12,
 'body': 13,
 'time': 14,
 'attack': 15,
 'year': 16,
 'emergency': 17,
 'man': 18,
 'want': 19,
 'burn': 20,
 'day': 21,
 'disaster': 22,
 'news': 23,
 'pm': 24,
 'know': 25,
 'family': 26,
 'look': 27,
 'home': 28,
 'storm': 29,
 'police': 30,
 'love': 31,
 'good': 32,
 'watch': 33,
 'work': 34,
 'flood': 35,
 'car': 36,
 'think': 37,
 'california': 38,
 'bag': 39,
 'building': 40,
 'scream': 41,
 'say': 42,
 'suicide': 43,
 'death': 44,
 'collapse': 45,
 'today': 46,
 'feel': 47,
 'let': 48,
 'life': 49,
 'train': 50,
 'world': 51,
 'wreck': 52,
 'need': 53,
 'low': 54,
 'drown': 55,
 'dead': 56,
 'help': 57,
 'destroy': 58,
 'way': 59,
 'nuclear': 60,
 'live': 61,
 'injury': 62,
 'fear': 63,
 'plan': 64,
 'war': 65,
 'blow': 66,
 'rescue': 67,
 'fall': 68,
 'find': 69,
 'wound': 70,
 'read': 71,
 'right': 72,
 'survive': 73,
 'accident'

In [22]:
padded

array([[ 19, 389,   0, ...,   0,   0,   0],
       [  1,   1, 741, ...,   0,   0,   0],
       [365,  90, 504, ...,   0,   0,   0],
       ...,
       [  1, 233,  57, ...,   0,   0,   0],
       [277,  61,   1, ...,   0,   0,   0],
       [ 43, 764,  73, ...,   0,   0,   0]])

In [23]:
padded[0]

array([ 19, 389,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0])

In [30]:
total_words = len(tokenizer.word_index)+1
max_sequence_len = max([len(x) for x in sequences])

In [31]:
model = Sequential()
model.add(Embedding(total_words,128, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [32]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [33]:
y_train[:]

5487    0
185     0
7478    1
816     0
2439    1
       ..
3530    1
51      0
3433    1
4443    1
6432    0
Name: target, Length: 5968, dtype: int64

In [34]:
xs = padded[:,:-1]

In [35]:
history = model.fit(xs,y_train,epochs=10,verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
xs

array([[ 19, 389,   0, ...,   0,   0,   0],
       [  1,   1, 741, ...,   0,   0,   0],
       [365,  90, 504, ...,   0,   0,   0],
       ...,
       [  1, 233,  57, ...,   0,   0,   0],
       [277,  61,   1, ...,   0,   0,   0],
       [ 43, 764,  73, ...,   0,   0,   0]])

In [37]:
max_sequence_len

17

In [24]:
xs[0][0]

NameError: name 'xs' is not defined

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
X2 = df['clean_text']
y2 = df['target']

X_train2,X_test2,y_train2, y_test2 = train_test_split(X2,y2,test_size=.2, random_state=40521)

In [40]:
tfdf = TfidfVectorizer(min_df = 10,max_df = .5, ngram_range=(1,2))
tf_vect = tfdf.fit_transform(X_train2)

In [41]:
X_train_r = pd.DataFrame(tfdf.transform(X_train2).todense(),columns=tfdf.get_feature_names())
X_test_r = pd.DataFrame(tfdf.transform(X_test2).todense(),columns=tfdf.get_feature_names())
y_train_r = y_train2.copy()
y_test_r = y_test2.copy()



In [47]:
model2 = Sequential()
model2.add(Dense(32,activation='relu'))
model2.add(Dense(16,activation='relu'))
model2.add(Dense(1,activation='sigmoid'))

model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [48]:
history2 = model2.fit(X_train_r,y_train_r,epochs=32, validation_data=[X_test_r,y_test_r])

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [44]:
y_train_r

5487    0
185     0
7478    1
816     0
2439    1
       ..
3530    1
51      0
3433    1
4443    1
6432    0
Name: target, Length: 5968, dtype: int64

In [45]:
X_train_r

Unnamed: 0,aba,aba woman,abandon,ablaze,absolutely,acc,acc news,accident,account,act,...,year ago,year atomic,year old,yes,york,young,yous,youth,youtube,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
