In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN, Input, BatchNormalization, Flatten, Embedding, Dropout

from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [189]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [190]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [191]:
x_train = train.iloc[:,3].values
y_train = train.iloc[:,4].values

In [192]:
file = open('glove.6B.100d.txt', 'r', encoding = 'utf8')
content = file.readlines()
file.close()

# content

In [193]:
embeddings = {}

for line in content:
    line = line.split()
    embeddings[line[0]] = np.array(line[1:], dtype = float)

In [194]:
def get_maxlen(data):
    maxlen = 0
    for sent in data:
        maxlen = max(maxlen, len(sent))
    return maxlen

In [195]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
word2index = tokenizer.word_index

In [196]:
Xtokens = tokenizer.texts_to_sequences(x_train)

maxlen = get_maxlen(Xtokens)
print(maxlen)

x_train = pad_sequences(Xtokens, maxlen = maxlen,  padding = 'post', truncating = 'post')

33


In [197]:
import re
embed_size = 100
embedding_matrix = np.zeros((len(word2index)+1, embed_size))

for word, i in word2index.items():
    # Clean the word by removing specified characters using regex
    cleaned_word = re.sub("[!@#$()'1234567890/ûò]", '', word)

    # Proceed only if the cleaned word is in the embeddings
    if cleaned_word in embeddings:
        embed_vector = embeddings[cleaned_word]
        embedding_matrix[i] = embed_vector
        print(cleaned_word)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
chrissie
finn
mathew
sk
fco
tobias
ellwood
condemns
critters
angela
sanders
roundhouse
innovative
beluga
seaworld
mothe
compulsory
aerospace
exec
months
razak
isla
reunion
scmp
awash
abstract
xeni
organizing
package
tours
apologise
sincerely
grudges
baking
nlm
providers
towboat
trek
vtc
eulogies
tormented
matias
yahoo
delaying
fundamentals
valuations
trois
rivieres
normally
embrace
torrent
java
uses
dynastic
the
shepherd
vines
encaustic
light
rain
dressing
incessant
applied
bible
unloads
descriptive
indecency
differently
pt
slew
ing
soaked
pads
tampons
deluge
wrinkled
decayed
ren
shadowman
monstrosities
hough
bb
healing
deluge
translated
billionaire
incremental
caring
frustrated
asae
exhibitor
meditation
divine
blessing
canadian
timing
avoiding
downright
joseph
turner
shade
sul
telemarketing
colo
amazons
fiction
farms
ris
murphy
com
spam
tarp
outfield
infield
squabble
immigration
oup
valdes
testy
afterwards
savour
prolong

In [198]:
embedding_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.13482 ,  0.40224 , -0.42266 , ..., -0.27989 ,  0.28937 ,
         0.043783],
       [ 0.23182 , -0.35374 , -0.067178, ..., -0.34251 ,  0.87474 ,
         0.12929 ],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]])

In [199]:
x_train.shape, y_train.shape

((7613, 33), (7613,))

In [200]:
model = Sequential([
    Embedding(input_dim = len(word2index) + 1,
          output_dim = embed_size,
          input_length = maxlen,
          weights = [embedding_matrix],
          trainable = False
          ),
    Input(shape=x_train.shape),
    LSTM(units = 500, return_sequences = True),
    LSTM(units = 500),
    Dense(300, activation = 'relu'),
    Dense(2, activation = 'softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train, epochs=10, verbose=1)

Epoch 1/10
 52/238 [=====>........................] - ETA: 2:09 - loss: 0.6208 - accuracy: 0.6833

In [None]:
x_test = test.iloc[:,3].values

In [None]:
x_test = x_test.reshape(x_test.shape + (1,))

x_test.shape

In [None]:
x_test

In [None]:
for i in range(100):
  tweet = x_test[i]
  test_seq = tokenizer.texts_to_sequences(tweet)
  test = pad_sequences(test_seq, maxlen = maxlen, padding = 'post', truncating = 'post')

  prediction = model.predict(test)
  print(np.argmax(prediction), tweet)
  print(" ")