In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import layers, Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GRU
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
raw_train=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
raw_test=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [3]:
df=raw_train.copy()
dftest=raw_test.copy()

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
dftest.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [6]:
df.shape

(7613, 5)

In [7]:
dftest.shape

(3263, 4)

In [8]:
df.drop(['id', 'location', 'keyword'], axis=1, inplace=True)
dftest.drop(['id', 'location', 'keyword'], axis=1, inplace=True)

In [9]:
df.head(2)

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1


In [10]:
df.isnull().sum()

text      0
target    0
dtype: int64

In [11]:
dftest.isna().sum()

text    0
dtype: int64

In [12]:
df.duplicated().sum()

92

In [13]:
df.drop_duplicates(inplace=True)

In [14]:
df['target'].value_counts()

target
0    4315
1    3206
Name: count, dtype: int64

The dataset can be considered balanced

In [17]:
def preprocess(text):
    text=re.sub('http:\S+|@\S+|www.\S+|https\S+', '', text)
    text=re.sub('[^\w\s]', '', text)
    return text

In [18]:
df['text']=[preprocess(text) for text in df['text']]

In [19]:
for text in df['text'][:10]:
    print(text)

Our Deeds are the Reason of this earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask Canada
All residents asked to shelter in place are being notified by officers No other evacuation or shelter in place orders are expected
13000 people receive wildfires evacuation orders in California 
Just got sent this photo from Ruby Alaska as smoke from wildfires pours into a school 
RockyFire Update  California Hwy 20 closed in both directions due to Lake County fire  CAfire wildfires
flood disaster Heavy rain causes flash flooding of streets in Manitou Colorado Springs areas
Im on top of the hill and I can see a fire in the woods
Theres an emergency evacuation happening now in the building across the street
Im afraid that the tornado is coming to our area


In [20]:
y=df['target']
x=df['text']

In [21]:
x_train, x_test, y_train, y_test=train_test_split(x,y, test_size=0.2, random_state=0)

In [22]:
tok=Tokenizer()
tok.fit_on_texts(x_train)

In [23]:
train_seq=tok.texts_to_sequences(x_train)
train_seq=pad_sequences(train_seq, maxlen=20, padding='post', truncating='post')
test_seq=tok.texts_to_sequences(x_test)
test_seq=pad_sequences(test_seq, maxlen=20, padding='post', truncating='post')

In [24]:
len(tok.word_index)

13986

In [25]:
metrics=[
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.F1Score(name='f1_score')
]

In [26]:
model_lstm=Sequential([
    Embedding(input_dim=len(tok.word_index)+1, output_dim=30, input_length=20),
    Bidirectional(LSTM(32, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [28]:
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=metrics
)

In [29]:
rnn_lstm=model_lstm.fit(
    train_seq, y_train,
    epochs=100,
    batch_size=100,
    validation_split=0.2,
    callbacks=keras.callbacks.EarlyStopping(patience=5, verbose=1, restore_best_weights=True),
    verbose=1
)

Epoch 1/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 43ms/step - accuracy: 0.5557 - f1_score: 0.5950 - loss: 0.6857 - val_accuracy: 0.6271 - val_f1_score: 0.5943 - val_loss: 0.6318
Epoch 2/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.7691 - f1_score: 0.6003 - loss: 0.5249 - val_accuracy: 0.7708 - val_f1_score: 0.5943 - val_loss: 0.4841
Epoch 3/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9064 - f1_score: 0.5817 - loss: 0.2787 - val_accuracy: 0.7201 - val_f1_score: 0.5943 - val_loss: 0.6655
Epoch 4/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9502 - f1_score: 0.5941 - loss: 0.1560 - val_accuracy: 0.7409 - val_f1_score: 0.5943 - val_loss: 0.6988
Epoch 5/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9673 - f1_score: 0.5964 - loss: 0.1179 - val_accuracy: 0.7342 - val_f1_score: 0.5943

In [30]:
model_lstm.evaluate(test_seq, y_test, batch_size=100)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7870 - f1_score: 0.6143 - loss: 0.4703


[0.4783758819103241, 0.775415301322937, 0.6122636795043945]

In [31]:
model_gru=Sequential([
    Embedding(input_dim=len(tok.word_index)+1, output_dim=30, input_length=20),
    Bidirectional(GRU(32, return_sequences=True)),
    Bidirectional(GRU(32)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [32]:
model_gru.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=metrics
)

In [33]:
rnn_gru=model_gru.fit(
    train_seq, y_train,
    epochs=100,
    batch_size=100,
    validation_split=0.2,
    callbacks=keras.callbacks.EarlyStopping(patience=5, verbose=1, restore_best_weights=True),
    verbose=1
)

Epoch 1/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 49ms/step - accuracy: 0.6555 - f1_score: 0.5991 - loss: 0.6877 - val_accuracy: 0.5772 - val_f1_score: 0.5943 - val_loss: 0.6756
Epoch 2/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.6482 - f1_score: 0.5949 - loss: 0.6336 - val_accuracy: 0.7924 - val_f1_score: 0.5943 - val_loss: 0.4779
Epoch 3/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8812 - f1_score: 0.5888 - loss: 0.3189 - val_accuracy: 0.7708 - val_f1_score: 0.5943 - val_loss: 0.5348
Epoch 4/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.9278 - f1_score: 0.5979 - loss: 0.2088 - val_accuracy: 0.7500 - val_f1_score: 0.5943 - val_loss: 0.5856
Epoch 5/100
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9587 - f1_score: 0.5923 - loss: 0.1475 - val_accuracy: 0.7558 - val_f1_score: 0.594

In [34]:
model_gru.evaluate(test_seq, y_test, batch_size=100)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7733 - f1_score: 0.6143 - loss: 0.4864


[0.5000054836273193, 0.764784038066864, 0.6122636795043945]

In [50]:
dftest['text']=dftest['text'].apply(preprocess)

In [51]:
x=dftest['text']

In [52]:
test_seq=tok.texts_to_sequences(x)
test_seq=pad_sequences(test_seq, maxlen=20, truncating='post', padding='post')

In [53]:
pred=model_gru.predict(test_seq)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step


In [54]:
pred=pd.DataFrame(pred, columns=['target'])
pred['target']=pred['target'].apply(lambda x: 1 if x>0.5 else 0)
pred

Unnamed: 0,target
0,1
1,0
2,1
3,1
4,1
...,...
3258,1
3259,0
3260,1
3261,1


In [48]:
sub=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [55]:
sub

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [56]:
sub['target']=pred['target']
sub

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,0
3260,10868,1
3261,10874,1


In [57]:
sub.to_csv('submission.csv', mode='w', index=False)