# Import all needed libraries

**For math and schemes:**

In [97]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

**For data processing & model creation**

In [98]:
import nltk # for words filtration
from nltk.corpus import stopwords
import re # for string maniputations
from sklearn.model_selection import train_test_split
# to process data
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
# for creating models
from keras.models import Sequential
from keras.layers import Dense, Embedding,GRU, LSTM, RNN
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
# for beautiful representation
from wordcloud import WordCloud

# Load our data

And have a look at it:

In [99]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv'
)
train_data.head(5)

In [100]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_data = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv'
)
test_data.head(5)

# See if data is balanced

And as we can see from the histogram - our data is completely balanced.

In [101]:
# Count the rows of each target class once and reuse the result in the title.
target_counts = train_data['target'].value_counts()

# Pass the column through the `x` keyword: a positional Series is deprecated in
# seaborn 0.11 and is interpreted as `data` (not `x`) from seaborn 0.12 on.
sns.countplot(x=train_data['target'])
plt.title('Not about disaster:'+str(target_counts[0])+'\n'+
         'Real disaster:'+str(target_counts[1]))
plt.show()

**We also need to know how many words each tweet contains.**

In [102]:
def words_len(arr, text):
    """Plot how many texts in ``arr`` contain each number of words.

    Parameters
    ----------
    arr : iterable of str
        The texts to measure.
    text : str
        Title shown above the plot.
    """
    # split() without an argument collapses runs of whitespace and yields []
    # for an empty string, so an empty text counts as 0 words (split(' ')
    # reported 1) and repeated spaces no longer inflate the count.
    word_len = [len(sample.split()) for sample in arr]

    plt.figure(figsize=(12,6))
    # Keyword `x`: seaborn 0.12+ treats the first positional argument as `data`.
    sns.countplot(x=word_len)
    plt.xlabel("Lengths of words:")
    plt.ylabel('Len repeats:')
    plt.title(text)
    plt.show()

words_len(train_data['text'],"Train data_set")

# Clear data set
As we saw at the start, the data contains a lot of null values in the keyword and location fields.

We don't need those columns for prediction, so we drop them.

In [103]:
# Reassign rather than drop in place so the cell can be re-run safely:
# errors='ignore' makes the drop a no-op once the columns are already gone,
# where the in-place version raised KeyError on a second run.
train_data = train_data.drop(columns=['keyword','location'], errors='ignore')
test_data = test_data.drop(columns=['keyword','location'], errors='ignore')

**Now it's time to find the most popular words in the dataset:**

In [104]:
def show_words(arr):
    """Render a word cloud built from all texts in ``arr``.

    Parameters
    ----------
    arr : iterable of str
        The texts whose words are drawn in the cloud.
    """
    # Join with a space so the last word of one text and the first word of the
    # next stay separate; plain concatenation fused them into one bogus word.
    corpus = ' '.join(arr)

    wc = WordCloud(collocations = False, background_color = 'white').generate(corpus)
    plt.figure(figsize=(10,10))
    plt.imshow(wc, interpolation='bilinear')

    plt.axis("off")

    plt.show()

show_words(train_data['text'])

As we can see, some of these words are meaningless, like "a", "co", "u", "WH370", etc.

So the next step is to clean our data: delete all stopwords, words shorter than 4 characters, text in brackets, and so on.

We also convert all text to lowercase and delete everything that is not an English letter (including numbers).

In [105]:
# English stopwords from NLTK, held in a set for fast membership checks.
swords = {*stopwords.words('english')}

def clear_txt(text, stop_words=None, min_len=4):
    """Normalize one tweet down to its meaningful lowercase words.

    Steps: lowercase the text, strip URLs, strip ``{...}`` and ``(...)``
    groups, replace every non-letter with a space, then keep only the words
    that are not stopwords and have at least ``min_len`` characters.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``swords`` set.
    min_len : int, optional
        Minimum length of a word that is kept (default 4).

    Returns
    -------
    str
        The kept words joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = swords

    h_str = text.lower()
    # The scheme and its colon are part of the match; without the colon the
    # pattern only removed "//host/path" and left "http"/"https" behind as
    # ordinary 4+ letter words.
    h_str = re.sub(r'(?:https?:)?\/\/(\w|\.|\/|\?|\=|\&|\%)*\b','',h_str)

    # Each bracket pattern excludes its own closing character, so a match
    # stops at the first closer instead of swallowing text between groups.
    h_str = re.sub(r'\{[^}]*\}', '', h_str)
    h_str = re.sub(r'\([^)]*\)', '', h_str)

    h_str = re.sub('[^a-zA-Z]', ' ', h_str)

    kept = [w for w in h_str.split() if w not in stop_words and len(w) >= min_len]
    return " ".join(kept)

In [106]:
# Clean every training tweet, keeping the original row order.
clean_train = [clear_txt(tweet) for tweet in train_data['text']]

**Let's look at an example of our work**

In [107]:
# Compare the tweet labelled 7 before and after cleaning.
print("Before: ", train_data.loc[7, 'text'])
print("After: ", clean_train[7])

**And the word cloud also becomes much clearer and more meaningful**

In [108]:
# Word cloud of the cleaned training tweets.
show_words(arr=clean_train)

In [109]:
# Clean every test tweet with the same function used for the training set.
clean_test = [clear_txt(tweet) for tweet in test_data['text']]

# Final result

Only the words that matter to us are left in the data, and their counts per tweet are distributed like this:

In [110]:
# Word-count distributions of the cleaned training and test tweets.
words_len(arr=clean_train, text="Train data_set")
words_len(arr=clean_test, text="Test data_set")

# Prepare data for training

Divide the data into training and validation sets (4:1).

In [111]:
#models
X_train,X_valid,y_train,y_valid = train_test_split(clean_train, train_data['target'], test_size = 0.2, random_state = 40)
print(f"Train size: {len(X_train)}, {len(y_train)}")
print(f"Validation size: {len(X_valid)}, {len(y_valid)}")

# Next, let's create a vocabulary by tokenizing our tweets.
**As we can see from the previous histograms, the maximum number of words per tweet is 20.**

In [112]:
#as we see from plot:
max_len = 20

tokenizer=Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train=tokenizer.texts_to_sequences(X_train)
X_valid=tokenizer.texts_to_sequences(X_valid)

X_test=tokenizer.texts_to_sequences(clean_test)

X_train=pad_sequences(X_train,maxlen=max_len,padding='post')
X_valid=pad_sequences(X_valid,maxlen=max_len,padding='post')

X_test=pad_sequences(X_test,maxlen=max_len,padding='post')

voc = len(tokenizer.word_index) + 1
print("Vocabluary size is ", voc)

**Next we must one-hot encode our labels (real disaster vs. not a disaster)**

In [113]:
# One-hot encode the 0/1 targets into two columns for both splits.
y_train, y_valid = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_valid)
)

print(y_train.shape)
print(y_valid.shape)

# First model

We build the first model with one LSTM layer.

To prevent the perpetual increase in loss, we use the ReLU activation function in the hidden dense layer.
The output activation is sigmoid because we have a binary classification task.
**The same applies to the next models too.**

In [114]:
# Reset Keras' global state before building the first model.
K.clear_session()

# Embedding -> single LSTM -> dense head, declared as one layer list.
# NOTE(review): the two sigmoid outputs are trained with
# categorical_crossentropy; softmax is the conventional pairing for one-hot
# targets -- confirm this is intended.
model1 = Sequential([
    Embedding(voc, 100, input_length=max_len, trainable=True, mask_zero=True),
    LSTM(300, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])
model1.summary()

**Compile our model using Adam, the most popular and simple optimizer for RNNs.**

We also divide the data into batches of size 650. With a smaller value the loss grows too rapidly. A larger value is possible, but it is important not to cross the point where 30 epochs are no longer enough for full-fledged learning.

In [115]:
# Adam optimizer, categorical cross-entropy on the one-hot labels, accuracy
# as the tracked metric.
model1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

# Train for 30 epochs in batches of 650, validating after every epoch.
h1 = model1.fit(
    x=np.asarray(X_train),
    y=np.asarray(y_train),
    batch_size=650,
    epochs=30,
    validation_data=(np.asarray(X_valid), np.asarray(y_valid)),
)

# 1st result
On the plots we see that the validation loss is high, but that is normal (the last values are 1.1 and 1.97).

And the accuracy is between 0.75 and 0.8 (the last value is about 0.76).

In [116]:
# Model 1 loss per epoch: validation in red, training in green.
plt.plot(h1.history['val_loss'], color='r', label='val_loss')
plt.plot(h1.history['loss'], color='g', label='train_loss')
plt.legend()

In [117]:
# Model 1 accuracy per epoch: validation in blue, training in yellow.
plt.plot(h1.history['val_acc'], color='b', label='val_acc')
plt.plot(h1.history['acc'], color='y', label='train_acc')
plt.legend()

# Model 2
**We build second model - with 1 GRU layer.**

The hyperparameters are the same as before.

In [118]:
# Same architecture as model 1 with the LSTM swapped for a GRU.
# NOTE(review): sigmoid outputs with categorical_crossentropy -- softmax is
# the conventional pairing; confirm this is intended.
model2 = Sequential([
    Embedding(voc, 100, input_length=max_len, trainable=True, mask_zero=True),
    GRU(300, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])
model2.summary()

model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [119]:
# Train model 2 for 30 epochs in batches of 650, validating after every epoch.
h2 = model2.fit(
    x=np.asarray(X_train),
    y=np.asarray(y_train),
    batch_size=650,
    epochs=30,
    validation_data=(np.asarray(X_valid), np.asarray(y_valid)),
)

# Second result
On the plots we see that the validation loss is high, but it's smaller than for the 1st model (the last value is 1.3).

And the accuracy is between 0.75 and 0.8 (the last value is about 0.77).


In [120]:
# Model 2 loss per epoch: validation in red, training in green.
plt.plot(h2.history['val_loss'], color='r', label='val_loss')
plt.plot(h2.history['loss'], color='g', label='train_loss')
plt.legend()

In [121]:
# Model 2 accuracy per epoch: validation in blue, training in yellow.
plt.plot(h2.history['val_acc'], color='b', label='val_acc')
plt.plot(h2.history['acc'], color='y', label='train_acc')
plt.legend()

# 3rd model
**Let's try a multilayer LSTM,** because the LSTM gave the better result (the difference is small, but it is there).

Let's stack 3 LSTM layers with sizes 350, 150 and 50.
The batch size is now 200. We get greater losses, but the quality of learning should increase.

The other hyperparameters are the same.

In [125]:
# The LSTM looked best so far, so stack three of them (350 -> 150 -> 50).
# Every LSTM except the last returns the full sequence so that the next
# recurrent layer receives one vector per time step.
# NOTE(review): sigmoid outputs with categorical_crossentropy -- softmax is
# the conventional pairing; confirm this is intended.
model3 = Sequential([
    Embedding(voc, 100, input_length=max_len, trainable=True, mask_zero=True),
    LSTM(350, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    LSTM(150, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    LSTM(50, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])
model3.summary()

model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [126]:
# Train model 3 for 30 epochs in batches of 200, validating after every epoch.
h3 = model3.fit(
    x=np.asarray(X_train),
    y=np.asarray(y_train),
    batch_size=200,
    epochs=30,
    validation_data=(np.asarray(X_valid), np.asarray(y_valid)),
)

# 3rd result
On the plots we see that the validation loss is higher (the last value is 1.4).

And the accuracy is between 0.74 and 0.8 (the last value is about 0.77).

In [127]:
# Model 3 loss per epoch: validation in red, training in green.
plt.plot(h3.history['val_loss'], color='r', label='val_loss')
plt.plot(h3.history['loss'], color='g', label='train_loss')
plt.legend()

In [128]:
# Model 3 accuracy per epoch: validation in blue, training in yellow.
plt.plot(h3.history['val_acc'], color='b', label='val_acc')
plt.plot(h3.history['acc'], color='y', label='train_acc')
plt.legend()

# Last model
**Let's try a multilayer GRU,** just to compare it with the previous model.

Let's stack 3 GRU layers with sizes 350, 150 and 50.
The batch size is now 200. We get greater losses, but the quality of learning should increase.

The other hyperparameters are the same.

In [129]:
# Three stacked GRUs (350 -> 150 -> 50), mirroring the stacked-LSTM model.
# Every GRU except the last returns the full sequence for the next layer.
# NOTE(review): sigmoid outputs with categorical_crossentropy -- softmax is
# the conventional pairing; confirm this is intended.
model4 = Sequential([
    Embedding(voc, 100, input_length=max_len, trainable=True, mask_zero=True),
    GRU(350, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    GRU(150, dropout=0.1, recurrent_dropout=0.2, return_sequences=True),
    GRU(50, dropout=0.1, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dense(2, activation='sigmoid'),
])
model4.summary()

model4.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [130]:
# Train model 4 for 30 epochs in batches of 200, validating after every epoch.
h4 = model4.fit(
    x=np.asarray(X_train),
    y=np.asarray(y_train),
    batch_size=200,
    epochs=30,
    validation_data=(np.asarray(X_valid), np.asarray(y_valid)),
)

# 4th result
On the plots we see that the validation loss is higher (the last value is about 1.5).

And the accuracy is between 0.73 and 0.8 (the last value is about 0.74).

In [131]:
# Model 4 loss per epoch: validation in red, training in green.
plt.plot(h4.history['val_loss'], color='r', label='val_loss')
plt.plot(h4.history['loss'], color='g', label='train_loss')
plt.legend()

In [132]:
# Model 4 accuracy per epoch: validation in blue, training in yellow.
plt.plot(h4.history['val_acc'], color='b', label='val_acc')
plt.plot(h4.history['acc'], color='y', label='train_acc')
plt.legend()

# Final submission

**So, the best model is model 3...**
> (But from run to run, any of the models can win)

In [133]:
# Score the padded test tweets with model 3; each row holds one score per class.
predict_help = model3.predict(X_test)

# Pick the class with the higher score. The output layer has two independent
# sigmoid units, so the scores need not sum to 1: thresholding only column 0
# at 0.5 could label a row as class 0 even when class 1 scored higher
# (e.g. [0.6, 0.9]). argmax resolves ties to class 0.
prediction = np.argmax(predict_help, axis=1)
test_data['target'] = prediction

# Keep only the columns written to the submission file.
sub = test_data[['id', 'target']]
print("Start write into csv")
sub.to_csv('Submission.csv', index=False)
print("End write into csv")