# Kaggle Natural Language Processing Project: Disaster Tweets
## project link : https://www.kaggle.com/c/nlp-getting-started

In [422]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# version 1 module -> Sklearn
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [423]:
# version 2 module : keras & tensorflow
import tensorflow.keras as keras
import tensorflow
from keras.models import Sequential
from keras.layers import LSTM,SimpleRNN
from keras.layers import Dense, Embedding, Bidirectional, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [424]:
# Load the training tweets and preview the first rows.
TRAIN_CSV_PATH = "/kaggle/input/nlp-getting-started/train.csv"
train_all = pd.read_csv(TRAIN_CSV_PATH)
train_all.head()

In [425]:
# Per-column count of missing values in the training frame
# (author's note: NaN appears in keyword and location, not in id, text or target).
train_all_na = train_all.isnull().sum()
train_all_na

In [426]:
# Non-target example: text of the first row whose target is 0.
train_all.loc[train_all["target"] == 0, "text"].iloc[0]

In [427]:
# Target example: text of the first row whose target is 1.
train_all.loc[train_all["target"] == 1, "text"].iloc[0]

In [428]:
# Load the test tweets and preview the first rows.
TEST_CSV_PATH = "/kaggle/input/nlp-getting-started/test.csv"
test = pd.read_csv(TEST_CSV_PATH)
test.head()

In [429]:
# Per-column count of missing values in the test frame.
test_na = test.isnull().sum()
test_na

In [430]:
# Load the sample submission; its "target" column is overwritten with model
# predictions further down and the frame is then written out as the submission.
SAMPLE_SUBMISSION_PATH = "/kaggle/input/nlp-getting-started/sample_submission.csv"
sample_answer = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sample_answer

In [431]:
# Remove the columns the models do not use.
# errors="ignore" makes this cell idempotent: because it rebinds train_all/test,
# a second run would otherwise raise KeyError on the already-removed columns.
unused_columns = ['id', 'keyword', 'location']
train_all = train_all.drop(columns=unused_columns, errors="ignore")
test = test.drop(columns=unused_columns, errors="ignore")

In [432]:
# Labels and raw tweet texts as 1-D NumPy arrays.
# The text column is selected by name: the previous "drop target, then reshape
# to (len,)" form only worked while exactly one other column remained in
# train_all and failed with a reshape error otherwise.
y_train = train_all['target'].values
x_train = train_all['text'].values
x_test = test['text'].values

In [433]:
# Stack train and test texts into one array; later cells fit the tokenizer on it.
total_tweets = np.concatenate([x_train, x_test], axis=0)
print('Total tweets : ', total_tweets.shape[0])

In [434]:
# Build the word index from every tweet (train and test together).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(total_tweets)

# Get the total vocabulary size: number of indexed words plus one
# (presumably because the Keras Tokenizer never assigns index 0 to a word).
vocab_size = 1 + len(tokenizer.word_index)
print('Size of Vocabulary : ', vocab_size)

In [435]:
# Maximum length for padding sequence, measured in tokenizer tokens.
# Counting whitespace-separated words under-counts tweets whose words the
# tokenizer splits further (its default filters treat punctuation as
# separators, so e.g. a URL becomes several tokens); pad_sequences would then
# truncate those tweets to the too-small maxlen.
sequence_lengths = [len(seq) for seq in tokenizer.texts_to_sequences(total_tweets)]
maxlen = max(sequence_lengths)
print('Maximum length of tweet : ', maxlen)

In [436]:
# Convert each tweet into a list of integer word indices using the fitted tokenizer.
X_train_token, X_test_token = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

# Show the first training tweet before and after tokenization.
print(
    'Text before tokenized',
    x_train[0],
    '\nText after tokenized',
    X_train_token[0],
    sep='\n',
)

In [437]:
# Pad every token sequence at the end ('post') up to maxlen.
pad_options = dict(maxlen=maxlen, padding='post')
X_train_pad = pad_sequences(X_train_token, **pad_options)
X_test_pad = pad_sequences(X_test_token, **pad_options)

# Show the first training sequence before and after padding.
print(
    'Tokenized text before padding',
    X_train_token[0],
    '\nTokenized text after padding',
    X_train_pad[0],
    sep='\n',
)

In [438]:
# Layer sizes; the two later model cells reuse these names.
embed_units = 100
hidden_units = 128

# Baseline classifier: embedding -> SimpleRNN -> dense head with a sigmoid output.
model = Sequential([
    Embedding(vocab_size, embed_units, input_length=maxlen),
    SimpleRNN(hidden_units),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [439]:
learning_rate = 1e-4

# Pass an Adam instance so the learning rate declared above is actually used;
# the string 'adam' ignored it and trained with Keras' default rate.
# NOTE(review): this changes training dynamics compared with earlier runs.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(learning_rate=learning_rate),
              metrics=['accuracy'])

In [440]:
# Stop once the monitored validation loss (EarlyStopping's default) has not
# improved for 5 epochs. restore_best_weights=True rolls the model back to its
# best epoch, so the prediction cell does not use the later, worse weights.
# es_cb is also referenced by the later fit cells.
es_cb = keras.callbacks.EarlyStopping(patience=5,
                                      verbose=1,
                                      restore_best_weights=True)

# Train the SimpleRNN model; validation_split=0.3 makes Keras hold out part of
# the training arrays for validation.
model_history = model.fit(X_train_pad, y_train,
                          batch_size=64,
                          epochs=15,
                          validation_split=0.3,
                          callbacks=[es_cb])

In [441]:
# Sigmoid outputs of the SimpleRNN model for every padded test tweet.
pred = model.predict(x=X_test_pad, verbose=1)

In [442]:
# Turn the sigmoid outputs into hard 0/1 labels in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1 (same rule as before).
# ravel() flattens the (n_samples, 1) prediction array so a plain 1-D column
# is assigned, instead of relying on pandas to coerce a 2-D array.
probabilities = np.asarray(pred).ravel()
sample_answer["target"] = np.where(probabilities <= 0.5, 0, 1)

In [443]:
# Write the SimpleRNN predictions to disk.
# NOTE: the later model sections write to the same "submission.csv" and overwrite this file.
sample_answer.to_csv(path_or_buf="submission.csv", index=False)

# Bidirectional SimpleRNN Model

In [444]:
# Same architecture as the baseline, but the SimpleRNN is wrapped in
# Bidirectional so the sequence is read in both directions.
model_BRNN = Sequential([
    Embedding(vocab_size, embed_units, input_length=maxlen),
    Bidirectional(SimpleRNN(hidden_units)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model_BRNN.summary()

In [445]:
learning_rate = 1e-4

# Pass an Adam instance so the learning rate declared above is actually used;
# the string 'adam' ignored it and trained with Keras' default rate.
# NOTE(review): this changes training dynamics compared with earlier runs.
model_BRNN.compile(loss='binary_crossentropy',
                   optimizer=Adam(learning_rate=learning_rate),
                   metrics=['accuracy'])

In [446]:
# Dedicated early-stopping callback for this model, so the cell does not depend
# on a callback object created for a different model. restore_best_weights=True
# rolls the model back to its best epoch before it is used for prediction.
es_cb_brnn = keras.callbacks.EarlyStopping(patience=5,
                                           verbose=1,
                                           restore_best_weights=True)

# Train the bidirectional SimpleRNN; validation_split=0.2 makes Keras hold out
# part of the training arrays for validation.
model_BRNN_history = model_BRNN.fit(X_train_pad, y_train,
                                    batch_size=128,
                                    epochs=15,
                                    validation_split=0.2,
                                    callbacks=[es_cb_brnn])

In [447]:
# Sigmoid outputs of the bidirectional SimpleRNN for every padded test tweet.
pred = model_BRNN.predict(x=X_test_pad, verbose=1)

In [448]:
# Turn the sigmoid outputs into hard 0/1 labels in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1 (same rule as before).
# ravel() flattens the (n_samples, 1) prediction array so a plain 1-D column
# is assigned, instead of relying on pandas to coerce a 2-D array.
probabilities = np.asarray(pred).ravel()
sample_answer["target"] = np.where(probabilities <= 0.5, 0, 1)

In [449]:
# Write the bidirectional SimpleRNN predictions to disk.
# NOTE: this replaces the "submission.csv" written by the SimpleRNN section,
# and the Bidirectional LSTM section overwrites it again.
sample_answer.to_csv(path_or_buf="submission.csv", index=False)

# Bidirectional LSTM

In [450]:
# Same layout as the previous models, with a bidirectional LSTM as the
# recurrent layer.
model_BLS = Sequential([
    Embedding(vocab_size, embed_units, input_length=maxlen),
    Bidirectional(LSTM(hidden_units)),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model_BLS.summary()

In [451]:
learning_rate = 1e-4

# Pass an Adam instance so the learning rate declared above is actually used;
# the string 'adam' ignored it and trained with Keras' default rate.
# NOTE(review): this changes training dynamics compared with earlier runs.
model_BLS.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

In [452]:
# Dedicated early-stopping callback for this model, so the cell does not depend
# on a callback object created for a different model. restore_best_weights=True
# rolls the model back to its best epoch before it is used for prediction.
es_cb_bls = keras.callbacks.EarlyStopping(patience=5,
                                          verbose=1,
                                          restore_best_weights=True)

# Train the bidirectional LSTM; validation_split=0.2 makes Keras hold out part
# of the training arrays for validation.
model_BLS_history = model_BLS.fit(X_train_pad, y_train,
                                  batch_size=256,
                                  epochs=15,
                                  validation_split=0.2,
                                  callbacks=[es_cb_bls])

In [453]:
# Sigmoid outputs of the bidirectional LSTM for every padded test tweet.
pred = model_BLS.predict(x=X_test_pad, verbose=1)

In [454]:
# Turn the sigmoid outputs into hard 0/1 labels in one vectorized step:
# values <= 0.5 become 0, everything else becomes 1 (same rule as before).
# ravel() flattens the (n_samples, 1) prediction array so a plain 1-D column
# is assigned, instead of relying on pandas to coerce a 2-D array.
probabilities = np.asarray(pred).ravel()
sample_answer["target"] = np.where(probabilities <= 0.5, 0, 1)

In [455]:
# Write the bidirectional LSTM predictions to disk; this replaces the
# "submission.csv" files written by the two earlier model sections.
sample_answer.to_csv(path_or_buf="submission.csv", index=False)