# Solovev_NLP(lstm)

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow.keras import utils 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Embedding, Flatten, Activation
from tensorflow.keras.layers import Conv1D, SpatialDropout1D, MaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling1D,GlobalMaxPool1D
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Bidirectional
from tensorflow.keras.optimizers import Adam, RMSprop
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.sequence import pad_sequences

import string
import re
import nltk.data
from nltk.tokenize import word_tokenize
import seaborn as sns

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer, text_to_word_sequence 
from keras.layers import Input
from keras.layers import Dense
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers.merge import concatenate
from keras.layers import Embedding
from nltk.corpus import stopwords

from sklearn.metrics import (precision_score,recall_score,f1_score,classification_report,accuracy_score)
from sklearn.model_selection import train_test_split

In [2]:
# Read the three competition CSVs from the working directory: the training
# frame, the test frame and the submission template (in that order).
df_train, df_test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


# Preprocessing

In [3]:
# removing stop words
# Built once at definition time instead of on every call: the original asked
# NLTK for the word list per row and searched it as a list. A frozenset gives
# constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on single spaces and re-joined with single spaces, so
    other whitespace (tabs, newlines) is left inside the tokens untouched.

    Matching is case-insensitive: the NLTK list is lower-case, and this
    function runs before the text is lower-cased by ``clean_text``, so a
    case-sensitive comparison would let "The" or "I" slip through.

    Parameters
    ----------
    text : str
        Raw text to filter.

    Returns
    -------
    str
        The surviving tokens joined by a single space.
    """
    kept = [word for word in text.split(' ')
            if word.lower() not in _ENGLISH_STOPWORDS]
    return ' '.join(kept)
# Filter stop words out of the text column of both splits.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(remove_stopwords)

In [4]:
# (Cell intentionally left without code.)
# This cell used to repeat the previous cell verbatim: it re-defined
# remove_stopwords with the same body and applied it to df_train/df_test a
# second time. Filtering already-filtered text removes nothing further, so the
# repetition only cost time and shadowed the earlier definition.

In [5]:
# removing punct
def remove_punct(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are dropped, not replaced, so "don't" becomes "dont" and
    "a,b" becomes "ab".
    """
    # Mapping a code point to None tells str.translate to delete it.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

# Strip punctuation from the text column of both splits.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(remove_punct)

In [6]:
# Compiled once; the character class is the set of code points to delete.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # every supplementary-plane code point (emoticons,
                             # pictographs, transport symbols, flags, ...)
    "\u2500-\u2BEF"          # box drawing through misc. symbols, dingbats, arrows
    "\u24C2"                 # circled M
    "\u231a\u23cf\u23e9"     # watch, eject, fast-forward
    "\u200d"                 # zero-width joiner used inside emoji sequences
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030\u303d"           # wavy dash, part alternation mark
    "\u3297\u3299"           # circled ideograph congratulation / secret
    "]+"
)


def remove_emojis(data):
    """Return ``data`` with emoji and pictographic symbols removed.

    Matching runs of characters are deleted (nothing is inserted in their
    place). Ordinary letters of any script are preserved: the class lists the
    enclosed-character code points individually rather than as one range from
    U+24C2 to U+1F251, because such a range covers most of the Basic
    Multilingual Plane and would also delete CJK, kana and Hangul text.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        ``data`` without the matched symbols.
    """
    return _EMOJI_PATTERN.sub('', data)

# Drop emoji from the text column of both splits.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(remove_emojis)

In [7]:
def clean_text(text):
    """Lower-case ``text`` and strip markup-like noise from it.

    Steps, in order:

    1. lower-case everything;
    2. delete ``[...]`` bracketed spans;
    3. delete URLs (``http://``, ``https://`` or ``www.`` prefixes);
    4. delete ``<...>`` tags;
    5. delete every ``string.punctuation`` character;
    6. turn newlines into spaces, so the last word of one line is not glued
       to the first word of the next;
    7. delete any word that contains a digit.

    Deleted spans are not replaced, so runs of spaces can remain.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    text = text.lower()
    # Raw strings throughout: '\[', '\S', '\w' and '\d' are not valid string
    # escapes and trigger a warning when written in ordinary literals.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Normalise the text column of both splits with clean_text.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(clean_text)

In [8]:
# Rename location names: fold common free-text locations into coarse regions.
# The mapping is defined once and shared by both splits.
# NOTE(review): "Ireland" is mapped to 'UK' exactly as in the original cell;
# confirm that grouping is intended.
LOCATION_ALIASES = {
    'United States': 'USA',
    'New York': 'USA',
    "London": 'UK',
    "Los Angeles, CA": 'USA',
    "Washington, D.C.": 'USA',
    "California": 'USA',
    "Chicago, IL": 'USA',
    "Chicago": 'USA',
    "New York, NY": 'USA',
    "California, USA": 'USA',
    "FLorida": 'USA',
    "Nigeria": 'Africa',
    "Kenya": 'Africa',
    "Everywhere": 'Worldwide',
    "San Francisco": 'USA',
    "Florida": 'USA',
    "United Kingdom": 'UK',
    "Los Angeles": 'USA',
    "Toronto": 'Canada',
    "San Francisco, CA": 'USA',
    "NYC": 'USA',
    "Seattle": 'USA',
    "Earth": 'Worldwide',
    "Ireland": 'UK',
    "London, England": 'UK',
    "New York City": 'USA',
    "Texas": 'USA',
    "London, UK": 'UK',
    "Atlanta, GA": 'USA',
    "Mumbai": "India",
}

# Assign the result back to the column. Calling .replace(..., inplace=True) on
# df['location'] mutates an intermediate Series, which pandas warns about and
# which does not update the frame once Copy-on-Write is enabled.
for frame in (df_train, df_test):
    frame['location'] = frame['location'].replace(LOCATION_ALIASES)

In [9]:
def _compile_contraction(pattern):
    """Compile one rule; every ' also matches the typographic apostrophe."""
    return re.compile(pattern.replace("'", "['\u2019]"), re.IGNORECASE)


# Ordered rules: the first group rewrites whole irregular words, the second
# expands generic suffixes. Within each group the longer pattern comes first
# so that e.g. "won't've" is not pre-empted by "won't".
_CONTRACTION_RULES = tuple(
    (_compile_contraction(pattern), replacement)
    for pattern, replacement in (
        # Whole words: replaced in place, no extra leading space.
        (r"\bwon't've\b", "will not have"),
        (r"\bcan't've\b", "can not have"),
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "can not"),
        (r"\bdon't\b", "do not"),
        (r"\bma'am\b", "madam"),
        (r"\blet's\b", "let us"),
        (r"\bain't\b", "am not"),
        (r"\bshan't\b", "shall not"),
        (r"\bo'clock\b", "of the clock"),
        (r"\by'all\b", "you all"),
        # Suffixes: must follow a word character, so an opening quote such as
        # the one in 'really' is not mistaken for "'re".
        (r"(?<=\w)n't've\b", " not have"),
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'d've\b", " would have"),
        (r"(?<=\w)'ll've\b", " will have"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'t\b", " not"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'m\b", " am"),
    )
)


def decontraction(tweet):
    """Expand English contractions in ``tweet``.

    Examples: "won't" -> "will not", "they're" -> "they are",
    "he'd've" -> "he would have". Matching is case-insensitive, and the
    replacement text is always lower-case. Every "'s" is expanded to " is",
    so possessives are rewritten too.

    The function needs the apostrophes to still be present: text that has
    already been through ``remove_punct`` or ``clean_text`` contains none and
    is returned unchanged.

    Parameters
    ----------
    tweet : str

    Returns
    -------
    str
    """
    for pattern, replacement in _CONTRACTION_RULES:
        tweet = pattern.sub(replacement, tweet)
    return tweet
# Expand contractions in the text column of both splits.
# NOTE(review): remove_punct and clean_text were applied in earlier cells and
# both delete every string.punctuation character, apostrophes included, so
# there is nothing left here for decontraction to match. Consider moving this
# cell ahead of the punctuation-stripping cells.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(decontraction)

# Model: BERT encoder with a dense classification head

In [10]:
from bert import bert_tokenization

In [11]:
%%time
# Wrap the TF-Hub BERT module (uncased English, L-12 / H-768 / A-12 per the URL)
# as a Keras layer. The %%time magic above reports how long this cell takes.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
# trainable=True is passed through to the layer; presumably so the encoder
# weights are fine-tuned together with the dense head - confirm.
bert_layer = hub.KerasLayer(module_url, trainable=True)

Wall time: 5.78 s


In [12]:
# Build the tokenizer from the assets shipped with the loaded BERT layer, so
# the vocabulary and casing setting come from that same module.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [14]:
def pre_Process_data(texts, tokenizer, max_len=512):
    """Encode ``texts`` into the three fixed-length inputs used by the model.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    "[CLS]" ... "[SEP]", converted to ids and right-padded with 0 up to
    ``max_len``.

    Parameters
    ----------
    texts : iterable of str
    tokenizer : object
        Must provide ``tokenize(text)`` returning a list of tokens and
        ``convert_tokens_to_ids(tokens)`` returning a list of ints.
    max_len : int, default 512
        Length of every output row.

    Returns
    -------
    tuple of three numpy arrays
        ``(token_ids, masks, segment_ids)``. ``masks`` holds 1 for real tokens
        (including the two markers) and 0 for padding; ``segment_ids`` is all
        zeros.
    """
    body_budget = max_len - 2  # room left after [CLS] and [SEP]
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        pieces = ["[CLS]"] + tokenizer.tokenize(raw_text)[:body_budget] + ["[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(pieces)
        ids.extend([0] * n_pad)

        id_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [58]:
# Hyper-parameters read by the cells below.
max_len = 180      # length of every encoded sequence passed to the model
epochs = 10        # number of passes requested from model.fit
batch_size = 30    # examples per gradient step
lr = 1e-6          # Adam learning rate picked up by build_model

def build_model(bert_layer, max_len=512, learning_rate=None):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    The network takes three int32 inputs of length ``max_len`` (word ids,
    attention mask, segment ids), runs them through ``bert_layer`` and keeps
    the sequence output at position 0 (where ``pre_Process_data`` places the
    "[CLS]" token). That vector goes through
    Dense(256, relu) -> Dropout(0.4) -> Dense(128, relu) -> Dropout(0.4)
    -> Dense(1, sigmoid).

    Parameters
    ----------
    bert_layer : Keras layer
        Called with ``[word_ids, mask, segment_ids]``; must return a pair whose
        second element is the per-token sequence output.
    max_len : int, default 512
        Sequence length of the three inputs.
    learning_rate : float, optional
        Step size for Adam. When omitted, the notebook-level ``lr`` constant is
        used, which is what this function always read before the parameter
        existed.

    Returns
    -------
    Model
        Compiled with binary cross-entropy loss and the accuracy metric.
    """
    if learning_rate is None:
        learning_rate = lr

    input_word_id = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_id = Input(shape=(max_len,), dtype=tf.int32, name="segment_id")

    # The first output of the layer is not used; only the per-token sequence
    # output is kept, and only its first position.
    _, sequence_output = bert_layer([input_word_id, input_mask, segment_id])
    cls_vector = sequence_output[:, 0, :]

    hidden = Dense(units=256, activation='relu')(cls_vector)
    hidden = Dropout(0.4)(hidden)
    hidden = Dense(units=128, activation='relu')(hidden)
    hidden = Dropout(0.4)(hidden)
    out = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[input_word_id, input_mask, segment_id], outputs=out)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [59]:
# Encode the text column of each split into (token ids, masks, segment ids),
# then pull the training targets out as a plain array.
train_input, test_input = (
    pre_Process_data(frame.text.values, tokenizer, max_len=max_len)
    for frame in (df_train, df_test)
)
train_labels = df_train.target.values

In [1]:
model = build_model(bert_layer, max_len=max_len)

# Write model.h5 only when the monitored validation loss improves, so the
# file always holds the best epoch seen so far.
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

# 20% of the training arrays are held out by Keras for validation.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=epochs,
    callbacks=[checkpoint],
    batch_size=batch_size,
)

In [51]:
# Restore the weights written by the ModelCheckpoint callback during training.
model.load_weights('model.h5')
# Score the encoded test set with the restored model.
test_pred = model.predict(test_input)
# Round the sigmoid outputs to 0/1 labels.
# NOTE(review): the model ends in Dense(1), so test_pred is presumably shaped
# (n, 1); confirm pandas accepts that for a single column, or flatten it first.
submission['target'] = test_pred.round().astype(int)
# Write the filled-in template without the index column.
submission.to_csv("submission.csv", index=False)