This notebook explains pre-processing and modeling of data once the EDA part is completed.

### Importing libraries

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from abbreviation import abbreviations

### Read Train test data

In [9]:
tweet = pd.read_csv("data/train.csv")    
test = pd.read_csv("data/test.csv")
tweet.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


### Data Cleaning

As we know,twitter tweets always have to be cleaned before we go into modelling part.<br>
So we will do some basic cleaning such as <br>
(1) spelling correction,<br>
(2) removing punctuations,<br>
(3) removing html tags and emojis etc.

In [10]:
# Concatenate train test data for cleaning.
df=pd.concat([tweet,test])
print(df.shape)

(10876, 5)


### Removing urls

In [11]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

df['text']=df['text'].apply(lambda x : remove_URL(x))

### Removing HTML tags 

In [12]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

df['text']=df['text'].apply(lambda x : remove_html(x))

### Removing Emojis

In [13]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

df['text']=df['text'].apply(lambda x: remove_emoji(x))

### Removing punctuations 

In [14]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)

df['text']=df['text'].apply(lambda x : remove_punct(x))

### Convert abbrevations to its meaningful text 

In [15]:
def convert_abbrev(word):
    return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word

def convert_abbrev_in_text(text):
    tokens = word_tokenize(text)
    tokens = [convert_abbrev(word) for word in tokens]
    text = ' '.join(tokens)
    return text

df['text'] = df["text"].apply(lambda x: convert_abbrev_in_text(x))

###  GloVe for Vectorization

(1) Golve is count based model.<br>
(2) Learns their vectors by doing dimensionality reduction on the co-occurence of count matrix.<br>
(3) Can be implemented efficiently on large amount of data.

In [16]:
def create_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop))]
        corpus.append(words)
    return corpus

corpus=create_corpus(df)

100%|██████████████████████████████████████████████████████████████████████████| 10876/10876 [00:03<00:00, 2731.03it/s]


Here we will use pretrained glove model to represent our words

In [17]:
embedding_dict={}

with open('data/glove.twitter.27B.200d.txt','r',encoding="utf8") as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

In [18]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [19]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 20276


In [20]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,200))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|████████████████████████████████████████████████████████████████████████| 20276/20276 [00:00<00:00, 163819.68it/s]


### Defining model structure

In [21]:
model=Sequential()

embedding=Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=6e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 200)           4055400   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 200)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               120400    
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 4,175,901
Trainable params: 120,501
Non-trainable params: 4,055,400
_________________________________________________________________


### Separate train test data to process further

In [22]:
train=tweet_pad[:tweet.shape[0]]
test=tweet_pad[tweet.shape[0]:]

### Train test split

In [23]:
X_train,X_test,y_train,y_test=train_test_split(train,tweet['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


### Fitting model

In [24]:
history=model.fit(X_train,y_train,batch_size=32,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 6471 samples, validate on 1142 samples
Epoch 1/15
6471/6471 - 28s - loss: 0.6679 - acc: 0.5943 - val_loss: 0.5045 - val_acc: 0.7671
Epoch 2/15
6471/6471 - 23s - loss: 0.5438 - acc: 0.7503 - val_loss: 0.4796 - val_acc: 0.7942
Epoch 3/15
6471/6471 - 23s - loss: 0.5258 - acc: 0.7588 - val_loss: 0.4710 - val_acc: 0.7968
Epoch 4/15
6471/6471 - 24s - loss: 0.5155 - acc: 0.7673 - val_loss: 0.4544 - val_acc: 0.8030
Epoch 5/15
6471/6471 - 24s - loss: 0.5027 - acc: 0.7790 - val_loss: 0.4532 - val_acc: 0.8074
Epoch 6/15
6471/6471 - 24s - loss: 0.5044 - acc: 0.7792 - val_loss: 0.4512 - val_acc: 0.8109
Epoch 7/15
6471/6471 - 24s - loss: 0.4971 - acc: 0.7801 - val_loss: 0.4506 - val_acc: 0.8074
Epoch 8/15
6471/6471 - 24s - loss: 0.4897 - acc: 0.7901 - val_loss: 0.4481 - val_acc: 0.8056
Epoch 9/15
6471/6471 - 24s - loss: 0.4984 - acc: 0.7816 - val_loss: 0.4444 - val_acc: 0.8117
Epoch 10/15
6471/6471 - 24s - loss: 0.4897 - acc: 0.7866 - val_loss: 0.4435 - val_acc: 0.8056
Epoch 11/15
6471/6471

### Make predictions on test data

In [25]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)

### Exporting results to csv

In [26]:
sample_sub=pd.read_csv('data/sample_submission.csv')
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('submission/v1.csv',index=False)

Hope this notebook was useful!!