In [7]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import csv
import re
import spacy

In [8]:
nlp = spacy.load("en_core_web_sm")

## **Data Preprocessing**
### We will process the data using Pandas

In [9]:
import pandas as pd

In [10]:
df_train=pd.read_csv('train.csv')

In [11]:
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [12]:
# Keep only the tweet text; the double brackets return a one-column DataFrame.
train_text=df_train[['text']]
## We will convert the type of column to str
# astype() returns a new object, so the result has to be bound back to
# train_text (it was previously stored under the misspelled name `trian_text`
# and never used).
train_text=train_text.astype('str')

In [13]:
train_text.head(5)

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...


# **Data Cleaning**
## We implement below steps for data cleaning.
## Steps:
### Removals:
#### 1. Removing Punctuations
#### 2. Removal of Stop words
#### 3. Removal of Frequent words
#### 4. Removal of Rare words
#### 5. Removal of URLs
#### 6. Removal of HTML Tags
### Conversions:
#### 1. Lower case
#### 2. Lemmatization
#### 3. Spelling corrections
#### 4. Chat Words (BRB- Be right back)

### **Lower Casing**

In [14]:
## Now let's lower-case the text and assign it to another column in the dataframe.
## The Series `.str` accessor exposes vectorized string methods (such as `lower`) that are applied to every element of the column.
df_train['lower_text'] = df_train['text'].str.lower()

In [15]:
df_train.head(5)

Unnamed: 0,id,keyword,location,text,target,lower_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


## **Removal of Punctuations**
### We will remove all the special characters
### For this we will import string module and we will remove punctuations like
!"#$%&\'()*+,-./:;<=>?@[\\]^_{|}~`


In [16]:
import string

[https://www.programiz.com/python-programming/methods/string/maketrans]
## We will use `str.maketrans` and `str.translate`
### `maketrans` takes up to three parameters. The first two are the characters to search for and the characters to replace them with (both must have the same length).
### The third parameter lists characters that should be removed from the text altogether.
### e.g. search text: 'abc', replacement text: 'ghi', characters to remove: 'b'
### input text: 'abcdef'
### Without the third parameter, each character of 'abc' is replaced by the character at the same position in 'ghi', giving 'ghidef'. If we pass 'b' as the third parameter, 'b' is removed instead of being replaced, so only 'a' and 'c' are mapped (to 'g' and 'i').
### The result would be: 'gidef'

### If we specify empty for the first two paramaters then specify the third params with some letters or symbols to be removed then it will simply remove from the text.


In [17]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
## WE will create new column and perform the punctuations removal

## WE will create a function for the removal
## For that we will use `str.maketrans` and `str.translate`
def punct_removal(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # With two empty strings as the first arguments, the third argument of
    # maketrans is simply the set of characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)


In [19]:
###Lets take one sample
text='Hi !! How are you?? :-)'
punct_removal(text)

'Hi  How are you '

In [20]:
### Now we will create a new column and remove all the punctuations
df_train['remove_punct']=df_train['lower_text'].apply(lambda text:punct_removal(text) )

In [21]:
df_train[['lower_text','remove_punct']].tail(5)

Unnamed: 0,lower_text,remove_punct
7608,two giant cranes holding a bridge collapse int...,two giant cranes holding a bridge collapse int...
7609,@aria_ahrary @thetawniest the out of control w...,ariaahrary thetawniest the out of control wild...
7610,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,m194 0104 utc5km s of volcano hawaii httptcozd...
7611,police investigating after an e-bike collided ...,police investigating after an ebike collided w...
7612,the latest: more homes razed by northern calif...,the latest more homes razed by northern califo...


## Spell checker
### We will change the spelling of the words as it is human driven text not machines
### For this we are using SpellChecker from spellchecker library. we have to install pyspellchecker for usage

In [22]:
from spellchecker import SpellChecker

In [23]:
spell=SpellChecker()
## WE will define function 
def spell_checker(text):
    """Replace words unknown to the spell checker with their suggested correction.

    Args:
        text: Whitespace-separated string to correct.

    Returns:
        The words of ``text`` joined by single spaces. Each word reported by
        ``spell.unknown`` is replaced by ``spell.correction(word)``; a word is
        kept unchanged when no correction is available.
    """
    words = text.split()
    ## Take the text and find the unknown words or mis spelled words from the text
    mis_spelled = spell.unknown(words)
    # Remember corrections so a word repeated in the text is looked up only once.
    corrections = {}
    #List to store all the words
    correct_words = []
    ## Loop the word and if any word is present in mis spelled then replace that with correct word
    for word in words:
        if word in mis_spelled:
            if word not in corrections:
                # correction() can return None when it has no candidate (recent
                # pyspellchecker releases); fall back to the original word so
                # that ' '.join never receives None.
                corrections[word] = spell.correction(word) or word
            correct_words.append(corrections[word])
        else:
            correct_words.append(word)
    return ' '.join(correct_words)

In [27]:
text=u'cooool loooool'
spell_checker(text)

'cool loooool'

In [28]:
### Now we will create a column for that and apply the function
df_train['spell_check']= df_train['remove_punct'].apply(lambda text: spell_checker(text))

KeyboardInterrupt: 

In [None]:
df_train[['remove_punct', 'spell_check']].head(50)

### **Removal of Stop words**
#### We will remove the stop words such as 'a', 'the' which doesn't give any meaning to the sentence
#### This can be done using Spacy library

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
def remove_stopwords(text):
    """Drop every space-separated token of ``text`` that is a spaCy stop word."""
    kept = []
    for token in text.split(' '):
        if token in nlp.Defaults.stop_words:
            continue
        kept.append(token)
    return " ".join(kept)


In [None]:
remove_stopwords(u'Hello, i am a runner')

In [None]:
### Now we will create a column and apply stop_words function
df_train['stop_words']=df_train['spell_check'].apply(lambda text:remove_stopwords(text))

In [None]:
df_train[['text', 'stop_words']].tail()

## **Removal of frequent words**
### Frequent words can be removed to reduce the burden on model training and on the embedding
### We will check what are the frequent words and decide whether we can remove it from the corpus data
### For this we will import counter library from collections module.
#### This will keep track of frequency of each word and store it in dictionary

In [None]:
from collections import Counter

In [None]:
cnt=Counter()

In [None]:
text= u'Hello, i am vikas and my son name is also vikas and my father name is also vikas'
for word in text.split():
    #print(word)
    cnt[word] += 1

In [None]:
cnt.elements

### We will do the same for the column

In [None]:
# Start from an empty counter: `cnt` still holds the words of the demo sentence
# counted earlier, which would otherwise leak into the corpus frequencies (and
# re-running this cell would count every tweet twice).
cnt=Counter()
for sent in df_train['stop_words']:
    # Counter.update adds one count for each word in the iterable
    cnt.update(sent.split())

In [None]:
cnt.most_common(20)
### By This we can remove few: 'like', 'im', 'amp', 'new','2','got' which would reduce that to 1300 characters.
##indexes are 1,2,3,5,10,20

In [None]:
# Hand-pick the frequent words to drop from the 20 most common (word, count)
# pairs: the first three, then the entries at 0-based positions 4, 9 and 19.
most_common=cnt.most_common(20)[:3]
most_common.append(cnt.most_common(20)[4])
most_common.append(cnt.most_common(20)[9])
most_common.append(cnt.most_common(20)[19])
# NOTE(review): the positions are hard-coded, so the selected words change
# whenever the counts change — confirm against the most_common(20) output.
most_common

## Now we will convert the list of (word, count) tuples into a plain list of words, and then create a function that removes all of those words from the text


In [None]:
most_common_list=[word for (word, wordindx) in most_common]
most_common_list

In [None]:
def freq_words(text):
    """Return ``text`` without the words listed in ``most_common_list``."""
    survivors = []
    for token in text.split():
        if token not in most_common_list:
            survivors.append(token)
    return ' '.join(survivors)
        

In [None]:
text=u'im like scared 2 years ago'
freq_words(text)

In [None]:
df_train['freq_words']= df_train['stop_words'].apply(lambda text:freq_words(text))

In [None]:
df_train[['stop_words', 'freq_words']]

### **Removal of rare words**
#### we will remove rare words by taking elements from the collections counter

In [None]:
cnt.most_common()[-10:]

### As we can see there are many urls present in the text so before removing rare words we will remove urls from the text

## **Removal of URLs**
### We will remove the text which has http and https in the text
### This can be achieved with the re library by simply replacing each match with an empty string
### We will first compile a pattern that matches the http links and then substitute every match with an empty string

In [None]:
import re

In [None]:
def remove_urls(text):
    """Delete http://, https:// and www. links from ``text``."""
    ## Defining url pattern
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
text=u'https://www.google.com you can go here'
remove_urls(text)
#It is working

In [None]:
### Lets create a column and apply
df_train['remove_url']= df_train['freq_words'].apply(lambda text: remove_urls(text))

In [None]:
df_train[['freq_words','remove_url']]

### Now we will check the rare words from the text


In [None]:
cnt=Counter()
for sent in df_train['remove_url'].values:
    for word in sent.split():
        
        cnt[word]+= 1

In [None]:
cnt.most_common()[-100:]

## **There are still some http links which don't follow the actual URL format**
### For that we have created a custom url identification and removal of those links

In [None]:

def unformatted_urls(text):
  """Remove every fragment that starts with 'http' and tidy the spacing.

  A fragment runs from an occurrence of 'http' up to (not including) the next
  space, or to the end of the text when no space follows. Characters that
  precede 'http' inside the same word are kept.

  Args:
    text: String to clean (e.g. a tweet whose punctuation was already removed,
      so links look like 'httptcorln09wke9g').

  Returns:
    The text without those fragments, with leading/trailing spaces stripped
    and every run of spaces collapsed to a single space.
  """
  # One pass removes each 'http...' fragment; '[^ ]' stops only at a space,
  # matching the original find(' ') based scan.
  text = re.sub(r'http[^ ]*', '', text)
  text = text.strip(' ')
  # Collapse runs of any length. Successive fixed-width substitutions for
  # 5, 4, 3 and 2 spaces can still leave double spaces on very long runs.
  return re.sub(r' {2,}', ' ', text)

In [None]:
text=u'httptcorln09wke9g'
unformatted_urls(text)

In [None]:
### We will apply the same to the column.
### Here we dnt need to create another column as it is related to url we will implement in the same column
df_train['remove_url']= df_train['remove_url'].apply(lambda text:unformatted_urls(text))

### Now lets check the rare words

In [None]:
cnt=Counter()
for sent in df_train['remove_url']:
    for word in sent.split():
        cnt[word] += 1

In [None]:
cnt.most_common()[-10:]
## This is impressive!!

## **Spell Checking**
### We can have more spelling mistakes as it is human driven text not machine.
### We will use the SpellChecker library for Python to overcome this issue and get more accurate text.

In [None]:
from spellchecker import SpellChecker

## **Lemmatization**
### We will apply lemmatization to the text
### This can be done using spacy

In [None]:
lem=nlp(u'i am a runner who runs 5 km daily and ran for 2 years')
for word in lem:
    print(word.lemma_)
' '.join([word.lemma_ for word in lem])

### We will add the column and apply the same to the text 

In [None]:
def word_lemmatization(text):
    """Run ``text`` through the spaCy pipeline and return its lemmas joined by spaces."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
text=u'i am a runner who runs 5 km daily and ran for 2 years'
word_lemmatization(text)

### Now create a column and apply the same to the column

In [None]:
df_train['lemm_text']= df_train['remove_url'].apply(lambda text:word_lemmatization(text))

In [None]:
df_train[['remove_url','lemm_text']]

## **Removal of HTML Tags**
### We will remove all the HTML tags available in the text
### We will do this using the BeautifulSoup library from the bs4 package

In [None]:
from bs4 import BeautifulSoup

In [None]:
def remove_tags(text):
    """Parse ``text`` as HTML with the lxml parser and return only its text content."""
    soup = BeautifulSoup(text, 'lxml')
    return soup.text

In [None]:
text='<div> Hello </div>'
remove_tags(text)

### Now lets create  column and apply the remove tags

In [None]:
df_train.columns

In [None]:
df_train['remove_tags'] = df_train['lemm_text'].apply(lambda text: remove_tags(text))

In [None]:
df_train[['lemm_text', 'remove_tags']]

## **Spell Checking**
### We can have more spelling mistakes as it is human driven text not machine.
### We will use the SpellChecker library for Python to overcome this issue and get more accurate text.

In [None]:
from spellchecker import SpellChecker

In [None]:
# Scratch experiment: lemmatize the non-stop-words of a sample sentence one
# token at a time.
# NOTE(review): stop_words() is defined in a later cell (In [29]), so this cell
# fails on a fresh top-to-bottom run.
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")
text="I am a runner running in a race because I love to run since I ran today"
doc=stop_words(text)
#doc=['hi', 'ran','runner']
doc3=[]
for token in doc:
    # each token is a plain string here, so it is parsed on its own and the
    # lemma of its first spaCy token is kept
    doc1=nlp(token)
    #print(type(doc1[0].lemma_))
    doc3.append(doc1[0].lemma_)
    #print(type(token))
    #print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)
#x= [token.lemma_ for token in doc1]
#print(x)
#np.moveaxis(doc3,0,1).shape
doc3

In [None]:
# Compare the stop-word-filtered tokens with their lemmatized form.
# NOTE(review): stop_words() and words_lemmatization() are defined in a later
# cell (In [29]); that cell has to run before this one.
text="I am a runner running in a race because I love to run since I ran today"
print(words_lemmatization(stop_words(text)))
print(stop_words(text))

In [29]:
##Stop Words
def stop_words(text):
    """Split ``text`` on single spaces and return the tokens that are not spaCy stop words."""
    kept = []
    for token in text.split(' '):
        if token not in nlp.Defaults.stop_words:
            kept.append(token)
    return kept


##Lemmatization
def words_lemmatization(text):
    """Lemmatize a sequence of single-word strings.

    Args:
        text: Iterable of word strings (for example the list returned by
            ``stop_words``).

    Returns:
        List with the lemma of the first spaCy token of each word. Words that
        produce no tokens, such as the empty strings that ``split(' ')`` yields
        for empty text or repeated spaces, are skipped.
    """
    lemma_words=[]
    for word in text:
        ##There would be only one word in the 'word' not a list. so we won't loop instead we will directly take the lemmatise word.
        token=nlp(word)
        # An empty string parses to an empty Doc, and token[0] would raise
        # IndexError (e.g. for a tweet that is empty after URL removal).
        if len(token) == 0:
            continue
        lemma_words.append(token[0].lemma_)

    return lemma_words

def preprocessing_data(text):
  """Clean one tweet and return its lemmatized non-stop-word tokens.

  Steps, in order:
    1. Remove every fragment that starts with 'http', up to the next space or
       the end of the text.
    2. Strip leading/trailing spaces and collapse runs of spaces to one.
    3. Drop stop words with ``stop_words``.
    4. Lemmatize the remaining tokens with ``words_lemmatization``.

  Args:
    text: Raw tweet string.

  Returns:
    The value returned by ``words_lemmatization`` for the filtered tokens
    (a list of lemma strings).
  """
  # '[^ ]' stops only at a space, matching the original find(' ') based scan.
  text = re.sub(r'http[^ ]*', '', text)
  text = text.strip(' ')
  # Collapse runs of any length. Successive fixed-width substitutions for
  # 5, 4, 3 and 2 spaces can still leave double spaces on very long runs.
  text = re.sub(r' {2,}', ' ', text)
  ##Stop_words
  text= stop_words(text)
  text= words_lemmatization(text)

  return text

In [30]:
corpus_data,corpus_label=[],[]
# newline='' lets the csv module handle line breaks inside quoted fields itself,
# as the csv documentation asks for file objects passed to csv.reader.
with open('train.csv','r', errors='ignore', newline='') as train_csv:
  train_file= csv.reader(train_csv)
  # Skip the header row (id, keyword, location, text, target)
  next(train_file)

  for data in train_file:
    #print(str(data[3]))
    # Column 3 is the tweet text, column 4 the 0/1 target
    corpus_data.append(preprocessing_data(str(data[3])))
    corpus_label.append(int(data[4]))

    
    


KeyboardInterrupt: 

In [None]:
corpus_data[:5]

In [None]:
nlp.Defaults.stop_words

In [None]:
corpus_data[:2]

In [None]:
corpus_data[35:50]

In [None]:
# Each entry of corpus_data is a token list and the lists differ in length.
# dtype=object is required for such ragged input: recent NumPy versions raise
# ValueError instead of silently creating an object array.
corpus_data=np.array(corpus_data, dtype=object)
corpus_label=np.array(corpus_label)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Build the word index from the preprocessed corpus; 'OOV' is passed as the
# tokenizer's out-of-vocabulary token.
token=Tokenizer(oov_token='OOV')
token.fit_on_texts(corpus_data)
# Number of entries in the fitted word index
vocab_size=len(token.word_index)

In [None]:
#vocab_size=10000

In [None]:
len(token.word_index)

In [None]:
corpus_seq= token.texts_to_sequences(corpus_data)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
corpus_pad=pad_sequences(corpus_seq, padding='post', truncating='post')

In [None]:
corpus_pad.shape

In [None]:
# Positional split: the first 7000 rows are used for training, the remainder is
# held out (passed as validation_data to model.fit). No shuffling happens here.
spilt_data=7000
train_pad=corpus_pad[:spilt_data]
train_label=corpus_label[:spilt_data]
test_pad=corpus_pad[spilt_data:]
test_label= corpus_label[spilt_data:]

In [None]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense,Dropout,GlobalAveragePooling1D

In [None]:
vocab_size

In [None]:
keras.backend.clear_session()
model= keras.Sequential()
# input_dim is vocab_size+1 because index 0 is used for padding.
# input_length follows the padded width of corpus_pad instead of a hard-coded
# 27, so the model always matches what pad_sequences produced.
model.add(Embedding(input_dim= vocab_size+1, output_dim=100, input_length=corpus_pad.shape[1]))
model.add(LSTM(100, return_sequences=True))
#model.add(Dropout(rate=0.8))
#model.add(LSTM(32))
model.add(Dropout(rate=0.5))
# Average the per-timestep LSTM outputs into one vector per tweet
model.add(GlobalAveragePooling1D())
#model.add(Dense(256, activation=tf.nn.relu))
#model.add(Dense(32, activation=tf.nn.relu))
# Single sigmoid unit for the binary target
model.add(Dense(units=1, activation=tf.nn.sigmoid))
def scheduler(epochs, lr):
  """Return the learning rate for epoch ``epochs``: 1e-8 scaled by 10 every 20 epochs.

  ``lr`` (the current learning rate) is accepted but not used.
  """
  exponent = epochs/20
  return 1e-8 * (10**exponent)

# Learning-rate schedule callback built from scheduler().
# NOTE(review): `callback` is created here, but the active model.fit call does
# not pass it (only the commented-out fit call does).
callback=keras.callbacks.LearningRateScheduler(scheduler)
loss= keras.losses.BinaryCrossentropy()
# Adam with a fixed learning rate of 1e-6
optimizer= keras.optimizers.Adam(1e-6)
model.compile(loss=loss, optimizer=optimizer,metrics=['accuracy'])
model.summary()


In [None]:

#model.fit(train_pad, train_label, batch_size=32, epochs=100,callbacks=[callback])
model.fit(train_pad, train_label, batch_size=64, epochs=600, validation_data=(test_pad, test_label))

In [None]:
loss=model.history.history['loss']
val_loss=model.history.history['val_loss']
acc=model.history.history['accuracy']
val_acc=model.history.history['val_accuracy']


In [None]:
import matplotlib.pyplot as plt
# Size the x axis from the recorded history rather than a fixed 600, so the
# plot still works when the number of completed epochs differs.
plt.plot( np.arange(len(loss)), loss)


plt.plot(np.arange(len(val_loss)), val_loss)

In [None]:
# Size the x axis from the recorded history rather than a fixed 600, so the
# plot still works when the number of completed epochs differs.
plt.plot( np.arange(len(acc)), acc)


plt.plot(np.arange(len(val_acc)), val_acc)

In [None]:
import pandas as pd

In [None]:
test_data=pd.read_csv('/content/test.csv')

In [None]:
test_data.head()

In [None]:
# NOTE(review): despite its name, test_label collects the preprocessed test
# *texts*, and it replaces the validation labels stored under the same name.
test_label=[]
# newline='' lets the csv module handle line breaks inside quoted fields itself,
# as the csv documentation asks for file objects passed to csv.reader.
with open('../input/nlp-getting-started/test.csv', newline='') as test_csv:
  test_data= csv.reader(test_csv)
  # Skip the header row
  next(test_data)
  for data in test_data:
    # Column 3 is the tweet text
    test_label.append(preprocessing_data(str(data[3])))

In [None]:
# The token lists differ in length; dtype=object is required for such ragged
# input on recent NumPy versions.
test_label= np.array(test_label, dtype=object)

In [None]:
# The token lists differ in length; dtype=object is required for such ragged
# input on recent NumPy versions.
test_label= np.array(test_label, dtype=object)
test_seq=token.texts_to_sequences(test_label)
# Pad to the same width as the training data (corpus_pad) instead of a
# hard-coded 30, so the input matches the sequence length the model was built for.
test_pad= pad_sequences(test_seq, maxlen=corpus_pad.shape[1], padding='post', truncating='post')

In [None]:
# Sequential.predict_classes was removed from TensorFlow; for a single sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
prediction= (model.predict(test_pad) > 0.5).astype("int32")

In [None]:
submit= pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
submit['target']=prediction

In [None]:
submit.to_csv('submit.csv',index=False)