# Hey Kagglers!! Let's dive into the "fancy" NLP world with a simple yet illustrative problem

In [None]:
# `display` is reused by later cells to render the other section images.
from IPython import display
# Render the notebook's banner image from the attached Kaggle dataset.
display.Image("../input/nlpproject/nlp-natural-language-processing-cognitive-computing-technology-concept-hand-businessman-touching-hologram-screen-world-map-216384742.jpg", width=1200,height=400,)

⚙️**As always ... Let's get our tools ready before we get started!**

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        
import numpy as np 
import pandas as pd 
from keras.datasets import imdb
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer  
from nltk.tokenize import word_tokenize
from keras.preprocessing.sequence import pad_sequences
from keras import models
import string
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from keras import layers
from keras import losses
from keras import metrics
import html
from tensorflow.keras import optimizers
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from nltk.tokenize.casual import TweetTokenizer
from tensorflow.keras.utils import plot_model
import unicodedata

Loading data set 

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/nlp-getting-started/train.csv',
                     '../input/nlp-getting-started/test.csv')
)

In [None]:
# Show the first 30 training rows to get a feel for the raw tweets.
train_df.head(30)

*As we can see, the text needs a lot of preprocessing to remove all the unwanted and misleading parts*

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train_df.info()

**For today ... we are going to stick only with the text and ignore the other features**

In [None]:
# Only the tweet text (plus the target for training) is used in this notebook.
unused_columns = ['id', 'keyword', 'location']

# The test ids are needed later to build the submission file. Capture them only
# while the column still exists, so re-running this cell after the drop neither
# fails nor overwrites the ids captured on the first run.
if 'id' in test_df.columns:
    test_id = test_df['id']

# errors='ignore' makes the cell idempotent: a second run finds the columns
# already gone and leaves the frames unchanged instead of raising KeyError.
train_df = train_df.drop(columns=unused_columns, errors='ignore')
test_df = test_df.drop(columns=unused_columns, errors='ignore')

In [None]:
# Confirm which columns remain after dropping the unused ones.
train_df.head()

# 🔎EDA

In [None]:
# Decorative header image for the EDA section.
display.Image("../input/exploratory/explore.jpeg", width=600)

**Lets take a quick look at the distribution of the target**

In [None]:
# Number of training tweets per target class.
train_df['target'].value_counts()

In [None]:
# Pie chart of the class balance of the training target.
colors = sns.color_palette('Set2')
explode = (0, 0.05)  # pull the second slice slightly out of the pie

plt.figure(figsize=(10,10))
plt.title('Percent of disaster-related tweets')
target_counts = train_df['target'].value_counts()
target_counts.plot.pie(autopct='%.2f%%', explode=explode, colors=colors, shadow=True)

Note: 1 is related to a disaster                                                                  
              0 is not related to a disaster

**Number of characters in tweets**

In [None]:
# `tweet` is an alias of the training frame; the next plotting cell uses it too.
tweet = train_df
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# One histogram of tweet length (in characters) per target class.
panels = (
    (ax1, 1, 'red', 'disaster tweets'),
    (ax2, 0, 'green', 'Not disaster tweets'),
)
for axis, label, shade, heading in panels:
    tweet_len = tweet[tweet['target'] == label]['text'].str.len()
    axis.hist(tweet_len, color=shade)
    axis.set_title(heading)

fig.suptitle('Characters in tweets')
plt.show()

**Average word length in a tweet**

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# For each class: split every tweet on whitespace, measure each word's length,
# then plot the distribution of the per-tweet mean word length.
for axis, label, shade, heading in ((ax1, 1, 'red', 'disaster'),
                                    (ax2, 0, 'green', 'Not disaster')):
    word = tweet[tweet['target'] == label]['text'].str.split().apply(
        lambda tokens: [len(token) for token in tokens])
    sns.distplot(word.map(lambda lengths: np.mean(lengths)), ax=axis, color=shade)
    axis.set_title(heading)

fig.suptitle('Average word length in each tweet');

**Now lets make the necessary preprocessing to get the data ready for modeling**

# 🗳Text Preprocessing

In [None]:
# Decorative header image for the preprocessing section.
display.Image("../input/dataprep/datapreproc.png",width=500, height=500)

In [None]:
# ASCII punctuation without its first character ('!' is the first character of
# string.punctuation). NOTE(review): this name is not referenced anywhere else in
# the visible notebook; remove_punct recomputes the same slice itself.
punctuation =string.punctuation[1:]
# Vocabulary size limit, passed to the Keras Tokenizer as num_words.
vocab = 20000

In [None]:
def remove_special_chars(text):
    """Lower-case ``text`` and clean up markup residue.

    Applies a fixed, ordered list of literal substitutions (entity fragments
    such as ``amp;``, ``<br />``, escaped newlines and quotes, ``#`` ...),
    then unescapes any remaining HTML entities and collapses every run of two
    or more spaces into a single space.
    """
    # The order is significant: later entries act on the output of earlier
    # ones (e.g. '#39;' must be handled before the bare '#' is removed).
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
        ('.*', '.'),
        ('#', ''),
    )
    cleaned = text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    unescaped = html.unescape(cleaned)
    return re.sub(r'  +', ' ', unescaped)

def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def replace_numbers(text):
    """Strip every run of digits from the string ``text``.

    Despite the name, digits are deleted (replaced by the empty string), not
    converted to a textual representation.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_non_ascii(text):
    """Return the string ``text`` with all non-ASCII characters removed.

    The text is first NFKD-normalized, so an accented letter is decomposed and
    keeps its ASCII base letter; whatever still cannot be encoded as ASCII is
    dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def sequencing(text):
    """Split ``text`` into tokens with NLTK's TweetTokenizer.

    The tokenizer is configured with preserve_case=False, strip_handles=True
    and reduce_len=True. Returns the result of ``tokenize``.
    """
    tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True, preserve_case=False)
    return tokenizer.tokenize(text)

def remove_punct(seq):
    """Drop punctuation tokens from ``seq``, keeping the exclamation mark.

    Returns a new list with the remaining tokens in their original order.
    """
    # string.punctuation without its first character, which is '!'.
    dropped = string.punctuation[1:]
    # NOTE: `in` on a str is a substring test, so a token is removed when it
    # occurs as a contiguous piece of that punctuation string (this includes
    # the empty token and pairs such as '()'), not only single characters.
    return list(filter(lambda token: token not in dropped, seq))

import functools


@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Load NLTK's English stop-word list once and keep it as a frozenset."""
    return frozenset(stopwords.words('english'))


def stop_words_remove(seq):
    """Return the tokens of ``seq`` that are not English stop words.

    Args:
        seq: iterable of token strings.

    Returns:
        A new list with the non-stop-word tokens in their original order.
    """
    # The stop-word corpus used to be re-read and scanned linearly for every
    # tweet; it is now loaded once and queried as a set.
    stop_words = _english_stop_words()
    return [w for w in seq if w not in stop_words]

# lemmatization

def lemmatize_words(words):
    """Lemmatize each token in ``words`` with WordNetLemmatizer.

    Uses the lemmatizer's default part of speech and returns a list of lemmas
    in the same order as the input.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, words))

def lemmatize_verbs(words):
    """Lemmatize each token in ``words`` as a verb and join the results.

    Unlike lemmatize_words, this returns a single space-separated string, not
    a list.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemmas = (lemmatizer.lemmatize(word, pos='v') for word in words)
    return ' '.join(verb_lemmas)

#join all
def list_text(words):
    """Concatenate the items of ``words`` with no separator.

    In normalize_text this receives the already space-joined string returned
    by lemmatize_verbs, so it hands that string back unchanged. Given a list
    of tokens it would glue them together without spaces.
    """
    separator = ''
    return separator.join(words)

def URL_remover(x):
    """Delete every ``http`` prefix plus the non-whitespace run after it from ``x``."""
    return re.sub(r'http\S+', '', x)

def normalize_text( text):
    """Run the full cleaning pipeline on one raw tweet.

    String-level cleaning runs first, then tokenization, then token-level
    filtering and lemmatization. The value returned is whatever the last step
    (list_text) yields, which in this pipeline is the space-joined string
    produced by lemmatize_verbs.
    """
    # String-level steps, applied in this exact order.
    string_steps = (remove_special_chars, remove_non_ascii, to_lowercase,
                    replace_numbers, URL_remover)
    for clean in string_steps:
        text = clean(text)

    # Token-level steps; lemmatize_verbs turns the token list back into a string.
    words = sequencing(text)
    token_steps = (remove_punct, stop_words_remove, lemmatize_words,
                   lemmatize_verbs, list_text)
    for step in token_steps:
        words = step(words)
    return words

def normalize_corpus(corpus):
    """Apply normalize_text to every document in ``corpus`` and return the results as a list."""
    return list(map(normalize_text, corpus))

In [None]:
# Normalize every tweet of both splits; each corpus is a list with one entry per tweet.
corpus_train = normalize_corpus(train_df['text'].tolist())
corpus_test = normalize_corpus(test_df['text'].tolist())

In [None]:
# Spot-check the first 10 normalized training tweets.
corpus_train[:10]

# Tokenizer

In [None]:
# Decorative header image for the tokenizer section.
display.Image("../input/tokenizerr/tok.png",width=700)

### **Tokenization is essentially splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms**

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit the Keras tokenizer on the normalized training tweets only. `vocab` is
# passed as num_words and 'UNK' as the out-of-vocabulary token.
texts = corpus_train
tok = Tokenizer(num_words=vocab, oov_token='UNK')
tok.fit_on_texts(texts)

In [None]:
# Number of entries in the fitted tokenizer's word index.
len(tok.word_index)

In [None]:
# Token dictionary built by fit_on_texts.
# NOTE: this displays the entire dictionary, which makes for a very long output.
tok.word_index

Sorry for all that scrolling :)

### **So far, we have transformed the text into a numeric form that can be understood by ML models.**           
### **However, we can further apply or extract different features from the vectorized form.**                            
### **In other words, we can represent the sequence of word indices we obtained in different forms**                        

## BoW with keras tokenize

### 3 BoW models for the following features:

> 1) Binary (default, is word present or not)                                                  
> 2) Count + Freq (count of each word in text)                                 
> 3) TF-IDF (frequency-inverse scoring for each word)                           

In [None]:
# Small demo: count-mode bag-of-words matrix for the first 10 normalized tweets.
bow = tok.texts_to_matrix(texts[:10], mode='count')
bow.shape

In [None]:
# Simple illustration of the bag-of-words matrix built in the previous cell.
bow

# 1) Binary

In [None]:
# Binary bag-of-words features for every training tweet, with the target as labels.
x_binary = tok.texts_to_matrix(texts, mode='binary')
y_binary = train_df['target']
# Display the matrix shape and its first dimension.
x_binary.shape , x_binary.shape[0]

In [None]:
# Hold out 40% of the rows for validation; random_state=42 keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(x_binary, y_binary, test_size=0.4, random_state=42)

In [None]:
import tensorflow as tf

# Reset Keras' global state before building a new model.
tf.keras.backend.clear_session()

# Small dense classifier on the binary bag-of-words features:
# two 16-unit ReLU layers followed by a single sigmoid output.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(x_binary.shape[1],)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# tf.keras and rejected by newer releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
# Per-epoch metrics, read by the plotting cells below.
history_dict = history.history
history_dict.keys()

In [None]:
# Pull the per-epoch curves out of the Keras History object. `acc`, `val_acc`
# and `epochs` stay defined at notebook level because the following cell reads them.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('binary_accuracy', 'val_binary_accuracy', 'loss', 'val_loss')
)

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training), 'b' a solid blue line (validation).
for curve, fmt, legend_label in ((loss, 'bo', 'Training loss'),
                                 (val_loss, 'b', 'Validation loss')):
    plt.plot(epochs, curve, fmt, label=legend_label)
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure
# Read the curves from this run's history instead of relying on variables left
# behind by another cell, so the plot always matches the model just trained.
acc_values = history_dict['binary_accuracy']
val_acc_values = history_dict['val_binary_accuracy']
epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was mislabelled 'Loss'
plt.legend()

plt.show()


# 2) Count

In [None]:
# Count-mode bag-of-words features for every training tweet, with the target as labels.
x_count = tok.texts_to_matrix(texts, mode='count')
y_count = train_df['target']
# Display the matrix shape and its second dimension.
x_count.shape , x_count.shape[1]

In [None]:
# Hold out 40% of the rows for validation; random_state=42 keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(x_count, y_count, test_size=0.4, random_state=42)

In [None]:
# Reset Keras' global state before building a new model.
tf.keras.backend.clear_session()

# Same dense architecture as the binary model, trained on the count features.
# It is kept under its own name (`modelc`) because the submission cell uses it.
modelc = models.Sequential()
# Size the input from x_count, the matrix this model is actually trained on
# (the cell previously read the width of x_binary, a copy-paste leftover).
modelc.add(layers.Dense(16, activation='relu', input_shape=(x_count.shape[1],)))
modelc.add(layers.Dense(16, activation='relu'))
modelc.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# tf.keras and rejected by newer releases.
modelc.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = modelc.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
# Per-epoch metrics, read by the plotting cells below.
history_dict = history.history
history_dict.keys()


In [None]:
# Pull the per-epoch curves out of the Keras History object. `acc`, `val_acc`
# and `epochs` stay defined at notebook level because later cells read them.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('binary_accuracy', 'val_binary_accuracy', 'loss', 'val_loss')
)

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training), 'b' a solid blue line (validation).
for curve, fmt, legend_label in ((loss, 'bo', 'Training loss'),
                                 (val_loss, 'b', 'Validation loss')):
    plt.plot(epochs, curve, fmt, label=legend_label)
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure
# Read the curves from this run's history instead of relying on variables left
# behind by another cell, so the plot always matches the model just trained.
acc_values = history_dict['binary_accuracy']
val_acc_values = history_dict['val_binary_accuracy']
epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was mislabelled 'Loss'
plt.legend()

plt.show()

# 3) TF_IDF

In [None]:
# TF-IDF-mode features from the Keras tokenizer for every training tweet, with the target as labels.
x_idf = tok.texts_to_matrix(texts, mode='tfidf')
y_idf = train_df['target']
# Display the matrix shape and its second dimension.
x_idf.shape , x_idf.shape[1]

In [None]:
# Hold out 40% of the rows for validation; random_state=42 keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(x_idf, y_idf, test_size=0.4, random_state=42)

In [None]:
from keras.layers import Dropout


In [None]:
# Reset Keras' global state before building a new model.
tf.keras.backend.clear_session()

# Same dense architecture as before, trained on the Keras TF-IDF features.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(x_idf.shape[1],)))
#model.add(Dropout(0.2))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# tf.keras and rejected by newer releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
# Per-epoch metrics, read by the plotting cell below.
history_dict = history.history
history_dict.keys()

In [None]:
plt.clf()   # clear figure
acc_values = history_dict['binary_accuracy']
val_acc_values = history_dict['val_binary_accuracy']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

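>**TF-IDF not normalized?**
>
>**In the above setting, TF-IDF from Keras `texts_to_matrix` produces non-normalized values. This is because Keras does not use the plain IDF ~= 1/DF but a smoothed logarithmic form, roughly log(1 + N/(1 + DF)), and it does not rescale each document vector to unit length.               
Because of this, it's better to use sklearn's TfidfVectorizer, which L2-normalizes each row by default.**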

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to at most 10,000 features.
vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the normalized training tweets and vectorize them in one step.
# .toarray() yields a regular 2-D ndarray; the previous todense() call returned
# a numpy.matrix, a deprecated type that behaves differently under indexing.
x_idf = vectorizer.fit_transform(texts).toarray()

# Kept from the original cell in case a later cell references scipy.
import scipy


y_idf = train_df['target']
x_idf.shape , x_idf.shape[1]

In [None]:
# Hold out 40% of the rows for validation; random_state=42 keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(x_idf, y_idf, test_size=0.4, random_state=42)

In [None]:
# Reset Keras' global state before building a new model.
tf.keras.backend.clear_session()

# Same dense architecture as before, trained on the sklearn TF-IDF features.
model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(x_idf.shape[1],)))
#model.add(Dropout(0.2))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated in
# tf.keras and rejected by newer releases.
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(x_train,
                    y_train,
                    epochs=20,
                    batch_size=512,
                    validation_data=(x_val, y_val))
# Per-epoch metrics, read by the plotting cells below.
history_dict = history.history
history_dict.keys()

In [None]:
# Pull the per-epoch curves out of the Keras History object. `acc`, `val_acc`
# and `epochs` stay defined at notebook level because the following cell reads them.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('binary_accuracy', 'val_binary_accuracy', 'loss', 'val_loss')
)

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training), 'b' a solid blue line (validation).
for curve, fmt, legend_label in ((loss, 'bo', 'Training loss'),
                                 (val_loss, 'b', 'Validation loss')):
    plt.plot(epochs, curve, fmt, label=legend_label)
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure
# Read the curves from this run's history instead of relying on variables left
# behind by another cell, so the plot always matches the model just trained.
acc_values = history_dict['binary_accuracy']
val_acc_values = history_dict['val_binary_accuracy']
epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was mislabelled 'Loss'
plt.legend()

plt.show()

**As we can see, the count BoW model performed best (highest accuracy) with the same dense NN architecture; however, the TF-IDF model overfit the least!**

Let's submit!

In [None]:
# Vectorize the test tweets in 'count' mode, the mode `modelc` was trained on.
x_test = tok.texts_to_matrix(corpus_test, mode='count')

# modelc ends in a single sigmoid unit, so predict() returns one probability
# per row with shape (n, 1). Threshold at 0.5 and flatten to 1-D so the labels
# form an ordinary column instead of a 2-D array.
s = (modelc.predict(x_test) > 0.5).astype('int').ravel()

# Pair every saved test id with its predicted label.
subm = pd.DataFrame({'id': test_id, 'target': s})

In [None]:
# Write the submission file without the DataFrame index column.
subm.to_csv('submission1.csv',index=False)