#### Purpose of this kernel - continuance from kernel 6

* Achieve further preprocessing improvements to increase embeddings coverage

#### Import libraries

In [1]:
# General
import pandas as pd
import numpy as np
import os
import gc
import sys

# Preprocessing
import seaborn as sns
import re
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors

# Modeling
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Training
from sklearn.model_selection import StratifiedKFold
  # splits train-set into into train and validation folds
    
# Evaluation
from keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt

Using TensorFlow backend.


#### Hyperparameters

In [2]:
# Fast Run Testing
#total_train_samples = 100000 # max is 1306122
#total_test_samples = 2000 # max is 375806
total_train_samples = 1306122 # max is 1306122
total_test_samples = 375806 # max is 375806

# Preprocessing
maxlen = 130 # 130 covers about 75% of all bad questions completely

# Modeling
embedding_dim = 300 # set to 300 to be able to compare with pre-trained embeddings

# Training
kfolds = 3
model_epochs = 10

### Load Data

In [3]:
df = pd.read_csv("../input/train.csv")
str_ = 'Train data loaded'
os.system('echo '+str_)

0

In [4]:
df = df[:total_train_samples] # for Testing purposes
num_samples,n = df.shape
print("Shape for this run: ", num_samples, n)

X = df.loc[:, 'question_text'].values
y = df.loc[:, 'target'].values

# Since Neural Networks are only able to perform transformations on tensors 
y = np.asarray(y) # Transformation target labels to numpy array 

print('Shape data tensor:', X.shape) 
print('Shape target tensor:', y.shape) # 1D Tensor

pd.set_option('display.max_colwidth', 1500) # inrease display column size
df.head(3)

Shape for this run:  1306122 3
Shape data tensor: (1306122,)
Shape target tensor: (1306122,)


Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,0


### Data Preparation (1)  - tokenization

In [5]:
def my_tokenizer(texts):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(texts) 
        sequences = tokenizer.texts_to_sequences(texts)
        padded_seq = pad_sequences(sequences, maxlen=maxlen)  
        word_index = tokenizer.word_index  
        
        return padded_seq, word_index
    
# Apply tokenization on whole dataset
#padded_seq, word_index = my_tokenizer(X)
#os.system('echo Tokenization completed')
#print("Found {} unique tokens".format(len(word_index)))
#print("Top 5 most frequent words: {}".format(
#    {word: word_index[word] for word in list(word_index)[:5]}))

### Data Preparation (2)  - Embeddings

In [6]:
# Embeddings path
_glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
_paragram =  '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
_wiki_news = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
_google_news = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

embeddings = [{'name': 'glove', 'embeddings_path': _glove},
              {'name': 'paragram', 'embeddings_path': _paragram},
              {'name': 'fasttext', 'embeddings_path': _wiki_news},
              {'name': 'googlenews', 'embeddings_path': _google_news}]

#### Definition of functions to load and analyse embeddings

In [7]:
# Function to create embedding matrix
embedding_matrices = {}
words_in_embedding = {}
def create_model_embedding_matrix(embeddings_name,word_index,embeddings_dict):

    embedding_dim = 300 # (vector size 300!)
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))
    unknown_words_dict = {}
    num_known_words = 0  

    # Filling up matrix
    for word, i in word_index.items(): 
        
        if embeddings_name in ['glove', 'paragram', 'fasttext']:
            embedding_vector = embeddings_dict.get(word) # get vector for word from embedding 
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
            else:
                unknown_words_dict[word] = word_index[word] 
                
        if embeddings_name == 'googlenews':
            try:
                embedding_vector = embeddings_dict[word]  
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
            except:
                unknown_words_dict[word] = word_index[word]
    try: 
        words_in_embedding[embeddings_name] = list(embeddings_dict.keys())
    except:
        print("Error during generation of key list {}".format(embeddings_name))
        print(sys.exc_info()[0])
    
    print('  Embeddings_matrix created')
    print('    Shape embedding_matrix: {}'.format(embedding_matrix.shape))
    print('  Found Embeddings for {:.2f}% of all words'
          .format((num_known_words / len(word_index))*100))
    print('  Unknown Words: {:.2f}%'.format((len(unknown_words_dict) / len(word_index))*100))
    # Top 50 unknown words
    print("Top 50 unknown words: {}\n".format(
    {w: unknown_words_dict[w] for w in list(unknown_words_dict)[:50]}))
    
    del num_known_words, unknown_words_dict 
    del embedding_matrix; gc.collect() 

In [8]:
# Function to load + analyze Embeddings
def load_and_analyse_Embeddings(embeddings_name, embeddings_path):
    
    if embeddings_name in ['glove', 'paragram', 'fasttext']:  
        embeddings_dict = {} # create empty embedding dictionary
        embedding_file = open(embeddings_path, encoding ="utf8", errors = 'ignore') # load embedding from path

        # Fill embedding dict with word: vector(coefs) pairs
        for line in embedding_file:
            line_values = line.split(' ') # read in values of respective line (= vector)
            word = line_values[0] #  # first value in line represents the word
            coefs = np.asarray(line_values[1:], dtype='float32') # all values represent vector
            embeddings_dict[word] = coefs # add key(word), value(vector) pairs to dict

        embedding_file.close() 
        
        os.system('echo '+ embeddings_name + 'loaded')
        print('  ',embeddings_name, 'loaded')
        print('  {} word vectors within {} dict'.format(len(embeddings_dict),embeddings_name))
        
        # Use pre-trained embedding to create final embeddings matrix
        create_model_embedding_matrix(embeddings_name,word_index,embeddings_dict)
        del embeddings_dict, line_values,word,coefs
                
    if embeddings_name == 'googlenews':
        embeddings_file = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)
        
        os.system('echo '+ embeddings_name + 'loaded')
        print('  ',embeddings_name, 'loaded')
        
        # Use pre-trained embedding to create final embeddings matrix
        create_model_embedding_matrix(embeddings_name,word_index,embeddings_file)
        del embeddings_file
        
    # MEMORY MANAGEMENT!
    del embeddings_name, embeddings_path
    gc.collect()
    
   # return embeddings_dict

### Data Preparation (3)  - Data Cleaning

#### Definition mapping and functions

In [9]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", 
                       "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                       "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
                       "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", 
                       "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  
                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                       "I'm": "I am","i'm": "i am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                       "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                       "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                       "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 
                       "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 
                       "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                       "so's": "so as", "this's": "this is","that'd": "that would", 
                       "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                       "there'd've": "there would have", "there's": "there is", "here's": "here is",
                       "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", 
                       "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                       "weren't": "were not","what`s": "what is", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", 
                       "when've": "when have", "where'd": "where did", "where's": "where is", 
                       "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                       "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", 
                       "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                       "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                       "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have"}

# dict from https://www.kaggle.com/theoviel/improve-your-score-with-text-preprocessing-v2 
correct_spell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
                    'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                    'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
                    'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                    'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                    'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are',
                    'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                    'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best',
                    'howdoes': 'how does', 'mastrubation': 'masturbation',
                    'mastrubate': 'masturbate', "mastrubating": 'masturbating',
                    "mcdonald's":'mcdonalds',
                    'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
                    'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 
                    'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
                    'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                    'demonitization': 'demonetization', 'demonetisation': 'demonetization',
                    'pokémon': 'pokemon'}

# Kernel "fork-embeddings-keras-v04"
specials_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", 
                 "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', 
                 '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 
                 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '\u200b': ' ',
                 '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}

punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

In [10]:
import re

def further_cleaning(x):
    x = str(x)
    x = x.lower()
    x = re.sub('[’‘´`]', "'", x)
    for word in x.split():
        if word in specials_mapping.keys():
            x = re.sub(word, specials_mapping.get(word),x)
        if word in contraction_mapping.keys():
            x = re.sub(word, contraction_mapping.get(word),x)
    x = re.sub('\'s\s', ' ', x)
    for p in punct:
        x = x.replace(p, f' {p} ')
    return x

def clean_numbers(x):
    # replaces one digit by #, two following digits by ## etc.
    x = re.sub('[0-9]{5,}', '#####', x) 
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def correct_spelling(x):
    x = str(x)
    x = x.lower()
    for word in x.split():
        if word in correct_spell_dict.keys():
            x = x.replace(word, correct_spell_dict[word])
    return x

#### Apply preprocessing functions

In [11]:
os.system('echo Applying preprocessing functions..')
df["question_text"] = df["question_text"].apply(lambda x: correct_spelling(x))
os.system('echo correct_spelling done')
df["question_text"] = df["question_text"].apply(lambda x: clean_numbers(x))
os.system('echo clean_numbers done')
df["question_text"] = df["question_text"].apply(lambda x: further_cleaning(x))
os.system('echo further_cleaning done')

X = df.loc[:, 'question_text'].values
y = np.asarray(df.loc[:, 'target'].values)

padded_seq, word_index = my_tokenizer(X) # Tokenization
os.system('echo Tokenization 2 completed')

0

#### Iteration loop to compare different embeddings (3)

In [12]:
for embedding in embeddings:
    emb_name = embedding['name']
    emb_path = embedding['embeddings_path']
    print("Running procedure on {}:".format(emb_name))
    
    load_and_analyse_Embeddings(emb_name, emb_path) # loading embedding

Running procedure on glove:
   glove loaded
  2196016 word vectors within glove dict
  Embeddings_matrix created
    Shape embedding_matrix: (187145, 300)
  Found Embeddings for 63.16% of all words
  Unknown Words: 36.84%
Top 50 unknown words: {'quorans': 1900, 'brexit': 2842, 'cryptocurrencies': 2965, 'redmi': 3658, 'kvpy': 3874, 'paytm': 3884, 'iiser': 3962, 'ethereum': 4035, 'iisc': 4600, 'jinping': 5579, '₹': 5769, 'viteee': 6083, 'iocl': 6251, 'nmims': 6658, 'rohingya': 6834, 'upes': 6840, 'fortnite': 6855, 'coinbase': 7057, 'nsit': 7125, 'cpec': 7184, 'iitians': 7297, 'oneplus': 7412, 'jadavpur': 7435, 'udemy': 7829, 'lyft': 7899, 'uceed': 7990, 'bahubali': 8019, 'afcat': 8053, 'coep': 8202, 'bhakts': 8423, 'upwork': 8586, 'machedo': 8734, 'gdpr': 8827, 'nlu': 8846, 'adityanath': 8863, 'upsee': 8932, 'boruto': 9082, 'bnbr': 9212, 'chsl': 9236, 'kernan': 9388, 'amcat': 9563, 'josaa': 9574, 'udacity': 9612, 'kylo': 9660, 'alshamsi': 9715, 'vishwanathan': 9731, 'iitian': 9773, 'dceu

In [13]:
print("mcdonald's" in words_in_embedding['glove'])
print("mcdonalds" in words_in_embedding['glove'])

print("woman's" in words_in_embedding['glove'])
print("woman" in words_in_embedding['glove'])

print("people's" in words_in_embedding['glove'])
print("people" in words_in_embedding['glove'])

False
True
False
True
False
True


#### Comparing Different Stemming  & Lemmatisation methods

In [14]:
paragram_out = {'quorans': 1901, 'brexit': 2852, 'redmi': 3659, "'the": 4605, '₹': 6002, 'coinbase': 7040, '“the': 7246, 'oneplus': 7473, 'uceed': 7976, "'i": 8422, 'bhakts': 8429, 'upwork': 8561, "5'": 8756, 'machedo': 8780, 'gdpr': 8801, 'adityanath': 8843, 'boruto': 9065, 'bnbr': 9183, "isn't": 9232, '“i': 9623, 'alshamsi': 9683, 'dceu': 9793, "parents'": 9895, 'litecoin': 9994, 'iiest': 10091, 'unacademy': 10164, 'sjws': 10252, "qur'an": 10285, 'qoura': 10354, "aren't": 10361, 'zerodha': 10511, 'tensorflow': 10983, 'doklam': 11349, 'kavalireddi': 11449, 'lnmiit': 11546, '°c': 11707, 'muoet': 11774, "others'": 11947, "countries'": 12032, "us'": 12066, "you'": 12147, 'etc…': 12164, 'nicmar': 12259, 'vajiram': 12495, 'adhaar': 12569, 'zebpay': 12809, 'srmjee': 12827, 'elitmus': 12899, "5'4": 13044}
unknown_words_list = paragram_out.keys()

In [15]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

porter = PorterStemmer()
lancaster = LancasterStemmer()
englishStemmer=SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()

print("{0:15}{1:10}{2:10}{3:15}{4:15}{5:15}".format("Word","Porter Stemmer","lancaster Stemmer",
                                        "Snowball Stemmer", "Lemma", "remove_suffix_s"))
for word in unknown_words_list:
    print("{0:15}{1:10}{2:10}{3:15}{4:15}{5:15}".format(word,porter.stem(word),lancaster.stem(word)
                                                  , englishStemmer.stem(word), 
                                                  wordnet_lemmatizer.lemmatize(word, pos = "v"),
                                                 remove_suffix_s(word)))

Word           Porter Stemmerlancaster StemmerSnowball StemmerLemma          remove_suffix_s


NameError: name 'remove_suffix_s' is not defined