#### Purpose of this kernel

- Train 2 Models with embedding concatenation

#### Import libraries

In [1]:
%%capture --no-stdout

# General
import pandas as pd
import numpy as np
import os
import gc
import sys
import time

# Preprocessing
import seaborn as sns
import re
from re import *
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer 
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
lemmatizer = nltk.WordNetLemmatizer()

# Modeling
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.layers import SimpleRNN, GRU, Bidirectional, LSTM,CuDNNLSTM, CuDNNGRU
from keras.layers import SpatialDropout1D, Dropout
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import BatchNormalization

# Training
# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

# Evaluation
from keras.callbacks import Callback
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt

In [2]:
# Check status and availability of GPU
import tensorflow as tf
print("GPU on?  - ", tf.test.is_gpu_available())
print("Available GPUs: ", tf.test.gpu_device_name())

# confirm Keras sees the GPU
from keras import backend
assert len(backend.tensorflow_backend._get_available_gpus()) > 0

GPU on?  -  True
Available GPUs:  /device:GPU:0


In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/README.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt
/kaggle/input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec
/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin


#### Hyperparameters

In [4]:
# Fast Run Testing
#total_train_samples = 5000 # max is 1306122
total_train_samples = 1306122 # max is 1306122

# Preprocessing
maxlen = 130 # 130 - covers about 75% of all bad questions completely
max_words = 9999999 # if all words shall be used, type huge number here

# Training
kfolds = 2 # 80/20 split
model_epochs = 5

## Load Data

In [5]:
df = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/train.csv")
str_ = 'Train data loaded'
os.system('echo '+str_)

0

In [6]:
df = df[:total_train_samples] # for Testing purposes
num_samples,n = df.shape
print("Shape for this run: ", num_samples, n)

pd.set_option('display.max_colwidth', 1500) # inrease display column size
df.head(3)

Shape for this run:  1306122 3


Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you encourage people to adopt and not shop?",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity affect space geometry?,0


## Preprocessing

### Data Preparation (1)  - tokenization

In [7]:
def my_tokenizer(texts):
        tokenizer = Tokenizer() 
        tokenizer.fit_on_texts(texts) 
        sequences = tokenizer.texts_to_sequences(texts)
        padded_seq = pad_sequences(sequences, maxlen=maxlen)  
        word_index = tokenizer.word_index  
        
        return padded_seq, word_index, tokenizer

#### Definition of functions to load and analyse embeddings

In [8]:
# Functions for lemmatization from http://textmining.wp.hs-hannover.de/Preprocessing.html

def wntag(pttag):
    if pttag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif pttag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return wn.NOUN
    elif pttag in ['RB', 'RBR', 'RBS']:
        return wn.ADV
    elif pttag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return None

def lemmatize(lemmatizer,word,pos):
    if pos == None:
        return word
    else:
        return lemmatizer.lemmatize(word,pos)

In [9]:
# Function to create embedding matrix
embedding_matrices = {}
words_in_embedding = {}
def create_model_embedding_matrix(embeddings_name,word_index,max_words, embeddings_dict):

    embedding_dim = 300 # (vector size 300!)
    embedding_matrix = np.zeros((max_words+1, embedding_dim))
    unknown_words_list = []
    num_known_words = 0  
        
    ps = PorterStemmer()
    ps_counter = 0
    lc = LancasterStemmer()
    lc_counter = 0
    sb = SnowballStemmer("english")
    sb_counter = 0
    lemma_counter = 0

    # Filling up matrix
    for word, i in word_index.items(): 
        
        if embeddings_name in ['glove', 'paragram', 'fasttext'] and i <= max_words:
            
            embedding_vector = embeddings_dict.get(word) # get vector for word from embedding 
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                continue # if embedding found - process next word
                
            word_c = word.lower()
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                continue # if embedding found - process next word
                
            word_c = word.capitalize()
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                continue # if embedding found - process next word
                
            word_c = word.upper()
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                continue # if embedding found - process next word
                
            word_c = ps.stem(word)
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                ps_counter +=1
                continue # if embedding found - process next word
                
            word_c = lc.stem(word)
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                lc_counter +=1
                continue # if embedding found - process next word
                
            word_c = sb.stem(word)
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                sb_counter +=1
                continue # if embedding found - process next word
                
            word_c = lemmatize(lemmatizer,pos_tag([word])[0][0],wntag(pos_tag([word])[0][1]))
            embedding_vector = embeddings_dict.get(word_c)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                num_known_words +=1
                lemma_counter +=1
                continue # if embedding found - process next word
                
            else:
                unknown_words_list.append(word)
                
        if embeddings_name == 'googlenews' and i <= max_words:
            
            try:
                word_c = word
                embedding_vector = embeddings_dict[word_c]  
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    continue # if embedding found - process next word

                word_c = word.lower()
                embedding_vector = embeddings_dict[word_c]  
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    continue # if embedding found - process next word
                
                word_c = word.capitalize()
                embedding_vector = embeddings_dict[word_c]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1 
                    continue # if embedding found - process next word
                
                word_c = word.upper()
                embedding_vector = embeddings_dict[word_c]   
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    continue # if embedding found - process next word
                    
                word_c = ps.stem(word)
                embedding_vector = embeddings_dict[word_c]  
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    ps_counter +=1
                    continue # if embedding found - process next word
                    
                word_c = lc.stem(word)
                embedding_vector = embeddings_dict[word_c] 
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    lc_counter +=1
                    continue # if embedding found - process next word
                    
                word_c = sb.stem(word)
                embedding_vector = embeddings_dict[word_c] 
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    sb_counter +=1
                    continue # if embedding found - process next word
                    
                word_c = lemmatize(lemmatizer,pos_tag([word])[0][0],wntag(pos_tag([word])[0][1]))
                embedding_vector = embeddings_dict[word_c] 
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    num_known_words +=1
                    lemma_counter +=1
                    continue # if embedding found - process next word
                    
            except:
                unknown_words_list.append(word)
                
    try: 
        words_in_embedding[embeddings_name] = list(embeddings_dict.keys())
    except:
        try:
            words_in_embedding[embeddings_name] = list(embeddings_dict.wv.vocab)
        except:
            print("Error during generation of key list {}".format(embeddings_name))
            print(sys.exc_info()[0])
    
   
    print('  Embeddings_matrix created')
    print('  Shape embedding_matrix: {}'.format(embedding_matrix.shape))
    print('  Found Embeddings for {:.2f}% of all words'
          .format((num_known_words / max_words)*100))
    print("  num_known_words :", num_known_words)
    print("  num words in word_index: ", max_words)
    print('  Unknown Words: {:.2f}%'.
          format(((len(unknown_words_list)) / max_words)*100))
    print("  Words found by PorterStemmer: {}".format(ps_counter))
    print("  Words found by LancasterStemmer: {}".format(lc_counter))
    print("  Words found by SnowballStemmer: {}".format(sb_counter))
    print("  Words found by Lemmatisation: {}".format(lemma_counter))
          
    # Top 50 unknown words
    print("  Top 50 unknown words:\n {}\n".format(unknown_words_list[:50]))
    
    del num_known_words, unknown_words_list,ps,lc,sb, ps_counter, lc_counter, sb_counter
    del lemma_counter; gc.collect()
    
    return embedding_matrix

In [10]:
# Function to load + analyze Embeddings
def load_and_analyse_Embeddings(embeddings_name, embeddings_path, max_words):
    
    if embeddings_name in ['glove', 'paragram', 'fasttext']:  
        embeddings_dict = {} # create empty embedding dictionary
        embedding_file = open(embeddings_path, encoding ="utf8", errors = 'ignore') # load embedding from path

        # Fill embedding dict with word: vector(coefs) pairs
        for line in embedding_file:
            line_values = line.split(' ') # read in values of respective line (= vector)
            word = line_values[0] #  # first value in line represents the word
            coefs = np.asarray(line_values[1:], dtype='float32') # all values represent vector
            embeddings_dict[word] = coefs # add key(word), value(vector) pairs to dict

        embedding_file.close() 
        
        os.system('echo '+ embeddings_name + 'loaded')
        print('  ',embeddings_name, 'loaded')
        print('  {} word vectors within {} dict'.format(len(embeddings_dict),embeddings_name))
        
        # Use pre-trained embedding to create final embeddings matrix
        embedding_matrix = create_model_embedding_matrix(embeddings_name,word_index,max_words, embeddings_dict)
        del embeddings_dict, line_values,word,coefs
                
    if embeddings_name == 'googlenews':
        embeddings_file = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)
        
        os.system('echo '+ embeddings_name + 'loaded')
        print('  ',embeddings_name, 'loaded')
        
        # Use pre-trained embedding to create final embeddings matrix
        embedding_matrix = create_model_embedding_matrix(embeddings_name,word_index,max_words, embeddings_file)
        del embeddings_file
        
    # MEMORY MANAGEMENT!
    del embeddings_name, embeddings_path
    gc.collect()
    
    return embedding_matrix

In [11]:
def concatenate_embeddings(conc_embedding, embedding_matrix):
    
    if conc_embedding is not None:
        conc_embedding = np.concatenate((conc_embedding,embedding_matrix), axis = 1 )
        print("Added embedding. New shape: {}".format(conc_embedding.shape))
    else:
        conc_embedding = embedding_matrix
        print("Added embedding. First shape: {}".format(conc_embedding.shape))
        
    del embedding_matrix; gc.collect()
    return conc_embedding

### Data Preparation (3)  - Data Cleaning

#### Definition mapping and functions

In [12]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", 
                       "could've": "could have", "couldn't": "could not", "didn't": "did not",  
                       "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
                       "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", 
                       "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  
                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
                       "I'm": "I am","i'm": "i am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", 
                       "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                       "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 
                       "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                       "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 
                       "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 
                       "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", 
                       "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                       "so's": "so as", "this's": "this is","that'd": "that would", 
                       "that'd've": "that would have", "that's": "that is", "there'd": "there would", 
                       "there'd've": "there would have", "there's": "there is", "here's": "here is",
                       "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", 
                       "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 
                       "weren't": "were not","what`s": "what is", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", 
                       "when've": "when have", "where'd": "where did", "where's": "where is", 
                       "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                       "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", 
                       "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 
                       "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have",
                       "y'all're": "you all are","y'all've": "you all have","you'd": "you would", 
                       "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", 
                       "you're": "you are", "you've": "you have"}

# dict from https://www.kaggle.com/theoviel/improve-your-score-with-text-preprocessing-v2 
correct_spell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite',
                    'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                    'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization',
                    'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                    'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                    'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are',
                    'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                    'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best',
                    'howdoes': 'how does', 'mastrubation': 'masturbation',
                    'mastrubate': 'masturbate', "mastrubating": 'masturbating',
                    "mcdonald's":'mcdonalds',
                    'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist',
                    'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 
                    'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what',
                    'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                    'demonitization': 'demonetization', 'demonetisation': 'demonetization',
                    'pokémon': 'pokemon', 'quoras': 'quora', 'quorans': 'quora'}

# Kernel "fork-embeddings-keras-v04"
specials_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", 
                 "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', 
                 '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 
                 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '\u200b': ' ',
                 '…': ' ... ', '\ufeff': '', 'करना': '', 'है': '', 'ε−': ''}

punct = "/-?!.,#$%\()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&' + '\''

In [13]:
def preprocessing(x):
    x = str(x)
    x = re.sub('[’‘´`]', "'", x) 
    
    # replaces one digit by #, two following digits by ## etc.
    x = re.sub('[0-9]{5,}', '#####', str(x)) 
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    x = re.sub('[0-9]\'[0-9]', 'feet inches', x) # e.g. 5'5 → feet inches
    
    for word in x.split():
        if word.lower() in contraction_mapping.keys():
            x = x.replace(word, contraction_mapping[word.lower()])
        if word in correct_spell_dict.keys():
            x = x.replace(word, correct_spell_dict[word])
        if word in specials_mapping.keys():
            x = x.replace(word, specials_mapping[word])
        if word[0] in punct and len(word) != 1: # remove punctuation directly in front of word
            x = x.replace(word[0], '') 
        
    x = ' '.join(word_tokenize(x)) # separates puncutation from words
               
    return x

#### Apply preprocessing functions

In [14]:
runtime_dict = {}
start_prep = time.time()

In [15]:
os.system('echo Applying preprocessing functions..')
df["question_text"] = df["question_text"].fillna(" ").apply(lambda x: preprocessing(x))
os.system('echo prepocessing done')

X = df.loc[:, 'question_text'].values
y = np.asarray(df.loc[:, 'target'].values)

padded_seq, word_index, tokenizer = my_tokenizer(X) # translate text to numerical values
max_words = min(max_words, len(word_index)) # e.g.10k words or all words
      # index +1 because fill process of matrix starts at 1
print(" Number of words in word_index: ", len(word_index))

del X; gc.collect()
os.system('echo Tokenization completed')

 Number of words in word_index:  189036


0

In [16]:
end_prep = time.time()
duration_data_prep = end_prep - start_prep
runtime_dict['Data Preparation'] = round(duration_data_prep)

In [17]:
from sklearn.utils import shuffle
padded_seq, y = shuffle(padded_seq, y)

#### Iteration loop to compare different embeddings (3)

In [18]:
# Embeddings path
_glove = '/kaggle/input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt'
_paragram =  '/kaggle/input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
_wiki_news = '/kaggle/input/quora-insincere-questions-classification/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
_google_news = '/kaggle/input/quora-insincere-questions-classification/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

embeddings = [
              {'name': 'glove', 'embeddings_path': _glove},
               {'name': 'paragram', 'embeddings_path': _paragram},
              {'name': 'fasttext', 'embeddings_path': _wiki_news} #, Fasttext
              #{'name': 'googlenews', 'embeddings_path': _google_news}
                ]

In [19]:
def get_keras_model(conc_embedding):
    model = Sequential()
    model.add(Embedding(input_dim = max_words+1, # 10k different words/integers
                        output_dim = conc_embedding.shape[1], 
                        weights = [conc_embedding],
                        trainable = False)) 
    
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(CuDNNLSTM(32, return_sequences=True)))
    model.add(Bidirectional(CuDNNLSTM(32)))
    model.add(Dense(1, activation='sigmoid')) # final -  binary classifier
    
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
        
    return model

# Model 1

#### Concatenation of Glove and Fasttext

In [20]:
# Concatenation of GloVe and Fasttext
conc_glove_fasttext = None

os.system("echo Loading GloVe")
glove_embedding_matrix = load_and_analyse_Embeddings('glove', _glove, max_words) 
# concatenation embeddings 
conc_glove_fasttext = concatenate_embeddings(conc_glove_fasttext, glove_embedding_matrix)

os.system("echo Loading Fasttext")
embedding_matrix = load_and_analyse_Embeddings('fasttext', _wiki_news, max_words) 
# concatenation embeddings 
conc_glove_fasttext = concatenate_embeddings(conc_glove_fasttext, embedding_matrix)
del embedding_matrix; gc.collect()

os.system("echo Concatenation GloVe and Fasttext created")

   glove loaded
  2196016 word vectors within glove dict
  Embeddings_matrix created
  Shape embedding_matrix: (189037, 300)
  Found Embeddings for 77.41% of all words
  num_known_words : 146341
  num words in word_index:  189036
  Unknown Words: 22.59%
  Words found by PorterStemmer: 3504
  Words found by LancasterStemmer: 4721
  Words found by SnowballStemmer: 87
  Words found by Lemmatisation: 81
  Top 50 unknown words:
 ['brexit', 'cryptocurrencies', 'redmi', 'coinbase', 'oneplus', 'upwork', 'machedo', 'gdpr', 'adityanath', 'boruto', 'bnbr', 'alshamsi', 'dceu', 'litecoin', 'unacademy', 'iiest', "qur'an", 'zerodha', 'tensorflow', 'doklam', 'kavalireddi', 'lnmiit', 'muoet', 'etc…', 'nicmar', 'vajiram', '°c', 'zebpay', 'srmjee', 'elitmus', 'altcoins', 'altcoin', 'hackerrank', 'awdhesh', 'ryzen', 'baahubali', 'koinex', 'demonetisation', 'mhcet', 'byju', 'srmjeee', 'sgsits', 'ftre', 'skripal', 'nanodegree', 'gurugram', 'hotstar', 'mhtcet', 'x²', 'bmsce']

Added embedding. First shape: (

0

In [21]:
end_embeddings= time.time()
duration_embeddings = end_embeddings - end_prep
print(duration_embeddings)
runtime_dict['Model 1 Embeddings'] = round(duration_embeddings)

264.1496078968048


In [22]:
get_keras_model(conc_glove_fasttext).summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 600)         113422200 
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 600)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 64)          162304    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                25088     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 113,609,657
Trainable params: 187,457
Non-trainable params: 113,422,200
_________________________________________________________________


### Training Model 1

In [23]:
model_1 = get_keras_model(conc_glove_fasttext) # create new model for current split

model_1.fit(padded_seq, y,
            epochs = model_epochs, 
            batch_size= 512,
            verbose = 0)     

del conc_glove_fasttext; gc.collect()

42

In [24]:
end_model1= time.time()
duration_model1_training = end_embeddings - end_model1
print(duration_model1_training)
runtime_dict['Model 1 Training'] = round(duration_model1_training)

-718.5001041889191


# Model 2

#### Concatenation of Glove and Paragram

In [25]:
os.system("echo Re-Use Glove Embedding Matrix")
conc_glove_paragram = glove_embedding_matrix

os.system("echo Loading Paragram")
embedding_matrix = load_and_analyse_Embeddings('paragram', _paragram, max_words) 
# concatenation embeddings 
conc_glove_paragram = concatenate_embeddings(conc_glove_paragram, embedding_matrix)
del embedding_matrix, glove_embedding_matrix; gc.collect()

os.system("echo Concatenation GloVe and Paragram created")

   paragram loaded
  1703755 word vectors within paragram dict
  Embeddings_matrix created
  Shape embedding_matrix: (189037, 300)
  Found Embeddings for 78.83% of all words
  num_known_words : 149009
  num words in word_index:  189036
  Unknown Words: 21.17%
  Words found by PorterStemmer: 4308
  Words found by LancasterStemmer: 5359
  Words found by SnowballStemmer: 90
  Words found by Lemmatisation: 79
  Top 50 unknown words:
 ['brexit', 'cryptocurrencies', 'redmi', 'coinbase', 'oneplus', 'upwork', 'machedo', 'gdpr', 'adityanath', 'boruto', 'bnbr', 'alshamsi', 'dceu', 'litecoin', 'unacademy', 'iiest', "qur'an", 'zerodha', 'tensorflow', 'doklam', 'kavalireddi', 'lnmiit', 'muoet', 'etc…', 'nicmar', 'vajiram', '°c', 'zebpay', 'srmjee', 'elitmus', 'altcoins', 'altcoin', 'hackerrank', 'awdhesh', 'baahubali', 'koinex', 'demonetisation', 'mhcet', 'byju', 'srmjeee', 'sgsits', 'ftre', 'skripal', 'nanodegree', 'gurugram', 'hotstar', 'mhtcet', 'x²', 'bmsce', 'what\u200b']

Added embedding. New

0

In [26]:
get_keras_model(conc_glove_paragram).summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 600)         113422200 
_________________________________________________________________
spatial_dropout1d_3 (Spatial (None, None, 600)         0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 64)          162304    
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                25088     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 113,609,657
Trainable params: 187,457
Non-trainable params: 113,422,200
_________________________________________________________________


### Training Model 2

In [27]:
model_2 = get_keras_model(conc_glove_paragram) # create new model for current split

model_2.fit(padded_seq, y,
            epochs = model_epochs, 
            batch_size= 512,
            verbose = 0)     

del conc_glove_paragram; gc.collect()

140

# Model Ensemble and Submission

In [28]:
test = pd.read_csv("/kaggle/input/quora-insincere-questions-classification/test.csv")
test.head(5)

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?
1,00002bd4fb5d505b9161,When should I apply for RV college of engineering and BMS college of engineering? Should I wait for the COMEDK result or am I supposed to apply before the result?
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitioner?
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [29]:
# Preprocessing
os.system('echo applying preprocessing functions to testset..')
test["question_text"] = test["question_text"].fillna(" ").apply(lambda x: preprocessing(x))
os.system('echo prepocessing testset done')

X_test = test.loc[:, 'question_text'].values

# Transformation questions to  sequence
sequences = tokenizer.texts_to_sequences(X_test)
padded_seq_test = pad_sequences(sequences, maxlen=maxlen)  

In [30]:
# Ensemble Model
preds_1 = model_1.predict(padded_seq_test, batch_size = 512, verbose = 1)
preds_2 = model_2.predict(padded_seq_test, batch_size = 512, verbose = 1)
final_preds = 0.5 * preds_1 + 0.5 * preds_2

# Create a submission dataframe and append relevant columns
submission = pd.DataFrame()
submission['qid'] = test['qid'].values
submission['prediction'] = (final_preds > 0.38).astype(int) # round sigmoid results to integers

# Do test and my submission Dataframe have the same length?
if len(submission) == len(test):
    print("Submission dataframe has the same length as test dataframe with shape:{}".format(submission.shape))
else:
    print("Something is wrong!")
    
submission.head()

Submission dataframe has the same length as test dataframe with shape:(375806, 2)


Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0


In [31]:
# Submission as Output
submission.to_csv('submission.csv', index = False)