# Download dataset from kaggle <br>
ref: https://www.kaggle.com/c/quora-question-pairs/data <br>

Data fields <br>



*   id - the id of a training set question pair
*   qid1, qid2 - unique ids of each question (only available in train.csv)
*   question1, question2 - the full text of each question
*   is_duplicate - the target variable, set to 1 if question1 and question2 have essentially the same meaning, and 0 otherwise.

In [None]:
import os
from google.colab import drive

drive.mount('/content/drive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/kaggle"
%cd /content/drive/My Drive/colab_data/datasets
if (not os.path.isdir('quora-question-pairs')):
  os.mkdir('quora-question-pairs')
%cd quora-question-pairs
!kaggle competitions download -c quora-question-pairs
!unzip \*.zip  && rm *.zip

# Load and clean data

In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/colab_data/datasets/quora-question-pairs

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/colab_data/datasets/quora-question-pairs


In [2]:
import os
import pandas as pd
import spacy
import re

from time import time

from nltk.stem import SnowballStemmer
snowStem=SnowballStemmer('english')

pd.set_option('display.max_colwidth', None)
ROOT_DIR = '/content/drive/My Drive/colab_data/datasets/quora-question-pairs'

In [3]:
df = pd.read_csv(os.path.join(ROOT_DIR,'train.csv'))
print(df.shape)
df = df.dropna().reset_index(drop=True)
df.isnull().sum()

(404290, 6)


id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [4]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in share market in india?,What is the step by step guide to invest in share market?,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Diamond?,What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?,0
2,2,5,6,How can I increase the speed of my internet connection while using a VPN?,How can Internet speed be increased by hacking through DNS?,0
3,3,7,8,Why am I mentally very lonely? How can I solve it?,"Find the remainder when [math]23^{24}[/math] is divided by 24,23?",0
4,4,9,10,"Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?",Which fish would survive in salt water?,0


In [5]:
nlp = spacy.load('en', disable=['ner', 'parser'])

def cleaning(doc):
    txt = [snowStem.stem(token.text) for token in doc]
    if len(txt) > 2:
        return ' '.join(txt)

In [6]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['question1'])

t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
df["clean_question1"] = txt

KeyboardInterrupt: ignored

In [None]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['question2'])

t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))
df["clean_question2"] = txt

In [None]:
df = df.dropna().reset_index(drop=True)
print(df.isnull().sum())

In [None]:
df_clean = df.drop(columns=['question1', 'question2'])
df_clean.head()

In [None]:
df_clean.to_csv("train_clean.csv")

In [7]:
df_clean = pd.read_csv(os.path.join(ROOT_DIR,'train_clean.csv'))
df_clean.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2
0,0,0,1,2,0,what is the step by step guid to invest in share market in india,what is the step by step guid to invest in share market
1,1,1,3,4,0,what is the stori of kohinoor koh i noor diamond,what would happen if the indian govern stole the kohinoor koh i noor diamond back
2,2,2,5,6,0,how can i increas the speed of my internet connect while use a vpn,how can internet speed be increas by hack through dns
3,3,3,7,8,0,whi am i mental veri lone how can i solv it,find the remaind when math math is divid by
4,4,4,9,10,0,which one dissolv in water quik sugar salt methan and carbon di oxid,which fish would surviv in salt water


# Build Vocab and train word2vec

In [8]:
from gensim.models.phrases import Phrases, Phraser

text = list(df_clean['clean_question1']) + list(df_clean['clean_question2'])
sent = [row.split() for row in text]
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]


KeyboardInterrupt: ignored

In [None]:
import multiprocessing
from gensim.models import Word2Vec
cores = multiprocessing.cpu_count()

In [None]:
w2v_model = Word2Vec(min_count=5,
                     window=5,
                     size=200,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [None]:
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
token2Id = {}
id2token = {}

index = 1

for vocab in w2v_model.wv.vocab.keys():

  token2Id[vocab] = index
  id2token[str(index)] = vocab
  index += 1

In [None]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
w2v_model.init_sims(replace=True)
w2v_wv = w2v_model.wv

In [None]:
import json
with open('token2Id.json', 'w') as json_file:
        json.dump(token2Id, json_file, indent=4)
with open('id2token.json', 'w') as json_file:
        json.dump(id2token, json_file, indent=4)

w2v_wv.save("w2v-200d.kv")

In [19]:
token2Id = {}
id2token = {}

In [22]:
import json
with open('token2Id.json') as json_file:
        token2Id = json.load(json_file)

with open('id2token.json') as json_file:
        id2token = json.load(json_file)

# Proprocess Text

In [11]:
def preprocess(text):
  ids = [token2Id[word] for word in text.split(' ') if word in token2Id.keys()]
  return ids

In [12]:
preprocessed = [preprocess(row) for row in df_clean['clean_question1']]
df_clean["question1"] = preprocessed
preprocessed = [preprocess(row) for row in df_clean['clean_question2']]
df_clean["question2"] = preprocessed
df_clean.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2,question1,question2
0,0,0,1,2,0,what is the step by step guid to invest in share market in india,what is the step by step guid to invest in share market,"[1, 2, 3, 4, 5, 4, 6, 7, 8, 9, 1454, 2291, 9, 11]","[1, 2, 3, 4, 5, 4, 6, 7, 8, 9, 1454, 2291]"
1,1,1,3,4,0,what is the stori of kohinoor koh i noor diamond,what would happen if the indian govern stole the kohinoor koh i noor diamond back,"[1, 2, 3, 12, 13, 14, 15, 16, 17, 18]","[1, 100, 259, 276, 3, 391, 182, 3708, 3, 14, 15, 16, 17, 18, 226]"
2,2,2,5,6,0,how can i increas the speed of my internet connect while use a vpn,how can internet speed be increas by hack through dns,"[19, 20, 16, 21, 3, 22, 13, 23, 384, 1470, 25, 26, 27, 28]","[19, 20, 384, 22, 60, 21, 5, 69, 150, 20348]"
3,3,3,7,8,0,whi am i mental veri lone how can i solv it,find the remaind when math math is divid by,"[29, 30, 16, 31, 32, 33, 19, 20, 16, 34, 35]","[72, 3, 8241, 63, 1216, 1216, 2, 8240, 5]"
4,4,4,9,10,0,which one dissolv in water quik sugar salt methan and carbon di oxid,which fish would surviv in salt water,"[36, 37, 38, 9, 39, 40, 41, 42, 43, 44, 45, 46]","[36, 2638, 100, 3821, 9, 41, 39]"


In [None]:
df_preprocessed = df_clean.drop(columns=['clean_question1', 'clean_question2'])
df_preprocessed.to_csv('train_preprocessed.csv')

In [13]:
df_preprocessed = pd.read_csv(os.path.join(ROOT_DIR,'train_preprocessed.csv'))
df_preprocessed.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,qid1,qid2,is_duplicate,question1,question2
0,0,0,0,1,2,0,"[1, 2, 3, 4, 5, 4, 6, 7, 8, 9, 1454, 2291, 9, 11]","[1, 2, 3, 4, 5, 4, 6, 7, 8, 9, 1454, 2291]"
1,1,1,1,3,4,0,"[1, 2, 3, 12, 13, 14, 15, 16, 17, 18]","[1, 100, 259, 276, 3, 391, 182, 3708, 3, 14, 15, 16, 17, 18, 226]"
2,2,2,2,5,6,0,"[19, 20, 16, 21, 3, 22, 13, 23, 384, 1470, 25, 26, 27, 28]","[19, 20, 384, 22, 60, 21, 5, 69, 150, 20348]"
3,3,3,3,7,8,0,"[29, 30, 16, 31, 32, 33, 19, 20, 16, 34, 35]","[72, 3, 8241, 63, 1216, 1216, 2, 8240, 5]"
4,4,4,4,9,10,0,"[36, 37, 38, 9, 39, 40, 41, 42, 43, 44, 45, 46]","[36, 2638, 100, 3821, 9, 41, 39]"


# Setup pretrained embedding

In [14]:
import numpy as np
import torch.nn as nn
from gensim.models import KeyedVectors

In [17]:
embedding_dim = 200
vocab_size = len(token2Id.keys())+1
print(vocab_size)

22812


In [15]:
w2v = KeyedVectors.load('w2v-200d.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [26]:
weights_matrix = np.random.rand(vocab_size, embedding_dim)
for token, idx in token2Id.items():
  if token in w2v.vocab:
        weights_matrix[idx] = w2v[token]

np.save('pretrained_emb.npy', weights_matrix)
weights_matrix.shape

(22812, 200)