## Reading the cleaned data from csv

In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [3]:
## Loading the whole dataset for training the vectorizer on the whole dataset
# `og_data` is the full cleaned corpus. Later cells fit the TF-IDF vocabulary and the
# Word2Vec model on its `clean_question1` / `clean_question2` columns.
og_data = pd.read_csv('cleaned_questions.csv')
og_data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2,lemmatized_question1,lemmatized_question2,len_q1,len_q2
0,0,1,2,0,step step guide invest share market india,step step guide invest share market,step step guide invest share market india,step step guide invest share market,41,35
1,1,3,4,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,31,67
2,2,5,6,0,increase speed internet connection using vpn,internet speed increased hacking dns,increase speed internet connection use vpn,internet speed increase hack dns,42,32
3,3,7,8,0,mentally lonely solve,find remainder 23 power 24 divided 24 23,mentally lonely solve,find remainder 23 power 24 divide 24 23,21,39
4,4,9,10,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water,one dissolve water quikly sugar salt methane c...,fish would survive salt water,60,29


In [4]:
## Loading both the datasets based on nltk and spacy
# NOTE(review): binding this DataFrame to the name `nltk` shadows the `nltk` module
# imported in the first cell; after this cell `nltk.<attr>` resolves against the
# DataFrame, not the library. `word_tokenize` still works because it was imported by name.
nltk = pd.read_csv('cleaned_questions_nltk.csv')
nltk.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,lengthq1,lengthq2,common_words,q1_wordlen,q2_wordlen,word_difference,clean_question1,clean_question2
0,236588,466074,466075,0,120,119,19,22,22,0,good gift foreign visitor bring invite someone...,good gift foreign visitor bring invite someone...
1,284623,413904,559402,0,61,39,1,12,8,4,good alternative cut brisket can not find,best wood smoke brisket
2,37445,74608,74609,0,44,64,3,8,12,4,horror movie jump scare,possible create good horror film without jump ...
3,299330,587921,587922,0,76,39,1,12,7,5,ethical take vegetarian v vegan v non vegetari...,non vegetarian date vegetarian
4,204421,403323,403324,0,56,63,2,9,10,1,good tip young biotech enterpreneurs,must young entrepreneur know build company


In [5]:
# Presumably the spaCy-preprocessed counterpart of the NLTK frame (the preview shows the
# same ids and columns) -- TODO confirm both files cover the same sample.
# NOTE(review): the name `spacy` would shadow the spaCy package if it is ever imported here.
spacy = pd.read_csv('cleaned_questions_spacy.csv')
spacy.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,lengthq1,lengthq2,common_words,q1_wordlen,q2_wordlen,word_difference,clean_question1,clean_question2
0,236588,466074,466075,0,120,119,19,22,22,0,good gift foreign visitor bring invite someone...,good gift foreign visitor bring invite someone...
1,284623,413904,559402,0,61,39,1,12,8,4,good alternative cut brisket can not find,good wood smoke brisket
2,37445,74608,74609,0,44,64,3,8,12,4,horror movie jump scare,possible create good horror film without jump ...
3,299330,587921,587922,0,76,39,1,12,7,5,ethical take vegetarian vs vegan vs non vegeta...,non vegetarian date vegetarian
4,204421,403323,403324,0,56,63,2,9,10,1,good tip young biotech enterpreneur,must young entrepreneur know build company


## 3.1 Vectorising using TFIDF

In [6]:
# Vectorizing the 'clean_question1' and 'clean_question2' columns of the og_data, spacy and nltk datasets

# min_df drops terms seen in fewer than 10 questions and max_df drops terms present in
# more than 70% of them, which trims the vocabulary (and therefore the matrix width).
tfidf = TfidfVectorizer(min_df=10, max_df=0.7)

# Replace nulls with empty strings before fitting/transforming, as TfidfVectorizer does
# not handle nulls. The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame[col]`: that form operates on a chained selection,
# is deprecated in pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
for frame in (og_data, nltk, spacy):
    for col in ('clean_question1', 'clean_question2'):
        frame[col] = frame[col].fillna("")

# Fit the vocabulary once on every question of the full corpus so that all four
# matrices below share the same feature space.
tfidf.fit(pd.concat([og_data['clean_question1'], og_data['clean_question2']]))

tfidf_nltk_q1 = tfidf.transform(nltk['clean_question1'])
tfidf_nltk_q2 = tfidf.transform(nltk['clean_question2'])

tfidf_spacy_q1 = tfidf.transform(spacy['clean_question1'])
tfidf_spacy_q2 = tfidf.transform(spacy['clean_question2'])

# Displayed as the cell output: (n_questions, vocabulary_size) for each matrix
tfidf_nltk_q1.shape, tfidf_nltk_q2.shape, tfidf_spacy_q1.shape, tfidf_spacy_q2.shape

((10000, 20577), (10000, 20577), (10000, 20577), (10000, 20577))

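Even after pruning the vocabulary with `min_df` and `max_df`, each question is still represented by a 20,577-dimensional vector in which almost every entry is zero, because the corpus is large and contains many distinct words while a single question uses only a handful of them. To obtain a compact, dense representation instead, we will use Word2Vec to convert the text into vectors.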

## 3.2 Vectorising using Word2Vec

In [7]:
# Combining tokens from q1 and q2 of og_data for training the Word2Vec model.
# fillna("") keeps this cell self-contained: word_tokenize cannot handle NaN, and the
# cell should not depend on an earlier cell having mutated og_data in place.
combined_tokens = (
    og_data['clean_question1'].fillna("").apply(word_tokenize).tolist()
    + og_data['clean_question2'].fillna("").apply(word_tokenize).tolist()
)

# Training the Word2Vec model.
# Passing the corpus to the constructor builds the vocabulary AND trains the model, so a
# follow-up `model.train(...)` call would train a second time with a restarted
# learning-rate schedule. The intended 10 passes are requested via `epochs` instead.
# min_count=1 keeps every token, so each corpus word gets a vector.
model = Word2Vec(combined_tokens, vector_size=100, window=5, min_count=1, workers=4, epochs=10)

# Adjusting the get_average_word2vec function to work with the vector model and tokens
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=100):
    """Average the word vectors of a tokenised sentence.

    Parameters
    ----------
    tokens_list : sequence of str
        Tokens of one sentence.
    vector : mapping-like
        Object supporting ``word in vector`` and ``vector[word]``.
    generate_missing : bool, default False
        If True, tokens absent from ``vector`` get a ``np.random.rand(k)`` vector;
        otherwise they contribute a zero vector of length ``k``.
    k : int, default 100
        Length of the placeholder vectors and of the all-zero result returned for an
        empty token list.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over all tokens. Placeholder vectors for missing tokens are
        counted in the mean.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    # Pick the placeholder generator once instead of branching per token.
    make_placeholder = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_placeholder(k))

    return np.sum(word_vectors, axis=0) / len(word_vectors)

# Adjusting the get_word2vec_embeddings function to handle both q1 and q2
def get_word2vec_embeddings(vectors, clean_questions):
    """Build one averaged word-vector embedding per question.

    Parameters
    ----------
    vectors : mapping-like
        Word-vector lookup passed through to ``get_average_word2vec``.
    clean_questions : pandas.Series
        Question texts; each entry is tokenised with ``word_tokenize``.

    Returns
    -------
    list
        One averaged vector per entry of ``clean_questions``, in the same order.
    """
    tokenised_questions = clean_questions.apply(word_tokenize)
    return [
        get_average_word2vec(question_tokens, vectors)
        for question_tokens in tokenised_questions
    ]

# Embedding both question columns of the nltk and spacy datasets with the trained vectors.
# Each generator yields the q1 embeddings first, then the q2 embeddings.
nltk_q1_embeddings, nltk_q2_embeddings = (
    get_word2vec_embeddings(model.wv, nltk[column])
    for column in ('clean_question1', 'clean_question2')
)
spacy_q1_embeddings, spacy_q2_embeddings = (
    get_word2vec_embeddings(model.wv, spacy[column])
    for column in ('clean_question1', 'clean_question2')
)

# Verifying the shape of the embeddings: one (n_questions, vector_size) pair per list
tuple(
    np.array(embeddings).shape
    for embeddings in (
        nltk_q1_embeddings,
        nltk_q2_embeddings,
        spacy_q1_embeddings,
        spacy_q2_embeddings,
    )
)


((10000, 100), (10000, 100), (10000, 100), (10000, 100))

### Concatenating the word2vec embeddings for q1 and q2 of both nltk and spacy datasets and adding them to the original datasets

In [8]:
def _with_embeddings(frame, q1_embeddings, q2_embeddings):
    """Return `frame` with the q1/q2 embedding dimensions appended as named columns.

    The two embedding frames would otherwise both carry the integer labels 0..k-1,
    producing duplicate column labels that make q1 and q2 indistinguishable (and that
    pandas renames on the next read_csv). Prefixing gives every column a unique name.

    Embedding columns left over from a previous run of this cell are dropped first, so
    re-running the cell does not keep appending copies.
    """
    prefixes = ('q1_w2v_', 'q2_w2v_')
    stale = [col for col in frame.columns if str(col).startswith(prefixes)]

    # Converting the embeddings to dataframes aligned on the source frame's index
    q1_frame = pd.DataFrame(q1_embeddings, index=frame.index).add_prefix(prefixes[0])
    q2_frame = pd.DataFrame(q2_embeddings, index=frame.index).add_prefix(prefixes[1])

    return pd.concat([frame.drop(columns=stale), q1_frame, q2_frame], axis=1)


# Concatenating the embeddings to the nltk dataframe
nltk = _with_embeddings(nltk, nltk_q1_embeddings, nltk_q2_embeddings)

# Repeating the process for the spacy dataframe
spacy = _with_embeddings(spacy, spacy_q1_embeddings, spacy_q2_embeddings)



### Saving the dataframes to csv for modelling

In [16]:
# Write each enriched frame to its own CSV; index=False keeps the row index out of the file.
for frame, path in ((nltk, 'nltk_embeddings.csv'), (spacy, 'spacy_embeddings.csv')):
    frame.to_csv(path, index=False)