## Reading the cleaned data from CSV

In [23]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
import scipy
from transformers import BertTokenizer, BertModel
import torch
from joblib import dump
from joblib import load

In [24]:
# Full cleaned question-pair dataset. It serves as the corpus on which the
# TF-IDF vectorizer and the Truncated SVD are fitted (see section 3.1).
og_data = pd.read_csv('cleaned_questions.csv')
og_data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2,lemmatized_question1,lemmatized_question2,len_q1,len_q2
0,0,1,2,0,step step guide invest share market india,step step guide invest share market,step step guide invest share market india,step step guide invest share market,41,35
1,1,3,4,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,31,67
2,2,5,6,0,increase speed internet connection using vpn,internet speed increased hacking dns,increase speed internet connection use vpn,internet speed increase hack dns,42,32
3,3,7,8,0,mentally lonely solve,find remainder 23 power 24 divided 24 23,mentally lonely solve,find remainder 23 power 24 divide 24 23,21,39
4,4,9,10,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water,one dissolve water quikly sugar salt methane c...,fish would survive salt water,60,29


In [25]:
# Question pairs from the NLTK-based cleaning run (the spaCy-based counterpart
# is loaded in the next cell). Features are computed from the
# clean_question1 / clean_question2 columns.
nltk_data = pd.read_csv('cleaned_questions_nltk.csv')
nltk_data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,lengthq1,lengthq2,common_words,q1_wordlen,q2_wordlen,word_difference,clean_question1,clean_question2
0,236588,466074,466075,0,120,119,19,22,22,0,good gift foreign visitor bring invite someone...,good gift foreign visitor bring invite someone...
1,284623,413904,559402,0,61,39,1,12,8,4,good alternative cut brisket can not find,best wood smoke brisket
2,37445,74608,74609,0,44,64,3,8,12,4,horror movie jump scare,possible create good horror film without jump ...
3,299330,587921,587922,0,76,39,1,12,7,5,ethical take vegetarian v vegan v non vegetari...,non vegetarian date vegetarian
4,204421,403323,403324,0,56,63,2,9,10,1,good tip young biotech enterpreneurs,must young entrepreneur know build company


In [26]:
# Question pairs from the spaCy-based cleaning run; per the previews it has the
# same columns as nltk_data.
spacy_data = pd.read_csv('cleaned_questions_spacy.csv')
spacy_data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,lengthq1,lengthq2,common_words,q1_wordlen,q2_wordlen,word_difference,clean_question1,clean_question2
0,236588,466074,466075,0,120,119,19,22,22,0,good gift foreign visitor bring invite someone...,good gift foreign visitor bring invite someone...
1,284623,413904,559402,0,61,39,1,12,8,4,good alternative cut brisket can not find,good wood smoke brisket
2,37445,74608,74609,0,44,64,3,8,12,4,horror movie jump scare,possible create good horror film without jump ...
3,299330,587921,587922,0,76,39,1,12,7,5,ethical take vegetarian vs vegan vs non vegeta...,non vegetarian date vegetarian
4,204421,403323,403324,0,56,63,2,9,10,1,good tip young biotech enterpreneur,must young entrepreneur know build company


## 3.1 Vectorising using TF-IDF

In [27]:
def preprocess_data(data):
    """Replace missing questions with empty strings.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``clean_question1`` and ``clean_question2`` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with NaN/None entries in
        the two question columns replaced by ``""``.
    """
    for column in ('clean_question1', 'clean_question2'):
        # Assign the filled column back instead of calling
        # ``data[column].fillna(..., inplace=True)``: the chained in-place form
        # operates on a temporary under pandas Copy-on-Write and would leave
        # the NaNs in ``data`` untouched.
        data[column] = data[column].fillna("")
    return data

def vectorize_questions(data, tfidf_vectorizer):
    """Transform both question columns with an already-fitted vectorizer.

    Returns a ``(q1, q2)`` pair holding whatever ``tfidf_vectorizer.transform``
    produces for ``clean_question1`` and ``clean_question2`` respectively.
    """
    q1_matrix, q2_matrix = (
        tfidf_vectorizer.transform(data[column])
        for column in ('clean_question1', 'clean_question2')
    )
    return q1_matrix, q2_matrix

# Load the fitted TF-IDF vectorizer and Truncated SVD from disk when they are
# available; otherwise fit them on the full dataset and cache them. This keeps
# "Restart & Run All" working on a fresh checkout that has no .joblib files.
# NOTE: joblib artefacts are pickles -- only load files you produced yourself.
try:
    # Load the TF-IDF vectorizer
    tfidf_vectorizer = load('tfidf_vectorizer.joblib')

    # Load the Truncated SVD model
    svd = load('svd_model.joblib')
except FileNotFoundError:
    # Preprocess OG data
    og_data_preprocessed = preprocess_data(og_data)

    # Train TF-IDF Vectorizer on OG data (both question columns form the corpus)
    tfidf_vectorizer = TfidfVectorizer(min_df=10, max_df=0.5)
    tfidf_vectorizer.fit(pd.concat([og_data_preprocessed['clean_question1'], og_data_preprocessed['clean_question2']]))

    # Vectorize OG data questions
    tfidf_og_q1, tfidf_og_q2 = vectorize_questions(og_data_preprocessed, tfidf_vectorizer)

    # Combine and train Truncated SVD on the vectorized OG data
    svd = TruncatedSVD(n_components=300, random_state=42)
    combined_tfidf_og = scipy.sparse.vstack((tfidf_og_q1, tfidf_og_q2))
    svd.fit(combined_tfidf_og)

    # Save the TF-IDF vectorizer
    dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

    # Save the Truncated SVD model
    dump(svd, 'svd_model.joblib')

In [28]:
def reduce_dimensionality(tfidf_q1, tfidf_q2, svd):
    """Project both TF-IDF matrices with an already-fitted ``svd``.

    Returns a ``(q1_reduced, q2_reduced)`` tuple of the ``svd.transform``
    results, in the same order as the inputs.
    """
    return tuple(svd.transform(matrix) for matrix in (tfidf_q1, tfidf_q2))


def calculate_squared_differences(tfidf_q1_reduced, tfidf_q2_reduced):
    """Return the element-wise squared difference of the two inputs."""
    return np.square(tfidf_q1_reduced - tfidf_q2_reduced)



def process_pipeline(data, tfidf_vectorizer, svd):
    """Turn a question-pair frame into TF-IDF/SVD squared-difference features.

    The stages run in order: fill missing questions, vectorize both question
    columns with the fitted ``tfidf_vectorizer``, reduce both with the fitted
    ``svd``, then take the element-wise squared difference of the two reduced
    representations, which is returned.
    """
    cleaned = preprocess_data(data)
    q1_sparse, q2_sparse = vectorize_questions(cleaned, tfidf_vectorizer)
    q1_dense, q2_dense = reduce_dimensionality(q1_sparse, q2_sparse, svd)
    return calculate_squared_differences(q1_dense, q2_dense)

In [40]:
# TF-IDF/SVD squared-difference features for the NLTK-cleaned pairs.
# process_pipeline runs preprocess_data on the frame it is given, which
# replaces missing questions with empty strings.
squared_differences_nltk = process_pipeline(nltk_data, tfidf_vectorizer, svd)

# The same features for the spaCy-cleaned pairs.
squared_differences_spacy = process_pipeline(spacy_data, tfidf_vectorizer, svd)

In [41]:

# Convert squared differences to DataFrames that share the row index of their
# source frame, so the column-wise concat below pairs rows explicitly rather
# than relying on both sides happening to carry a default RangeIndex.
squared_differences_nltk_df = pd.DataFrame(squared_differences_nltk, index=nltk_data.index)
squared_differences_spacy_df = pd.DataFrame(squared_differences_spacy, index=spacy_data.index)

# Build the TF-IDF feature tables under their own names instead of rebinding
# nltk_data / spacy_data. Rebinding made this cell non-idempotent (every re-run
# appended another copy of the columns) and leaked the integer-labelled TF-IDF
# columns into the BERT section, where they collide with the BERT column labels.
nltk_tfidf_data = pd.concat([nltk_data, squared_differences_nltk_df], axis=1)
spacy_tfidf_data = pd.concat([spacy_data, squared_differences_spacy_df], axis=1)

# The concat must add columns only, never rows.
assert len(nltk_tfidf_data) == len(nltk_data)
assert len(spacy_tfidf_data) == len(spacy_data)

# Save the nltk dataframe
nltk_tfidf_data.to_csv('nltk_embeddings.csv', index=False)

# Save the spacy dataframe
spacy_tfidf_data.to_csv('spacy_embeddings.csv', index=False)


## 3.2 Vectorising using BERT

In [29]:
def bert_embeddings(texts, model_name='bert-base-uncased', max_length=128, batch_size=32):
    """Embed each text as the mean of its last-layer BERT token vectors.

    Parameters
    ----------
    texts : sequence of str
        A pandas Series, NumPy array or plain list; it only needs to support
        ``len()`` and slicing.
    model_name : str
        Name passed to ``from_pretrained`` for both tokenizer and model.
    max_length : int
        Texts are truncated to this many tokens.
    batch_size : int
        Number of texts encoded per forward pass.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), hidden_size)`` with one row per text.

    Notes
    -----
    The mean is taken over real tokens only. Padding positions are excluded
    through the attention mask, so a text's embedding does not depend on which
    other texts share its batch.
    """
    # Load pre-trained model tokenizer and model. Only the last layer is
    # pooled, so the per-layer hidden states are not requested.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Check if CUDA is available and if not, use CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode

    # Calculate total batches (ceiling division)
    num_texts = len(texts)
    total_batches = (num_texts + batch_size - 1) // batch_size
    print(f"Total batches: {total_batches}")

    embeddings = []

    # Process texts in batches
    for i in range(0, num_texts, batch_size):
        # list() accepts a Series slice, an ndarray slice and a list slice alike
        batch_texts = list(texts[i:i + batch_size])

        # Tokenize and encode the batch
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

        # Get embeddings
        with torch.no_grad():
            outputs = model(**encoded_input)

        # Masked mean pooling: zero out padding positions and divide by the
        # number of real tokens. clamp() guards against division by zero.
        token_vectors = outputs.last_hidden_state
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
        pooled = (token_vectors * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        # Move embeddings to CPU and convert to numpy
        embeddings.append(pooled.cpu().numpy())

        # Print batches done
        print(f"Batches done: {i // batch_size + 1}")

    if not embeddings:
        # Keep a 2-D result for empty input so downstream code sees consistent columns
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.concatenate(embeddings, axis=0)

def process_pipeline_bert(data):
    """Turn a question-pair frame into BERT squared-difference features.

    Missing questions are filled first. ``clean_question1`` and then
    ``clean_question2`` are embedded with ``bert_embeddings`` using its
    defaults, and the element-wise squared difference of the two embedding
    arrays is returned.
    """
    cleaned = preprocess_data(data)
    q1_vectors, q2_vectors = (
        bert_embeddings(cleaned[column])
        for column in ('clean_question1', 'clean_question2')
    )
    return calculate_squared_differences(q1_vectors, q2_vectors)

# BERT squared-difference features for the NLTK-cleaned pairs.
# This is expensive: process_pipeline_bert calls bert_embeddings once per
# question column, and each call loads the model and encodes every row.
nltk_squared_differences = process_pipeline_bert(nltk_data)

# Wrap the feature array in a DataFrame; its columns get default integer labels.
nltk_squared_differences_df = pd.DataFrame(nltk_squared_differences)

# Append the BERT features to the nltk dataframe.
# NOTE(review): this rebinds nltk_data, so re-running the cell appends the
# columns a second time. If nltk_data already carries the integer-labelled
# TF-IDF columns from section 3.1, the labels will collide.
nltk_data = pd.concat([nltk_data, nltk_squared_differences_df], axis=1)

# Same procedure for the spaCy dataset.

# BERT squared-difference features for the spaCy-cleaned pairs.
spacy_squared_differences = process_pipeline_bert(spacy_data)

# Wrap the feature array in a DataFrame; its columns get default integer labels.
spacy_squared_differences_df = pd.DataFrame(spacy_squared_differences)

# Append the BERT features to the spacy dataframe (same rebinding caveat as above).
spacy_data = pd.concat([spacy_data, spacy_squared_differences_df], axis=1)




Total batches: 313
Batches done: 1
Batches done: 2
Batches done: 3
Batches done: 4
Batches done: 5
Batches done: 6
Batches done: 7
Batches done: 8
Batches done: 9
Batches done: 10
Batches done: 11
Batches done: 12
Batches done: 13
Batches done: 14
Batches done: 15
Batches done: 16
Batches done: 17
Batches done: 18
Batches done: 19
Batches done: 20
Batches done: 21
Batches done: 22
Batches done: 23
Batches done: 24
Batches done: 25
Batches done: 26
Batches done: 27
Batches done: 28
Batches done: 29
Batches done: 30
Batches done: 31
Batches done: 32
Batches done: 33
Batches done: 34
Batches done: 35
Batches done: 36
Batches done: 37
Batches done: 38
Batches done: 39
Batches done: 40
Batches done: 41
Batches done: 42
Batches done: 43
Batches done: 44
Batches done: 45
Batches done: 46
Batches done: 47
Batches done: 48
Batches done: 49
Batches done: 50
Batches done: 51
Batches done: 52
Batches done: 53
Batches done: 54
Batches done: 55
Batches done: 56
Batches done: 57
Batches done: 58
Batc

In [33]:
# Persist the nltk dataframe, which by now carries the BERT
# squared-difference columns appended in section 3.2.
nltk_data.to_csv('nltk_embeddings_bert.csv', index=False)

# Persist the spacy dataframe with its BERT squared-difference columns.
spacy_data.to_csv('spacy_embeddings_bert.csv', index=False)