In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
import spacy

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nandaniyadav/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nandaniyadav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/nandaniyadav/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nandaniyadav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
data = pd.read_csv('questions.csv')

In [43]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [44]:
data.is_duplicate.value_counts()

0    255045
1    149306
Name: is_duplicate, dtype: int64

## Tasks at our disposal
#### 1. EDA
#### 2. Data Cleaning - Stopwords , Alphanucmeric only
#### 3. Feature Engineering - Vectorizers
#### 4. Model Building - train/test split , initial model. Parameteization with updating model params and corpus methods like vectorizer etc 
#### 5. Model Evaluation - COnfusion matrix , metrics peresicion recall accuracy f1
#### 6. Hosting on streamlit
#### 7. 2 page report
#### 8. Video recording
#### 9. Presentation 15 mins in class

# 1. EDA


In [45]:
#Dataset Shape
print("dataframe shape: ",data.shape)

#null vs not null
data.isnull().sum()

dataframe shape:  (404351, 6)


id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [46]:
#remove null values
data = data.dropna()

In [47]:
data.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

# 2. Data Cleaning

#### 2.1 Basic Cleaning

In [48]:
# Download stopwords from NLTK
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to expand contractions
def expand_contractions(text):
    # Dictionary of English contractions
    contractions_dict = {"don't": "do not", "doesn't": "does not", "didn't": "did not",
                         # Add more contractions as needed
                        }
    # Regular expression for finding contractions
    contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

    def replace(match):
        return contractions_dict[match.group(0)]

    return contractions_re.sub(replace, text)

#function to handle LaTeX expressions
def clean_math_text(text):

    replacements = {
        # Basic operations and structures
        r'\\frac\{(.*?)\}\{(.*?)\}': r'\1 over \2',
        r'\\sqrt\{(.*?)\}': r'square root of \1',
        r'\\sum_(\{.*?\})\^(\{.*?\})': r'sum from \1 to \2',
        r'\\int_(\{.*?\})\^(\{.*?\})': r'integral from \1 to \2',
        r'\\log_(\{.*?\})\{(.*?)\}': r'log base \1 of \2',
        r'\\lim_(\{.*?\})': r'limit as \1',
        r'(\d+)\^(\{?\d+\}?)': r'\1 to the power of \2',
        r'\\infty': 'infinity',
        r'\\pm': 'plus or minus',
        # Greek letters
        r'\\alpha': 'alpha', r'\\beta': 'beta', r'\\gamma': 'gamma',
        r'\\delta': 'delta', r'\\epsilon': 'epsilon', r'\\zeta': 'zeta',
        r'\\eta': 'eta', r'\\theta': 'theta', r'\\iota': 'iota',
        r'\\kappa': 'kappa', r'\\lambda': 'lambda', r'\\mu': 'mu',
        r'\\nu': 'nu', r'\\xi': 'xi', r'\\omicron': 'omicron',
        r'\\pi': 'pi', r'\\rho': 'rho', r'\\sigma': 'sigma',
        r'\\tau': 'tau', r'\\upsilon': 'upsilon', r'\\phi': 'phi',
        r'\\chi': 'chi', r'\\psi': 'psi', r'\\omega': 'omega',
        # Trigonometric functions
        r'\\sin': 'sine', r'\\cos': 'cosine', r'\\tan': 'tangent',
        r'\\csc': 'cosecant', r'\\sec': 'secant', r'\\cot': 'cotangent',
        # Differential and partial differential
        r'\\partial': 'partial', r'\\nabla': 'nabla',
        r'\\mathrm\{d\}': 'd',  # For derivatives
        # Other mathematical symbols
        r'\\times': 'times', r'\\div': 'divided by', r'\\cdot': 'dot',
        # Additional symbols and operations
        r'\+': 'plus', r'\-': 'minus', r'\*': 'times',
        # Handling general exponentiation
        r'\\exp\{(.*?)\}': r'e to the power of \1',  # For exponential functions
        r'(\w+)\^(\w+)': r'\1 to the power of \2',  # General exponentiation
        # Handling \mathop
        r'\\mathop\{\\rm ([^}]+)\}': r'operator \1'    }
    
    # Function to apply replacements to a matched object
    def apply_replacements(match):
        # Extracting the matched text excluding the [math] tags
        math_text = match.group(1) # match.group(0) includes the whole match, so match.group(1) is the first capture group
        
        # Applying all replacements to the math_text
        for pattern, replacement in replacements.items():
            math_text = re.sub(pattern, replacement, math_text)
        
        # Return the transformed math_text
        return math_text

    # Use=ing re.sub with a function that applies the replacements for each [math] section
    # Pattern captures the content between [math] and [/math] tags
    pattern = r'\[math\](.*?)\[/math\]'
    clean_text = re.sub(pattern, apply_replacements, text)

    # Removing unnecessary braces and cleanup, applied globally to the whole text
    clean_text = re.sub(r'\{|\}', '', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

# Function to clean text
def clean_text(text):
    #handling LaTex expressions
    text = clean_math_text(text)
    # Lowercase conversion
    text = text.lower()
    # Removing HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Expanding contractions
    text = expand_contractions(text)
    # Removing special characters
    text = re.sub(r'\W', ' ', text)
    # Removing extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # removing stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Applying the cleaning function to DataFrame
data['clean_question1'] = data['question1'].apply(clean_text)
data['clean_question2'] = data['question2'].apply(clean_text)

#dropping the original columns
data.drop(['question1', 'question2'], axis=1, inplace=True)

# Displaying the cleaned dataset
data.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nandaniyadav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2
0,0,1,2,0,step step guide invest share market india,step step guide invest share market
1,1,3,4,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...
2,2,5,6,0,increase speed internet connection using vpn,internet speed increased hacking dns
3,3,7,8,0,mentally lonely solve,find remainder 23 power 24 divided 24 23
4,4,9,10,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water


In [49]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404348 entries, 0 to 404350
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               404348 non-null  int64 
 1   qid1             404348 non-null  int64 
 2   qid2             404348 non-null  int64 
 3   is_duplicate     404348 non-null  int64 
 4   clean_question1  404348 non-null  object
 5   clean_question2  404348 non-null  object
dtypes: int64(4), object(2)
memory usage: 21.6+ MB


### 2.2 Lemmatization

#### 2.2.1 Lemmatization using SpaCy

In [50]:
##DO not run, just for reference
# Loading spaCy's English language model
# nlp = spacy.load("en_core_web_sm")

# # Function to lemmatize text
# def lemmatize_text(text):
#     doc = nlp(text)
#     lemmatized_list = [token.lemma_ for token in doc if token.is_alpha]
    
#     return ' '.join(lemmatized_list)

# data['clean_question1'] = data['clean_question1'].apply(lemmatize_text)
# data['clean_question2'] = data['clean_question2'].apply(lemmatize_text)

In [51]:
data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2
0,0,1,2,0,step step guide invest share market india,step step guide invest share market
1,1,3,4,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...
2,2,5,6,0,increase speed internet connection using vpn,internet speed increased hacking dns
3,3,7,8,0,mentally lonely solve,find remainder 23 power 24 divided 24 23
4,4,9,10,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water


#### 2.2.2 Lemmatization using NLTK

 part of speech (POS) tagging is necessary for lemmatization in NLTK, especially in our use case of comparing questions to determine duplicates. The reason is that NLTK's lemmatization depends on POS tags to correctly identify the base form of words. Different words require different lemmatization processes based on their POS (e.g., verbs, nouns, adjectives). Without POS tagging, lemmatization may not accurately reduce words to their base or dictionary form, which could affect the comparison and analysis of question pairs for duplicates.

In [52]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    # if pd.isnull(sentence):  # Check if the sentence is NaN
    #     return ""  # Return empty string for NaN values
    
    # Tokenize the text
    tokens = nltk.word_tokenize(sentence)
    
    # Tag tokens with part-of-speech
    pos_tagged = nltk.pos_tag(tokens)
    
    # Function to convert nltk tag to wordnet tag
    def nltk_tag_to_wordnet_tag(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return None
    
    # Lemmatize each token
    lemmatized_sentence = []
    for word, tag in pos_tagged:
        wordnet_tag = nltk_tag_to_wordnet_tag(tag)
        if wordnet_tag is not None:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, wordnet_tag))
        else:
            lemmatized_sentence.append(word)
    
    # Reconstruct the sentence
    return ' '.join(lemmatized_sentence)

# Replace NaN values with an empty string and apply the lemmatization
data['lemmatized_question1'] = data['clean_question1'].apply(lemmatize_sentence)
data['lemmatized_question2'] = data['clean_question2'].apply(lemmatize_sentence)


We will proceed with Lemmatization using NLTK and POS tagging. It is more faster that SpaCy and we have a large dataset to process.

In [53]:
#making a new column to find the length of the questions
data['len_q1'] = data['lemmatized_question1'].apply(lambda x: len(x))
data['len_q2'] = data['lemmatized_question2'].apply(lambda x: len(x))

In [54]:
#saving cleaned file as csv
data.to_csv('cleaned_questions.csv', index=False)

In [56]:
data.head()

Unnamed: 0,id,qid1,qid2,is_duplicate,clean_question1,clean_question2,lemmatized_question1,lemmatized_question2,len_q1,len_q2
0,0,1,2,0,step step guide invest share market india,step step guide invest share market,step step guide invest share market india,step step guide invest share market,41,35
1,1,3,4,0,story kohinoor koh noor diamond,would happen indian government stole kohinoor ...,story kohinoor koh noor diamond,would happen indian government steal kohinoor ...,31,67
2,2,5,6,0,increase speed internet connection using vpn,internet speed increased hacking dns,increase speed internet connection use vpn,internet speed increase hack dns,42,32
3,3,7,8,0,mentally lonely solve,find remainder 23 power 24 divided 24 23,mentally lonely solve,find remainder 23 power 24 divide 24 23,21,39
4,4,9,10,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water,one dissolve water quikly sugar salt methane c...,fish would survive salt water,60,29


## 3.1 Vectorising using TFIDF

In [19]:
#vecoring the q1 and q2
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_q1 = tfidf.fit_transform(data['q1_clean'])
tfidf_q2 = tfidf.transform(data['q2_clean'])

tfidf_q1.shape, tfidf_q2.shape

((404348, 69029), (404348, 69029))

In [28]:
#printing a single vecotor of q1
print(data['question1'][0])
print(data['q1_clean'][0])
print(tfidf_q1[0],"\n")

print(data['question1'][1])
print(data['q1_clean'][1])
print(tfidf_q1[1])

What is the step by step guide to invest in share market in india?
step step guide invest share market india
  (0, 31162)	0.20685307207285322
  (0, 38279)	0.2879106302305413
  (0, 55758)	0.33466054360188274
  (0, 32212)	0.3263199368967071
  (0, 27487)	0.406764291381842
  (0, 58540)	0.7002711661711661 

What is the story of Kohinoor (Koh-i-Noor) Diamond?
story kohinoor kohinoor diamond
  (0, 19131)	0.3614777376192901
  (0, 34844)	0.895095953812048
  (0, 58733)	0.261030800241925


## 3.2 Vectorising using Word2Vec

In [33]:
#vecotorising using word2vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

#tokenizing the questions
data['q1_tokens'] = data['q1_clean'].apply(lambda x: word_tokenize(x))
data['q2_tokens'] = data['q2_clean'].apply(lambda x: word_tokenize(x))

data.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_clean,q2_clean,len_q1,len_q2,q1_tokens,q2_tokens
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,step step guide invest share market india,step step guide invest share market,41,35,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,story kohinoor kohinoor diamond,would happen indian government steal kohinoor ...,31,67,"[story, kohinoor, kohinoor, diamond]","[would, happen, indian, government, steal, koh..."
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,increase speed internet connection use vpn,internet speed increase hack dns,42,32,"[increase, speed, internet, connection, use, vpn]","[internet, speed, increase, hack, dns]"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,mentally lonely solve,find remainder math2324math divide 2423,21,39,"[mentally, lonely, solve]","[find, remainder, math2324math, divide, 2423]"
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,one dissolve water quikly sugar salt methane c...,fish would survive salt water,60,29,"[one, dissolve, water, quikly, sugar, salt, me...","[fish, would, survive, salt, water]"


In [40]:
# Combining tokens from q1 and q2 for training the Word2Vec model
combined_tokens = data['q1_tokens'].tolist() + data['q2_tokens'].tolist()

# Training the Word2Vec model
model = Word2Vec(combined_tokens, vector_size=100, window=5, min_count=1, workers=4)
model.train(combined_tokens, total_examples=model.corpus_count, epochs=10)

# Adjusting the get_average_word2vec function to work with the vector model and tokens
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=100):
    if len(tokens_list) < 1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

# Adjusting the get_word2vec_embeddings function to handle both q1 and q2
def get_word2vec_embeddings(vectors, tokens):
    embeddings = tokens.apply(lambda x: get_average_word2vec(x, vectors))
    return list(embeddings)

# Applying the function to get the Word2Vec embeddings for both questions
q1_embeddings = get_word2vec_embeddings(model.wv, data['q1_tokens'])
q2_embeddings = get_word2vec_embeddings(model.wv, data['q2_tokens'])

# Verifying the shape of the embeddings
np.array(q1_embeddings).shape, np.array(q2_embeddings).shape

((404348, 100), (404348, 100))

In [41]:
#printing the embeddings

print(data['question1'][0])
print(data['q1_clean'][0])
print(data['q1_tokens'][0])
print(q1_embeddings[0],"\n")

What is the step by step guide to invest in share market in india?
step step guide invest share market india
['step', 'step', 'guide', 'invest', 'share', 'market', 'india']
[-0.3253462   0.26244473 -0.08660941  0.37559232 -0.163662   -0.4149309
  0.7285487  -0.25965905 -0.24108954  0.4867224   0.14792326  0.37749586
  0.42332098  0.61909854  0.33205995 -0.432712   -0.11139467 -0.09459262
 -0.02723896  0.29754353  1.2064008   0.5660844  -1.7789133  -0.11842483
  1.3022009  -1.3495013  -0.25312966 -0.39462867 -1.2683771   0.5267576
 -0.13193925 -0.24340534  0.54764193 -0.8048476  -0.28579298 -0.41477618
 -0.282823    1.0840447   0.528861   -0.32653052 -1.5251633  -0.54182595
  0.30241483  0.2887678  -1.1410592   0.8777286   0.74321955 -1.486635
  0.7986768   0.01431977  1.2304507   0.92896974 -0.5894543   0.5027923
  0.60770565 -0.11172057 -0.4060655  -0.08909579  0.01043369  0.22586478
 -2.0197217   0.47006813 -0.37562582  0.5702934   0.40824673 -0.82841676
 -0.5137155   0.02935838  0.1