<a href="https://colab.research.google.com/github/SrikanthDodle/Quora-Duplicate-Questions-Pairs/blob/main/Duplicate_Questions_Classification_using_Multiple_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Required Libraries**

In [1]:
import numpy as np  # NumPy for numerical operations
import pandas as pd  # Pandas for data manipulation


In [None]:

# Reading the Quora training dataset into a Pandas DataFrame
df = pd.read_csv('/content/train.csv', encoding='utf-8')

# Displaying the shape (number of rows and columns) of the DataFrame
df.shape

(404290, 6)

In [None]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
df.columns

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'], dtype='object')

In [None]:
# Drop the unnecessary columns
df = df[['question1', 'question2', 'is_duplicate']]
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
df.columns

Index(['question1', 'question2', 'is_duplicate'], dtype='object')

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Data preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-word characters
    text = re.sub(r'\W', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove leading/trailing spaces
    text = text.strip()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a single string
    text = ' '.join(tokens)
    return text

In [None]:
df_sampled = df.sample(n=40000, random_state=42)

In [None]:
# Apply preprocessing to the sampled questions
df_sampled['question1'] = df_sampled['question1'].apply(preprocess_text)
df_sampled['question2'] = df_sampled['question2'].apply(preprocess_text)

# Display the first few rows to verify preprocessing
print(df_sampled[['question1', 'question2']].head())

                                                question1  \
8067                                play pokémon go korea   
368101                           best side dish crab cake   
70497   advisable better material crash test automobil...   
226567                  improve logical programming skill   
73186                             close see 3rd world war   

                                question2  
8067                play pokémon go china  
368101     good side dish buffalo chicken  
70497        best server setup buddypress  
226567  improve logical skill programming  
73186                 close world war iii  


In [None]:
# Import necessary libraries for vectorization and machine learning models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack

In [None]:
# Define the target variable
y = df_sampled['is_duplicate']

# Split the data into training and testing sets
X_train_q1, X_test_q1, y_train, y_test = train_test_split(df_sampled['question1'], y, test_size=0.2, random_state=42)
X_train_q2, X_test_q2, _, _ = train_test_split(df_sampled['question2'], y, test_size=0.2, random_state=42)

# Initialize CountVectorizer
count_vectorizer = CountVectorizer()

# Fit and transform the training data, transform the testing data for both questions
X_train_q1_vec = count_vectorizer.fit_transform(X_train_q1)
X_test_q1_vec = count_vectorizer.transform(X_test_q1)
X_train_q2_vec = count_vectorizer.fit_transform(X_train_q2)
X_test_q2_vec = count_vectorizer.transform(X_test_q2)

# Concatenate the vectorized representations of both questions
X_train_vec = hstack([X_train_q1_vec, X_train_q2_vec])
X_test_vec = hstack([X_test_q1_vec, X_test_q2_vec])


In [None]:
# Initialize and train Logistic Regression model
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred_lr = logistic_regression_model.predict(X_test_vec)

# Evaluate performance
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

# Print the evaluation metrics
print('Logistic Regression with CountVectorizer:')
print(f'Accuracy: {accuracy_lr:.4f}')
print(f'Precision: {precision_lr:.4f}')
print(f'Recall: {recall_lr:.4f}')
print(f'F1 Score: {f1_lr:.4f}')
print('-' * 30)

Logistic Regression with CountVectorizer:
Accuracy: 0.7037
Precision: 0.6056
Recall: 0.5503
F1 Score: 0.5766
------------------------------


In [None]:
# Initialize and train Decision Tree model
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred_dt = decision_tree_model.predict(X_test_vec)

# Evaluate performance
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# Print the evaluation metrics
print('Decision Tree with CountVectorizer:')
print(f'Accuracy: {accuracy_dt:.4f}')
print(f'Precision: {precision_dt:.4f}')
print(f'Recall: {recall_dt:.4f}')
print(f'F1 Score: {f1_dt:.4f}')
print('-' * 30)

Decision Tree with CountVectorizer:
Accuracy: 0.6957
Precision: 0.5937
Recall: 0.5390
F1 Score: 0.5650
------------------------------


In [None]:
# Initialize and train Random Forest model
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred_rf = random_forest_model.predict(X_test_vec)

# Evaluate performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# Print the evaluation metrics
print('Random Forest with CountVectorizer:')
print(f'Accuracy: {accuracy_rf:.4f}')
print(f'Precision: {precision_rf:.4f}')
print(f'Recall: {recall_rf:.4f}')
print(f'F1 Score: {f1_rf:.4f}')
print('-' * 30)

Random Forest with CountVectorizer:
Accuracy: 0.7351
Precision: 0.6702
Recall: 0.5465
F1 Score: 0.6021
------------------------------


In [None]:
# Initialize and train XGBoost model
xgb_model = XGBClassifier()
xgb_model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test_vec)

# Evaluate performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

# Print the evaluation metrics
print('XGBoost with CountVectorizer:')
print(f'Accuracy: {accuracy_xgb:.4f}')
print(f'Precision: {precision_xgb:.4f}')
print(f'Recall: {recall_xgb:.4f}')
print(f'F1 Score: {f1_xgb:.4f}')
print('-' * 30)

XGBoost with CountVectorizer:
Accuracy: 0.7191
Precision: 0.7399
Recall: 0.3607
F1 Score: 0.4850
------------------------------


In [None]:
# Import necessary libraries for TF-IDF vectorization and models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data, transform the testing data for both questions
X_train_q1_tfidf = tfidf_vectorizer.fit_transform(X_train_q1)
X_test_q1_tfidf = tfidf_vectorizer.transform(X_test_q1)
X_train_q2_tfidf = tfidf_vectorizer.fit_transform(X_train_q2)
X_test_q2_tfidf = tfidf_vectorizer.transform(X_test_q2)

# Concatenate the vectorized representations of both questions
X_train_tfidf = hstack([X_train_q1_tfidf, X_train_q2_tfidf])
X_test_tfidf = hstack([X_test_q1_tfidf, X_test_q2_tfidf])

# Train Logistic Regression
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train_tfidf, y_train)
y_pred_lr = logistic_regression_model.predict(X_test_tfidf)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

print('Logistic Regression with TF-IDF:')
print(f'Accuracy: {accuracy_lr:.4f}')
print(f'Precision: {precision_lr:.4f}')
print(f'Recall: {recall_lr:.4f}')
print(f'F1 Score: {f1_lr:.4f}')
print('-' * 30)


Logistic Regression with TF-IDF:
Accuracy: 0.7251
Precision: 0.6712
Recall: 0.4906
F1 Score: 0.5669
------------------------------


In [None]:
# Train Random Forest
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train_tfidf, y_train)
y_pred_rf = random_forest_model.predict(X_test_tfidf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print('Random Forest with TF-IDF:')
print(f'Accuracy: {accuracy_rf:.4f}')
print(f'Precision: {precision_rf:.4f}')
print(f'Recall: {recall_rf:.4f}')
print(f'F1 Score: {f1_rf:.4f}')
print('-' * 30)

Random Forest with TF-IDF:
Accuracy: 0.7518
Precision: 0.6904
Recall: 0.5854
F1 Score: 0.6336
------------------------------


In [None]:
# Train Decision Tree
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train_tfidf, y_train)
y_pred_dt = decision_tree_model.predict(X_test_tfidf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

print('Decision Tree with TF-IDF:')
print(f'Accuracy: {accuracy_dt:.4f}')
print(f'Precision: {precision_dt:.4f}')
print(f'Recall: {recall_dt:.4f}')
print(f'F1 Score: {f1_dt:.4f}')
print('-' * 30)

Decision Tree with TF-IDF:
Accuracy: 0.6744
Precision: 0.5556
Recall: 0.5585
F1 Score: 0.5570
------------------------------


In [None]:
# Train XGBoost
xgb_model = XGBClassifier()
xgb_model.fit(X_train_tfidf, y_train)
y_pred_xgb = xgb_model.predict(X_test_tfidf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)

print('XGBoost with TF-IDF:')
print(f'Accuracy: {accuracy_xgb:.4f}')
print(f'Precision: {precision_xgb:.4f}')
print(f'Recall: {recall_xgb:.4f}')
print(f'F1 Score: {f1_xgb:.4f}')
print('-' * 30)

XGBoost with TF-IDF:
Accuracy: 0.7262
Precision: 0.7159
Recall: 0.4200
F1 Score: 0.5294
------------------------------


In [None]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Define a function to convert sentences to vectors using Word2Vec
def text_to_w2v_vectors(texts, model):
    vectors = []
    for text in texts:
        words = text.split()
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            vectors.append(np.zeros(model.vector_size))
    return np.array(vectors)



In [None]:
# Define the target variable
y = df_sampled['is_duplicate']

# Split the data into training and testing sets for questions
X_train_q1, X_test_q1, y_train, y_test = train_test_split(df_sampled['question1'], y, test_size=0.2, random_state=42)
X_train_q2, X_test_q2, _, _ = train_test_split(df_sampled['question2'], y, test_size=0.2, random_state=42)

# Train Word2Vec model on the combined questions
combined_questions = X_train_q1.tolist() + X_train_q2.tolist()
w2v_model = Word2Vec(sentences=[q.split() for q in combined_questions], vector_size=100, window=5, min_count=1, workers=4)

# Convert questions to Word2Vec vectors
X_train_q1_w2v = text_to_w2v_vectors(X_train_q1, w2v_model)
X_test_q1_w2v = text_to_w2v_vectors(X_test_q1, w2v_model)
X_train_q2_w2v = text_to_w2v_vectors(X_train_q2, w2v_model)
X_test_q2_w2v = text_to_w2v_vectors(X_test_q2, w2v_model)

# Concatenate the Word2Vec representations of both questions
X_train_w2v = np.concatenate((X_train_q1_w2v, X_train_q2_w2v), axis=1)
X_test_w2v = np.concatenate((X_test_q1_w2v, X_test_q2_w2v), axis=1)

# Ensure the lengths match
assert X_train_w2v.shape[0] == len(y_train)
assert X_test_w2v.shape[0] == len(y_test)

In [None]:
# Model training and evaluation

# Logistic Regression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_w2v, y_train)
y_pred_lr = lr_model.predict(X_test_w2v)
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print()



Logistic Regression:
Accuracy: 0.700875
Precision: 0.6541095890410958
Recall: 0.39072621888851006
F1 Score: 0.48922091782283883



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_w2v, y_train)
y_pred_dt = dt_model.predict(X_test_w2v)
print("Decision Tree:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print()

Decision Tree:
Accuracy: 0.6385
Precision: 0.5067858325057928
Recall: 0.5219911353562905
F1 Score: 0.5142761168962042



In [None]:
# Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train_w2v, y_train)
y_pred_rf = rf_model.predict(X_test_w2v)
print("Random Forest:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print()

Random Forest:
Accuracy: 0.732
Precision: 0.6925329428989752
Recall: 0.4838049778383907
F1 Score: 0.5696507426736251



In [None]:

# XGBoost
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_w2v, y_train)
y_pred_xgb = xgb_model.predict(X_test_w2v)
print("XGBoost:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Precision:", precision_score(y_test, y_pred_xgb))
print("Recall:", recall_score(y_test, y_pred_xgb))
print("F1 Score:", f1_score(y_test, y_pred_xgb))
print()

XGBoost:
Accuracy: 0.72225
Precision: 0.6467189434585225
Recall: 0.5342652574156154
F1 Score: 0.5851381628080657

