In [23]:
import pandas as pd
import numpy as np
import re
import string
import nltk

In [24]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
import matplotlib.pyplot as plt
df = pd.read_csv("questions.csv")


In [27]:
df

Unnamed: 0,question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
4994,Who is currently winning the presidential elec...
4995,"What has a better ROI, marketing on radio sta..."
4996,Which mobile is good for 50k?
4997,Is the character Jane in the movie Predestinat...


In [28]:
# Data cleaning and preprocessing

import nltk
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords]
    # Stemming
    stemmer = SnowballStemmer('english')
    tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhargavram/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bhargavram/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
df['cleaned_text'] = df['question'].apply(clean_text)

In [30]:
from sklearn.model_selection import train_test_split
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [31]:
# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['cleaned_text'])


In [32]:
# Similarity calculation
similarity_matrix_train = cosine_similarity(tfidf_matrix_train)
similarity_matrix_train

array([[1.        , 0.09272997, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.09272997, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [33]:
# Clustering
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix_train)
clusters_train = km.labels_.tolist()


In [34]:
# Duplicate detection
import seaborn as sns

train_questions = train_df['question'].tolist()
clustered_questions_train = {i: [] for i in range(num_clusters)}
for i, cluster in enumerate(clusters_train):
    clustered_questions_train[cluster].append(train_questions[i])
    
duplicates_train = []

In [35]:

for cluster in clustered_questions_train:
    if len(clustered_questions_train[cluster]) > 0.1:
        group = clustered_questions_train[cluster]
        pairwise_similarity = similarity_matrix_train[np.ix_([train_questions.index(q) for q in group], [train_questions.index(q) for q in group])]
        indices = np.argmin(pairwise_similarity, axis=1)
        duplicates_train.extend([(group[i], group[j]) for i, j in zip(range(len(group)), indices) if i < indices[i]])

In [36]:
# Print the duplicates in the training set
print("Duplicate questions in the training set:")
for pair in duplicates_train:
    print(pair)

Duplicate questions in the training set:
('What is it like to own and drive a Nissan GT-R?', 'Why is dividing by zero an error or undefined?')
('What are the best books for basic electrical engineering?', 'What are the ways to learn about stock market?')
('What is the best black Friday deal for this year?', 'What are the ways to learn about stock market?')
('What is the best computer science course I can take that will not focus too much on the language they use?', 'What are the ways to learn about stock market?')
('What is the best plugin list for WordPress blog?', 'What are the ways to learn about stock market?')
('What are some best examples of Presence of mind?', 'What are the ways to learn about stock market?')
('What are best Hollywood movies?', 'What are the ways to learn about stock market?')
('What is the best way to reduce abdominal fat?', 'What are the newest or best gadgets, tools, techniques that help one learn quicker, memorize tons of complex books, sleep less, or be mor

In [37]:
user_input = input("Enter a question: ")

Enter a question: in which location did mahatma gandhi born


In [38]:
similarity_matrix_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
test_questions = test_df['question'].tolist()

In [39]:
cleaned_user_input = clean_text(user_input)
tfidf_user_input = tfidf_vectorizer.transform([cleaned_user_input])
similarity_scores = cosine_similarity(tfidf_user_input, tfidf_matrix_train)[0]
indices = np.argpartition(similarity_scores, -3)[-3:]
duplicates  = []

In [40]:
for i in range(len(indices)):
    if similarity_scores[indices[i]] >= 0.1:
        duplicate_question = train_questions[indices[i]]
        duplicates.append(duplicate_question)
if duplicates ==  None:
    print("The question is not a duplicate.")
if duplicates is not  None:
    print("The question is similar to the following question in the training set:")
        
    for i in duplicates:
        print(i)

The question is similar to the following question in the training set:
From what place did Mahatma Gandhi come into the world?
In which location did Mahatma Gandhi originate from?
Can you tell me where Mahatma Gandhi was born?
