In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Loading Dataset

In [2]:
# Load the train dataset
train_data = pd.read_csv('/content/drive/MyDrive/datasets/quora_train.csv')

In [3]:
# Load the test dataset
test_data = pd.read_csv('/content/drive/MyDrive/datasets/quora_test.csv')

#Preprocessing

In [4]:
train_data.isna().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [5]:
test_data.isna().sum()

test_id      0
question1    2
question2    4
dtype: int64

In [6]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

In [17]:
# Preprocess the train data
train_data['question1'] = train_data['question1'].astype(str).apply(lambda x: x.lower())
train_data['question2'] = train_data['question2'].astype(str).apply(lambda x: x.lower())

In [None]:
# Preprocess the test data
test_data['question1'] = test_data['question1'].astype(str).apply(lambda x: x.lower())
test_data['question2'] = test_data['question2'].astype(str).apply(lambda x: x.lower())

# Vectorization using TF-IDF

In [None]:
# Convert the question pairs to TF-IDF vectors
train_questions = train_data['question1'].tolist() + train_data['question2'].tolist()
test_questions = test_data['question1'].tolist() + test_data['question2'].tolist()

In [None]:
vectorizer = TfidfVectorizer()
train_tfidf_vectors = vectorizer.fit_transform(train_questions)
test_tfidf_vectors = vectorizer.transform(test_questions)

#Splitting Test Vectors

In [None]:
# Split the test vectors into separate question1 and question2 vectors
num_test_samples = len(test_data)
test_question1_vectors = test_tfidf_vectors[:num_test_samples]
test_question2_vectors = test_tfidf_vectors[num_test_samples:]

#Calculating Cosine Similarity

In [None]:

# Calculate cosine similarity between the question pairs
similarity_scores = (test_question1_vectors.multiply(test_question2_vectors)).sum(axis=1).A1

#Printing Similarity Scores

In [16]:
# Print the similarity scores
for i, score in enumerate(similarity_scores):
    print(f"Question pair {i}: Similarity score = {score}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Question pair 2340790: Similarity score = 0.46550837531596645
Question pair 2340791: Similarity score = 0.0
Question pair 2340792: Similarity score = 0.35182368140142106
Question pair 2340793: Similarity score = 0.5165888266805998
Question pair 2340794: Similarity score = 0.014603945821079057
Question pair 2340795: Similarity score = 0.495846108680908
Question pair 2340796: Similarity score = 0.024828349699001412
Question pair 2340797: Similarity score = 0.7539586179249163
Question pair 2340798: Similarity score = 0.017614976408361946
Question pair 2340799: Similarity score = 0.01885454760568587
Question pair 2340800: Similarity score = 0.0
Question pair 2340801: Similarity score = 0.7458559201536179
Question pair 2340802: Similarity score = 0.05495275865558902
Question pair 2340803: Similarity score = 0.3256803366991537
Question pair 2340804: Similarity score = 0.0
Question pair 2340805: Similarity score = 0.203952306537