## Quora_Question_Pairs to for similarity score

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('datasets\Quora_Question_Pairs.csv')
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [2]:
df = df.sample(1000, random_state=42)
df.reset_index(drop=True, inplace=True)
df.shape

(1000, 6)

In [3]:
def detail_info(data):
    temp_df = pd.DataFrame(index= data.columns)
    
    temp_df['data_type'] = data.dtypes
    temp_df['unique_val'] = data.nunique()
    temp_df['duplicate_val'] = data.duplicated().sum()
    temp_df['missing_val'] = data.isnull().sum()
    temp_df['missing_val_%'] = round(data.isnull().mean()*100,2)
    
    return temp_df

detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
id,int64,1000,0,0,0.0
qid1,int64,997,0,0,0.0
qid2,int64,998,0,0,0.0
question1,object,997,0,0,0.0
question2,object,998,0,0,0.0
is_duplicate,int64,2,0,0,0.0


In [4]:
df.drop(["id", "qid1", "qid2"], axis=1, inplace=True)
df.head()

Unnamed: 0,question1,question2,is_duplicate
0,How do I play Pokémon GO in Korea?,How do I play Pokémon GO in China?,0
1,What are some of the best side dishes for crab...,What are some good side dishes for buffalo chi...,0
2,Which is more advisable and better material fo...,What is the best server setup for buddypress?,0
3,How do I improve logical programming skills?,How can I improve my logical skills for progra...,1
4,How close we are to see 3rd world war?,How close is a World War III?,1


In [5]:
df["is_duplicate"].value_counts()

0    637
1    363
Name: is_duplicate, dtype: int64

In [6]:
X = df[['question1', 'question2']]
y = df['is_duplicate']

In [7]:
import spacy
import string
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

def preprocess_text(text):
    tokens = nlp(text)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != '-PRON-' else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = ' '.join(tokens)
    
    return tokens

X['question1'] = X['question1'].apply(preprocess_text)
X['question2'] = X['question2'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['question1'] = X['question1'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['question2'] = X['question2'].apply(preprocess_text)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Create TF-IDF vectors for the questions
tfidf = TfidfVectorizer()
tfidf.fit(X_train.values.flatten())

X_train_qns1_tfidf = tfidf.transform(X_train['question1'])
X_train_qns2_tfidf = tfidf.transform(X_train['question2'])
X_train_tfidf = X_train_qns1_tfidf - X_train_qns2_tfidf

# Calculate the cosine similarity between the TF-IDF vectors
similarity_scores_train = cosine_similarity(X_train_qns1_tfidf, X_train_qns2_tfidf)
similarity_scores_train

array([[0.29464446, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.34403252, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.8774555 , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.66681519, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.78710838,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.54605904]])

In [10]:
X_test_qns1_tfidf = tfidf.transform(X_test['question1'])
X_test_qns2_tfidf = tfidf.transform(X_test['question2'])
X_test_tfidf = X_test_qns1_tfidf - X_test_qns2_tfidf

# cosine similarity for the test questions
similarity_scores_test = cosine_similarity(X_test_qns1_tfidf, X_test_qns2_tfidf)
similarity_scores_test

array([[0.1627319 , 0.        , 0.14514061, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.20393498, 0.        , 0.        , ..., 0.3966714 , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.30557394, 0.2045172 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [11]:
# similarity scores for test questions

for i, score in enumerate(similarity_scores_test):
    print(f"Similarity score for question pair {i+1}: {score[0]}")

Similarity score for question pair 1: 0.1627319041528694
Similarity score for question pair 2: 0.0
Similarity score for question pair 3: 0.0
Similarity score for question pair 4: 0.0
Similarity score for question pair 5: 0.0
Similarity score for question pair 6: 0.0
Similarity score for question pair 7: 0.0
Similarity score for question pair 8: 0.13864044620665308
Similarity score for question pair 9: 0.0
Similarity score for question pair 10: 0.0
Similarity score for question pair 11: 0.0
Similarity score for question pair 12: 0.0
Similarity score for question pair 13: 0.0
Similarity score for question pair 14: 0.0
Similarity score for question pair 15: 0.0
Similarity score for question pair 16: 0.0
Similarity score for question pair 17: 0.0
Similarity score for question pair 18: 0.0
Similarity score for question pair 19: 0.0
Similarity score for question pair 20: 0.0
Similarity score for question pair 21: 0.0
Similarity score for question pair 22: 0.0
Similarity score for question pa

In [12]:
# To find similarity between the questions using a supervised Algorithm

model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.585