In [3]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from nltk.stem import WordNetLemmatizer  # For Lemmetization of words
from nltk.corpus import stopwords  # Load list of stopwords
from nltk import word_tokenize # Convert paragraph in tokens

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df=pd.read_csv('Precily_Text_Similarity.csv')

In [5]:
df.shape 

(3000, 2)

In [6]:
df.isnull().sum()

text1    0
text2    0
dtype: int64

In [7]:
df1=df.copy()

In [8]:
def preprocess_text(text_column):
    preprocessed_text = []
    for sentence in tqdm(text_column.values):
        sent = re.sub('[^A-Za-z0-9]+', ' ', sentence)
        sent = ' '.join(e for e in sent.split() if e not in stopwords.words('english'))
        preprocessed_text.append(sent.lower().strip())
    return preprocessed_text

# Example usage
df1['text1'] = preprocess_text(df1['text1'])
df1['text2'] = preprocess_text(df1['text2'])

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [47:51<00:00,  1.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [28:15<00:00,  1.77it/s]


In [10]:
def word_tokenizer(text):
    #tokenizes and stems the text
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer() 
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

In [11]:
for i in tqdm(df1.index):
    s1=df1['text1'][i]
    s2=df1['text2'][i]
    s1words = ' '.join(word_tokenizer(s1))
    s2words = ' '.join(word_tokenizer(s2))
    df1['text1'][i]=s1words
    df1['text2'][i]=s2words

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:15<00:00, 194.45it/s]


In [31]:
tfidf=TfidfVectorizer()

In [13]:
def calculate_similarity(row):
    tfidf_matrix = tfidf.fit_transform(row[['text1', 'text2']].values.astype('U'))
    cos_sim=cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cos_sim[0][0]
df1['similarity'] = df1.apply(calculate_similarity, axis=1)

In [16]:
df1['binary_score'] = (df1['similarity'] > 0.5).astype(int)