In [87]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api
import numpy as np

from sentence_transformers import SentenceTransformer

In [2]:
df = pd.read_csv('/content/Text_Similarity_Dataset.csv')
df.head(5)

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [37]:
pattern = r'\w+'
tokenizer = RegexpTokenizer(pattern)


In [38]:
import nltk

# Force re-download the necessary resources
# nltk.download('punkt', force=True)
nltk.download('stopwords', force=True)
nltk.download('wordnet', force=True)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [39]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [40]:
def preprocess_text(text):
  if isinstance(text, str):  # Ensure input is a string
      text = text.lower() # Convert to lowercase

      text = re.sub(r'[^\w\s]','', text) # Remove punctuation and special characters

      tokens = tokenizer.tokenize(text)# Tokenize text

      tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords

      tokens = [lemmatizer.lemmatize(word) for word in tokens]# Lemmatize words

      return ' '.join(tokens)# Join tokens back into a string
  else:
    return "" # Return an empty string for NaN or non-string values










In [41]:
df['cleaned_text1'] = df['text1'].apply(preprocess_text)
df['cleaned_text2'] = df['text2'].apply(preprocess_text)

In [26]:
print(df.columns)  # Displays all column names


Index(['Unique_ID', 'text1', 'text2', 'cleaned_text1', 'cleaned_text2'], dtype='object')


In [42]:
print(df[['text1','cleaned_text1','text2','cleaned_text2']].head(5))

                                               text1  \
0  savvy searchers fail to spot ads internet sear...   
1  millions to miss out on the net by 2025  40% o...   
2  young debut cut short by ginepri fifteen-year-...   
3  diageo to buy us wine firm diageo  the world s...   
4  be careful how you code a new european directi...   

                                       cleaned_text1  \
0  savvy searcher fail spot ad internet search en...   
1  million miss net 2025 40 uk population still w...   
2  young debut cut short ginepri fifteenyearold d...   
3  diageo buy u wine firm diageo world biggest sp...   
4  careful code new european directive could put ...   

                                               text2  \
0  newcastle 2-1 bolton kieron dyer smashed home ...   
1  nasdaq planning $100m share sale the owner of ...   
2  ruddock backs yapp s credentials wales coach m...   
3  mci shares climb on takeover bid shares in us ...   
4  media gadgets get moving pocket-sized devic

**TFIDF**

In [45]:
vectorizer = TfidfVectorizer()

combined_text = df['cleaned_text1'].tolist() + df['cleaned_text2'].tolist()# Combine both text columns for fitting the vectorizer


# Fit the vectorizer on the combined text and transform both text columns
vectorizer.fit(combined_text)

tfidf1 = vectorizer.transform(df['cleaned_text1'])
tfidf2 = vectorizer.transform(df['cleaned_text2'])

In [47]:
df['cosine_similarity'] = [cosine_similarity(tfidf1[i],tfidf2[i])[0][0] for i in range(len(df))]

In [48]:
print(df.columns)  # Displays all column names

Index(['Unique_ID', 'text1', 'text2', 'cleaned_text1', 'cleaned_text2',
       'cosine_similarity'],
      dtype='object')


In [50]:
print(df[['Unique_ID','text1','cleaned_text1','text2','cleaned_text2','cosine_similarity']].head(5))

   Unique_ID                                              text1  \
0          0  savvy searchers fail to spot ads internet sear...   
1          1  millions to miss out on the net by 2025  40% o...   
2          2  young debut cut short by ginepri fifteen-year-...   
3          3  diageo to buy us wine firm diageo  the world s...   
4          4  be careful how you code a new european directi...   

                                       cleaned_text1  \
0  savvy searcher fail spot ad internet search en...   
1  million miss net 2025 40 uk population still w...   
2  young debut cut short ginepri fifteenyearold d...   
3  diageo buy u wine firm diageo world biggest sp...   
4  careful code new european directive could put ...   

                                               text2  \
0  newcastle 2-1 bolton kieron dyer smashed home ...   
1  nasdaq planning $100m share sale the owner of ...   
2  ruddock backs yapp s credentials wales coach m...   
3  mci shares climb on takeover bid 

**Bag-of-Words (BoW)**

In [52]:
vectorizer = CountVectorizer()

In [59]:
text_vectorize = vectorizer.fit_transform(df['cleaned_text1'].tolist() + df['cleaned_text2'].tolist())

In [60]:
count_text1 = text_vectorize[:len(df)]
count_text2 = text_vectorize[len(df):]

In [63]:
df['cosine_similarity'] = [cosine_similarity(count_text1[i],count_text2[i])[0][0] for i in range(len(df))]

In [64]:
print(df[['Unique_ID','text1','cleaned_text1','text2','cleaned_text2','cosine_similarity']].head(5))

   Unique_ID                                              text1  \
0          0  savvy searchers fail to spot ads internet sear...   
1          1  millions to miss out on the net by 2025  40% o...   
2          2  young debut cut short by ginepri fifteen-year-...   
3          3  diageo to buy us wine firm diageo  the world s...   
4          4  be careful how you code a new european directi...   

                                       cleaned_text1  \
0  savvy searcher fail spot ad internet search en...   
1  million miss net 2025 40 uk population still w...   
2  young debut cut short ginepri fifteenyearold d...   
3  diageo buy u wine firm diageo world biggest sp...   
4  careful code new european directive could put ...   

                                               text2  \
0  newcastle 2-1 bolton kieron dyer smashed home ...   
1  nasdaq planning $100m share sale the owner of ...   
2  ruddock backs yapp s credentials wales coach m...   
3  mci shares climb on takeover bid 

**Word2Vec Embeddings**

In [66]:
word2vec_model = api.load('word2vec-google-news-300') # Pre-trained 300D vectors



In [75]:
# Function to get sentence embeddings by averaging word vectors
def get_sentence_embedding(sentence):
  words = sentence.split()
  vectors = []

  for word in words:
    try:
      vectors.append(word2vec_model[word])
    except KeyError:
      continue



  if not vectors:
    return np.zeros(300)
  return np.mean(vectors , axis = 0)


In [79]:
df['embedding_text1'] = df['cleaned_text1'].apply(get_sentence_embedding)
df['embedding_text2'] = df['cleaned_text2'].apply(get_sentence_embedding)

In [85]:
df['cosine_similarity'] = [
    cosine_similarity([df['embedding_text1'][i]], [df['embedding_text2'][i]])[0][0]
    for i in range(len(df))
]


In [86]:
print(df[['Unique_ID','text1','cleaned_text1','text2','cleaned_text2','cosine_similarity']].head(5))

   Unique_ID                                              text1  \
0          0  savvy searchers fail to spot ads internet sear...   
1          1  millions to miss out on the net by 2025  40% o...   
2          2  young debut cut short by ginepri fifteen-year-...   
3          3  diageo to buy us wine firm diageo  the world s...   
4          4  be careful how you code a new european directi...   

                                       cleaned_text1  \
0  savvy searcher fail spot ad internet search en...   
1  million miss net 2025 40 uk population still w...   
2  young debut cut short ginepri fifteenyearold d...   
3  diageo buy u wine firm diageo world biggest sp...   
4  careful code new european directive could put ...   

                                               text2  \
0  newcastle 2-1 bolton kieron dyer smashed home ...   
1  nasdaq planning $100m share sale the owner of ...   
2  ruddock backs yapp s credentials wales coach m...   
3  mci shares climb on takeover bid 

**BERT**

In [88]:
BERT_Transformer  = SentenceTransformer('all-MiniLM-L6-v2') # Small, fast BERT model


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [92]:
df['emdedding_text_Bert1'] = df['cleaned_text1'].apply(lambda x : BERT_Transformer.encode(x))
df['emdedding_text2_Bert'] = df['cleaned_text2'].apply(lambda x : BERT_Transformer.encode(x))

In [93]:
df['cosine_similarity_BERT'] = [
    cosine_similarity([df['emdedding_text_Bert1'][i]], [df['emdedding_text2_Bert'][i]])[0][0]
    for i in range(len(df))
]


In [94]:
print(df[['Unique_ID','text1','cleaned_text1','text2','cleaned_text2','cosine_similarity_BERT']].head(5))

   Unique_ID                                              text1  \
0          0  savvy searchers fail to spot ads internet sear...   
1          1  millions to miss out on the net by 2025  40% o...   
2          2  young debut cut short by ginepri fifteen-year-...   
3          3  diageo to buy us wine firm diageo  the world s...   
4          4  be careful how you code a new european directi...   

                                       cleaned_text1  \
0  savvy searcher fail spot ad internet search en...   
1  million miss net 2025 40 uk population still w...   
2  young debut cut short ginepri fifteenyearold d...   
3  diageo buy u wine firm diageo world biggest sp...   
4  careful code new european directive could put ...   

                                               text2  \
0  newcastle 2-1 bolton kieron dyer smashed home ...   
1  nasdaq planning $100m share sale the owner of ...   
2  ruddock backs yapp s credentials wales coach m...   
3  mci shares climb on takeover bid 