<a href="https://colab.research.google.com/github/Nivedha1524/NLP/blob/main/2403A52279_LAB-07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import string

In [2]:
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK resources used further down: 'punkt' for word_tokenize,
# 'stopwords' for the English stop-word list, and 'wordnet' for the
# lemmatizer and the synset lookups.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Kept as the final expression so the cell still displays its return value.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Ten hand-written sentence pairs. Each tuple is (Sentence1, Sentence2);
# some pairs are paraphrases with little word overlap, some are unrelated.
data = [
    ("The doctor is treating a patient", "A physician is helping a sick person"),
    ("I love machine learning", "I enjoy studying AI"),
    ("The cat sits on the mat", "The cat is sitting on the mat"),
    ("He plays football", "She is cooking dinner"),
    ("Weather is very hot today", "It is extremely warm outside"),
    ("Python is a programming language", "Bananas are yellow"),
    ("I am reading a book", "I am studying from a textbook"),
    ("Cars move fast", "Vehicles travel quickly"),
    ("She likes music", "She enjoys songs"),
    ("Open the door", "Close the window"),
]

# One row per pair, one column per side of the pair.
df = pd.DataFrame.from_records(data, columns=["Sentence1", "Sentence2"])

print(df.head())

                          Sentence1                             Sentence2
0  The doctor is treating a patient  A physician is helping a sick person
1           I love machine learning                   I enjoy studying AI
2           The cat sits on the mat         The cat is sitting on the mat
3                 He plays football                 She is cooking dinner
4         Weather is very hot today          It is extremely warm outside


**Text Preprocessing**

In [7]:
# Tokenizer tables needed by word_tokenize (downloaded in addition to 'punkt').
nltk.download('punkt_tab')

# Shared, module-level helpers read by preprocess() below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set => fast membership tests

def preprocess(text):
    """Normalise a sentence into a space-joined string of lemmatised tokens.

    Steps, in order: lower-case, drop digit runs, drop ASCII punctuation,
    tokenize with ``word_tokenize``, remove tokens found in the module-level
    ``stop_words`` set, and lemmatise what remains with the module-level
    ``lemmatizer`` (called without a part-of-speech argument).

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string when
        nothing survives.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_remover)

    # Filter and lemmatise in one pass; per-token order is filter -> lemmatise.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return " ".join(lemmas)

# Clean both sides of every pair with the same pipeline.
for raw_column, clean_column in (("Sentence1", "clean1"), ("Sentence2", "clean2")):
    df[clean_column] = df[raw_column].apply(preprocess)

print(df[["clean1","clean2"]].head())

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                    clean1                         clean2
0  doctor treating patient  physician helping sick person
1    love machine learning              enjoy studying ai
2             cat sits mat                cat sitting mat
3            play football                 cooking dinner
4        weather hot today         extremely warm outside


**Numerical Representation (TF-IDF)**

In [8]:
# Fit a single vocabulary on both columns so that vec1 and vec2 share the
# same feature space and can be compared row by row.
all_sentences = pd.concat([df["clean1"], df["clean2"]])
vectorizer = TfidfVectorizer().fit(all_sentences)  # fit() returns the estimator

# Row i of vec1 / vec2 is the TF-IDF vector of pair i's first / second sentence.
vec1, vec2 = (
    vectorizer.transform(df[column]) for column in ("clean1", "clean2")
)

**Cosine Similarity**

In [9]:
# Compare each pair's two TF-IDF rows. cosine_similarity returns a 1x1 matrix
# for two single-row inputs, hence the [0][0].
cosine_scores = [
    cosine_similarity(vec1[position], vec2[position])[0][0]
    for position in range(len(df))
]

df["Cosine"] = cosine_scores

print(df[["Sentence1","Sentence2","Cosine"]])

                          Sentence1                             Sentence2  \
0  The doctor is treating a patient  A physician is helping a sick person   
1           I love machine learning                   I enjoy studying AI   
2           The cat sits on the mat         The cat is sitting on the mat   
3                 He plays football                 She is cooking dinner   
4         Weather is very hot today          It is extremely warm outside   
5  Python is a programming language                    Bananas are yellow   
6               I am reading a book         I am studying from a textbook   
7                    Cars move fast               Vehicles travel quickly   
8                   She likes music                      She enjoys songs   
9                     Open the door                      Close the window   

     Cosine  
0  0.000000  
1  0.000000  
2  0.607125  
3  0.000000  
4  0.000000  
5  0.000000  
6  0.000000  
7  0.000000  
8  0.000000  
9  0.000000 

**Jaccard Similarity**

In [10]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard index of the whitespace-separated words of two strings.

    Args:
        s1: First sentence; split on whitespace into a set of unique words.
        s2: Second sentence, treated the same way.

    Returns:
        ``|A & B| / |A | B|`` for the two word sets, or ``0`` when both
        strings contain no words (empty union).
    """
    words_a = set(s1.split())
    words_b = set(s2.split())

    combined = words_a | words_b
    # Guard against dividing by zero when neither string has any word.
    if not combined:
        return 0

    shared = words_a & words_b
    return len(shared) / len(combined)

# Score every pair on its preprocessed text.
df["Jaccard"] = [
    jaccard_similarity(first, second)
    for first, second in zip(df["clean1"], df["clean2"])
]

print(df[["Sentence1","Sentence2","Jaccard"]])

                          Sentence1                             Sentence2  \
0  The doctor is treating a patient  A physician is helping a sick person   
1           I love machine learning                   I enjoy studying AI   
2           The cat sits on the mat         The cat is sitting on the mat   
3                 He plays football                 She is cooking dinner   
4         Weather is very hot today          It is extremely warm outside   
5  Python is a programming language                    Bananas are yellow   
6               I am reading a book         I am studying from a textbook   
7                    Cars move fast               Vehicles travel quickly   
8                   She likes music                      She enjoys songs   
9                     Open the door                      Close the window   

   Jaccard  
0      0.0  
1      0.0  
2      0.5  
3      0.0  
4      0.0  
5      0.0  
6      0.0  
7      0.0  
8      0.0  
9      0.0  


**WordNet Semantic Similarity (Wu-Palmer)**

In [11]:
def wordnet_similarity(s1, s2):
    """Average Wu-Palmer similarity over all word pairs of two sentences.

    Each whitespace-separated word is represented by the first synset that
    ``wn.synsets`` returns for it; words with no synset are ignored. Every
    remaining word of ``s1`` is compared with every remaining word of ``s2``
    via ``wup_similarity`` and the truthy scores are averaged.

    Synsets are resolved once per word up front rather than inside the nested
    loop, so a word of ``s2`` is not looked up again for every word of ``s1``.

    Args:
        s1: First (preprocessed) sentence.
        s2: Second (preprocessed) sentence.

    Returns:
        The mean of the collected pair scores, or ``0`` when no pair produced
        a score (e.g. one sentence has no word known to WordNet).
    """
    def first_synsets(sentence):
        # One WordNet lookup per word; keep only the first synset of known words.
        found = []
        for word in sentence.split():
            candidates = wn.synsets(word)
            if candidates:
                found.append(candidates[0])
        return found

    senses1 = first_synsets(s1)
    senses2 = first_synsets(s2)

    scores = []
    for sense1 in senses1:
        for sense2 in senses2:
            sim = sense1.wup_similarity(sense2)
            # Skip falsy results; wup_similarity may return None when the two
            # synsets cannot be related (see the NLTK WordNet documentation).
            if sim:
                scores.append(sim)

    return np.mean(scores) if scores else 0

# Score every pair on its preprocessed text.
df["WordNet"] = [
    wordnet_similarity(first, second)
    for first, second in zip(df["clean1"], df["clean2"])
]

print(df[["Sentence1","Sentence2","WordNet"]])

                          Sentence1                             Sentence2  \
0  The doctor is treating a patient  A physician is helping a sick person   
1           I love machine learning                   I enjoy studying AI   
2           The cat sits on the mat         The cat is sitting on the mat   
3                 He plays football                 She is cooking dinner   
4         Weather is very hot today          It is extremely warm outside   
5  Python is a programming language                    Bananas are yellow   
6               I am reading a book         I am studying from a textbook   
7                    Cars move fast               Vehicles travel quickly   
8                   She likes music                      She enjoys songs   
9                     Open the door                      Close the window   

    WordNet  
0  0.357736  
1  0.243568  
2  0.377556  
3  0.255190  
4  0.260590  
5  0.230617  
6  0.526021  
7  0.309457  
8  0.368118  
9  0.321356 

**Compare All Methods**

In [12]:
# The three similarity columns computed in the cells above.
metric_columns = ["Cosine", "Jaccard", "WordNet"]

print("\n=== Comparison Results ===")
print(df[["Sentence1", "Sentence2"] + metric_columns])

# Column-wise mean of each metric across all pairs.
print("\nAverage Scores:")
print(df[metric_columns].mean())


=== Comparison Results ===
                          Sentence1                             Sentence2  \
0  The doctor is treating a patient  A physician is helping a sick person   
1           I love machine learning                   I enjoy studying AI   
2           The cat sits on the mat         The cat is sitting on the mat   
3                 He plays football                 She is cooking dinner   
4         Weather is very hot today          It is extremely warm outside   
5  Python is a programming language                    Bananas are yellow   
6               I am reading a book         I am studying from a textbook   
7                    Cars move fast               Vehicles travel quickly   
8                   She likes music                      She enjoys songs   
9                     Open the door                      Close the window   

     Cosine  Jaccard   WordNet  
0  0.000000      0.0  0.357736  
1  0.000000      0.0  0.243568  
2  0.607125      0.5  0.3

**Per-Pair Score Summary**

In [13]:
# Print one readable block per sentence pair, numbering pairs from 1.
for position, pair in df.iterrows():
    print("\nPair", position + 1)
    print("S1:", pair["Sentence1"])
    print("S2:", pair["Sentence2"])
    # Each score is rounded to three decimals for display only.
    for metric in ("Cosine", "Jaccard", "WordNet"):
        print(metric + ":", round(pair[metric], 3))


Pair 1
S1: The doctor is treating a patient
S2: A physician is helping a sick person
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.358

Pair 2
S1: I love machine learning
S2: I enjoy studying AI
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.244

Pair 3
S1: The cat sits on the mat
S2: The cat is sitting on the mat
Cosine: 0.607
Jaccard: 0.5
WordNet: 0.378

Pair 4
S1: He plays football
S2: She is cooking dinner
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.255

Pair 5
S1: Weather is very hot today
S2: It is extremely warm outside
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.261

Pair 6
S1: Python is a programming language
S2: Bananas are yellow
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.231

Pair 7
S1: I am reading a book
S2: I am studying from a textbook
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.526

Pair 8
S1: Cars move fast
S2: Vehicles travel quickly
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.309

Pair 9
S1: She likes music
S2: She enjoys songs
Cosine: 0.0
Jaccard: 0.0
WordNet: 0.368

Pair 10
S1: Open the door
S2: Close the window
Cosine: 0