In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.metrics import jaccard_distance

# Fetch every NLTK data package this notebook needs. nltk.download reports
# "already up-to-date" for packages that are present, so re-running is safe.
for _nltk_resource in (
    'punkt',                       # tokenizer models (word_tokenize)
    'stopwords',                   # stopword lists
    'wordnet',                     # dictionary behind WordNetLemmatizer
    'averaged_perceptron_tagger',  # model behind nltk.pos_tag
):
    nltk.download(_nltk_resource)

def preprocess(sentence):
    """Normalise a sentence into a list of lowercase, stopword-free lemmas.

    Args:
        sentence: The text to tokenize and normalise.

    Returns:
        list: One lemma per surviving token, in the order the tokens appeared.
        No explicit punctuation filtering is done here, so any token the
        tokenizer emits that is not an English stopword is kept.
    """
    # Tokenize first, then lowercase each token lazily as it is consumed below.
    lowered_tokens = map(str.lower, word_tokenize(sentence))
    english_stopwords = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()

    lemmas = []
    for token in lowered_tokens:
        if token in english_stopwords:
            continue
        # Lemmatize using the part of speech guessed for this single token.
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(token)))
    return lemmas

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own (as a one-element list), and the first
    letter of the resulting tag selects the WordNet category.

    Args:
        word: The token to classify.

    Returns:
        One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV;
        wordnet.NOUN when the tag's first letter is not J, N, V or R.
    """
    _, pos_tag_label = nltk.pos_tag([word])[0]
    tag_initial = pos_tag_label[0].upper()
    # Keyed by the first letter of the tagger's label.
    wordnet_pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    return wordnet_pos_by_initial.get(tag_initial, wordnet.NOUN)

def sentence_similarity(sentence1, sentence2):
    """Jaccard similarity between the preprocessed token sets of two sentences.

    Args:
        sentence1: First sentence (raw text).
        sentence2: Second sentence (raw text).

    Returns:
        A number in [0, 1]: 1 means the two token sets are identical,
        0 means they share no token. If both sentences preprocess to an
        empty token set (e.g. empty strings or stopwords only), 1.0 is
        returned because the two sets are identical.
    """
    token_set1 = set(preprocess(sentence1))
    token_set2 = set(preprocess(sentence2))

    # jaccard_distance divides by the size of the union, so two empty sets
    # would raise ZeroDivisionError; treat them as identical instead.
    if not token_set1 and not token_set2:
        return 1.0

    # Similarity is the complement of the Jaccard distance.
    return 1 - jaccard_distance(token_set1, token_set2)

# Example usage:
# sentence1 = "The quick brown fox jumps over the lazy dog"
# sentence2 = "A fast brown fox jumps over a lazy dog"
# similarity = sentence_similarity(sentence1, sentence2)
# print("Similarity between the two sentences:", similarity)

# NOTE(review): `count` is not read by any cell visible in this notebook —
# confirm it is unused before removing it.
count = 0

[nltk_data] Downloading package punkt to /home/raone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/raone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/raone/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/raone/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
# Reference answer that each model reply is scored against. Scoring is exact
# token overlap, so the spellings here must match the spellings the models
# produce ("Brahmaputra" was previously misspelled and could never match).
# NOTE(review): "Subahnshri" may also be misspelled — verify against the
# source documents before changing it.
Expected_reply = """
  Brahmaputra
  Lohit 
  Dihing 
  Siang
  Manas
  Kapili
  Disang
  Subahnshri
  """

In [3]:
# Reply text attributed to the RAG system (presumably pasted verbatim from its
# output — keep it unedited so the score reflects what the model produced).
RAG_reply = """
Based on the provided context, there are three hostels mentioned in the document:

1. Hostel 1 (Core1)
2. Hostel 2 (Core2)
3. Hostel 3 (Core3)
4. Hostel 4 (Core4)
5. View Point
6. CCD outlet (just outside the library)

So, the answer to your question is: Hostels 1, 2, 3, 4, and View Point are the hostels in IIT Guwahati.

"""
# Score this reply against the expected answer using sentence_similarity.
similarity_RAG = sentence_similarity(Expected_reply, RAG_reply)
#print("Similarity between the two sentences:", similarity_RAG*100)

In [4]:
# Reply text attributed to the RAGNER system (presumably pasted verbatim from
# its output — keep it unedited so the score reflects what the model produced).
RAGNER_reply = """
Based on the provided documents, the following are the hostels mentioned in IIT Guwahati:

1. Lohit
2. Dihing.
3. Siang
4. Manas
5. Brahmaputra (mentioned as the location of a CCD outlet)

Therefore, the answer to the question is:

Lohit, Dihing., Siang, Manas, and Brahmaputra.
"""

# Score this reply against the expected answer using sentence_similarity.
similarity_RAGNER = sentence_similarity(Expected_reply, RAGNER_reply)
#print("Similarity between the two sentences:", similarity_RAGNER*100)

In [5]:
# Decide the winner from the similarity scores. Comparing the reply strings
# themselves (RAGNER_reply > RAG_reply) only orders the texts
# lexicographically and says nothing about answer quality.
if similarity_RAGNER > similarity_RAG:
  print("RAGNER is better")
else:
  print("RAGNER is not that great")

RAGNER is better
