# Import libraries

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessing

In [2]:
# Download required NLTK data files
# (the captured output shows these calls are no-ops when the packages are already up-to-date)
# 'stopwords': word lists read by stopwords.words('english')
nltk.download('stopwords')
# 'punkt': tokenizer data, presumably what word_tokenize relies on
# NOTE(review): newer NLTK releases may require 'punkt_tab' instead — confirm against the installed version
nltk.download('punkt')
# 'wordnet': lexical database, presumably what WordNetLemmatizer looks words up in
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Subhayan
[nltk_data]     Das\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Shared NLP helpers, created once and reused by later cells
# WordNet-based lemmatizer instance
lemmatizer = WordNetLemmatizer()
# English stop-word list, stored as a set for fast membership tests
stop_words = {*stopwords.words('english')}

In [8]:
def preprocess_sentence(sentence, *, remove_stopwords=False):
    """
    Normalize a piece of text before it is embedded.

    Every character that is not an ASCII letter or whitespace is deleted,
    the text is lower-cased and tokenized with ``word_tokenize``, and each
    token is lemmatized with the module-level ``lemmatizer``.

    Stop-word removal is optional and disabled by default, so calling the
    function with a single argument keeps every token.

    Args:
        sentence: Raw text to clean (a sentence or a whole paragraph).
        remove_stopwords: Keyword-only. When true, tokens present in the
            module-level ``stop_words`` set are dropped before lemmatizing.

    Returns:
        The processed tokens joined by single spaces.
    """
    # 1. Remove special characters and numbers.
    #    Characters are deleted rather than replaced by a space, so a
    #    hyphenated word such as "British-imposed" becomes "Britishimposed"
    #    and "240-mile" becomes "mile".
    sentence = re.sub(r"[^a-zA-Z\s]", "", sentence)

    # 2. Convert to lowercase and tokenize
    words = word_tokenize(sentence.lower())

    # 3. Optionally remove stop words (off by default)
    if remove_stopwords:
        words = [word for word in words if word not in stop_words]

    # 4. Lemmatize words; no part-of-speech tag is passed, so the
    #    lemmatizer's default is used for every token
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    # 5. Reconstruct the sentence
    return " ".join(lemmatized_words)

In [9]:
# Reference answer: every candidate answer below is compared against this text
known_ans = "The Dandi March, also known as the Salt March, was a pivotal act of civil disobedience led by Mahatma Gandhi in 1930, aimed at protesting British colonial rule in India. The march began on March 12, 1930, when Gandhi and 78 followers set out from Sabarmati Ashram in Ahmedabad to the coastal village of Dandi, a distance of 240 miles. The purpose of the march was to defy the British monopoly on salt production and the salt tax, which oppressed the Indian population. Upon reaching Dandi, Gandhi made salt from seawater, symbolically breaking the law. The march galvanized millions across India, sparking widespread participation in the Salt Satyagraha movement, and is considered a defining moment in the Indian independence struggle, showcasing the power of nonviolent resistance."

# Candidate 1: correct answer, paraphrased from the reference
unknown_ans1 = "The Dandi March, also referred to as the Salt March, was a significant act of nonviolent resistance led by Mahatma Gandhi in 1930 to challenge British colonial policies in India. Starting on March 12, 1930, Gandhi, along with 78 followers, embarked on a 240-mile journey from Sabarmati Ashram in Ahmedabad to the coastal village of Dandi. The purpose of the march was to protest the British-imposed salt tax and monopoly on salt production, which burdened the Indian people. Upon reaching Dandi, Gandhi symbolically defied the law by making salt from seawater. The march ignited widespread participation in the Salt Satyagraha movement and became a crucial turning point in India’s fight for independence, demonstrating the power of peaceful civil disobedience."

# Candidate 2: correct answer, but much shorter than the reference
unknown_ans2 = "The Dandi March, led by Mahatma Gandhi in 1930, was a key act of civil disobedience against British rule. Gandhi and 78 followers marched 240 miles from Sabarmati to Dandi to protest the British salt tax. Upon reaching Dandi, Gandhi made salt from seawater, breaking the law. The march sparked widespread participation in the Salt Satyagraha and became a turning point in India’s independence movement, highlighting the effectiveness of nonviolent protest."

# Candidate 3: answer about a different topic (Gandhi's law studies in London)
unknown_ans3 = "Mahatma Gandhi pursued his law studies in London at University College London from 1888 to 1891. Initially, he faced challenges adjusting to life in a foreign country, dealing with racial prejudice and cultural differences. However, his determination to become a barrister led him to complete his studies and earn a law degree. Gandhi’s time in London also exposed him to various ideas, including those of Western philosophers and thinkers such as Henry David Thoreau and Leo Tolstoy, which later influenced his philosophy of nonviolence and civil disobedience. After completing his education, Gandhi returned to India, where he initially struggled to establish a legal practice before embarking on his transformative journey of social and political activism."



# Unsupervised modeling with Sentence-BERT embeddings

In [10]:
# Step 1: Load Sentence-BERT model
# The model is selected by name; presumably it is downloaded on first use
# and served from a local cache afterwards — verify for offline runs.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
# Preprocess each answer and encode it, one call per text, in this order:
# reference answer, paraphrased answer, short answer, off-topic answer.
embedding1, embedding2, embedding3, embedding4 = (
    model.encode(preprocess_sentence(answer))
    for answer in (known_ans, unknown_ans1, unknown_ans2, unknown_ans3)
)


In [15]:
# Calculate cosine similarity between the reference answer (embedding1)
# and the paraphrased correct answer (embedding2).
# Each embedding is wrapped in a list so the function receives one row per
# side; the single score is then read from position [0][0] of the result.
similarity = cosine_similarity([embedding1], [embedding2])

# Print similarity percentage (score scaled by 100, two decimal places)
print(f"Similarity: {similarity[0][0] * 100:.2f}%")

Similarity: 97.82%


In [16]:
# Calculate cosine similarity between the reference answer (embedding1)
# and the short correct answer (embedding3).
# Note: this overwrites the `similarity` value from the previous comparison.
similarity = cosine_similarity([embedding1], [embedding3])

# Print similarity percentage (score scaled by 100, two decimal places)
print(f"Similarity: {similarity[0][0] * 100:.2f}%")

Similarity: 94.70%


In [17]:
# Calculate cosine similarity between the reference answer (embedding1)
# and the off-topic answer (embedding4).
# Note: this overwrites the `similarity` value from the previous comparison.
similarity = cosine_similarity([embedding1], [embedding4])

# Print similarity percentage (score scaled by 100, two decimal places)
print(f"Similarity: {similarity[0][0] * 100:.2f}%")

Similarity: 35.30%
