Aim : Write  a python program to detect similar sentences from given paragraph.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# Load the file containing sentences
def load_sentences(file_path):
   with open(file_path, 'r') as file:
    sentences = file.readlines()
   return [sentence.strip() for sentence in sentences]

# Preprocess the input sentence
def preprocess_sentence(sentence):
   # Tokenize
   tokens = word_tokenize(sentence.lower())

   # Remove stopwords
   stop_words = set(stopwords.words('english'))
   tokens = [token for token in tokens if token not in stop_words]

   # Lemmatize
   lemmatizer = WordNetLemmatizer()
   tokens = [lemmatizer.lemmatize(token) for token in tokens]

   return ' '.join(tokens)

# Get the most similar sentence
def get_most_similar_sentence(user_input, sentences):
   # Preprocess input sentence
   preprocessed_user_input = preprocess_sentence(user_input)

   # Preprocess sentences
   preprocessed_sentences = [preprocess_sentence(sentence) for sentence in
sentences]

   # Create TF-IDF vectorizer
   vectorizer = TfidfVectorizer()

   # Generate TF-IDF matrix
   tfidf_matrix = vectorizer.fit_transform([preprocessed_user_input] +
preprocessed_sentences)

   # Calculate similarity scores
   similarity_scores = (tfidf_matrix * tfidf_matrix.T).A[0][1:]

   # Find the index of the most similar sentence
   most_similar_index = similarity_scores.argmax()
   most_similar_sentence = sentences[most_similar_index]

   return most_similar_sentence

# Main program
def main():
   file_path = '/content/sentences.txt'  # Path to the file containing sentences
   sentences = load_sentences(file_path)

   user_input = 'I love cooking.'

   most_similar_sentence = get_most_similar_sentence(user_input, sentences)
   print('Most similar sentence:', most_similar_sentence)

if __name__ == '__main__':
   main()

In [None]:
def preprocess(sentence):
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    tokens = [word for word in tokens if word.isalnum()]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return " ".join(tokens)

In [None]:
processed_sentences = [preprocess(sentence) for sentence in sentences]

In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_sentences)

In [None]:
similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
threshold = 0.9

In [None]:
print("Similar Sentences:")
for i in range(len(sentences)):
    for j in range(i+1, len(sentences)):
        if similarities[i][j] >= threshold:
            print(f"Similarity: {similarities[i][j]:.2f}")
            print(f"Sentence {i+1}: {sentences[i]}")
            print(f"Sentence {j+1}: {sentences[j]}")
            print()

Similar Sentences:
Similarity: 1.00
Sentence 3: The aroma of freshly baked bread filled the air as I entered the bakery.
Sentence 11: The aroma of freshly baked bread filled the air as I entered the bakery.

Similarity: 1.00
Sentence 9: The chef skillfully prepared a gourmet meal using locally sourced ingredients.
Sentence 12: The chef skillfully prepared a gourmet meal using locally sourced ingredients.



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

#
texts=sentences

# Vectorization of the texts
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)
# Used words (axis in our multi-dimensional space)
words = vectorizer.get_feature_names_out()
print("words", words)

n_clusters = 3
number_of_seeds_to_try = 10
max_iter = 300

model = KMeans(n_clusters=n_clusters, max_iter=max_iter, n_init=number_of_seeds_to_try).fit(X)

labels = model.labels_
# Indices of preferable words in each cluster
ordered_words = model.cluster_centers_.argsort()[:, ::-1]

print("centers:", model.cluster_centers_)
print("labels", labels)
print("inertia:", model.inertia_)

texts_per_cluster = np.zeros(n_clusters)
for i_cluster in range(n_clusters):
    for label in labels:
        if label == i_cluster:
            texts_per_cluster[i_cluster] += 1

print("Top words per cluster:")
for i_cluster in range(n_clusters):
    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
    for term in ordered_words[i_cluster, :10]:
        print("\t" + words[term])

print("\n")
print("Prediction")

text_to_predict = "Why batman was defeated by superman so easily?"
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
print("Cluster:", predicted_cluster, "texts:", int(texts_per_cluster[predicted_cluster])),
for term in ordered_words[predicted_cluster, :10]:
    print("\t" + words[term])


words ['air' 'aroma' 'backyard' 'baked' 'bakery' 'believe' 'branches' 'bread'
 'campfire' 'candle' 'cascaded' 'casting' 'cat' 'chef' 'clear' 'clover'
 'crashed' 'creating' 'crystal' 'curious' 'depths' 'echoed' 'entered'
 'filled' 'flame' 'flickering' 'forest' 'freshly' 'friends' 'gazed' 'glow'
 'gourmet' 'hidden' 'horizon' 'ingredients' 'laughter' 'leaf' 'life'
 'locally' 'meadow' 'meal' 'oak' 'old' 'planet' 'pool' 'pounced'
 'prepared' 'reaching' 'rhythm' 'rocky' 'room' 'rose' 'shared' 'shore'
 'skillfully' 'sky' 'soothing' 'sourced' 'stars' 'stood' 'stories' 'sun'
 'tall' 'tree' 'using' 'warm' 'waterfall' 'waves' 'wise' 'wondering']
centers: [[0.         0.         0.0625     0.         0.         0.0625
  0.04166667 0.         0.04724556 0.05103104 0.04419417 0.04724556
  0.05103104 0.         0.04419417 0.0625     0.04724556 0.04724556
  0.04419417 0.05103104 0.04419417 0.04724556 0.         0.
  0.05103104 0.05103104 0.04419417 0.         0.04724556 0.0559017
  0.04724556 0.      

In [4]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=2d95646314cb5260afff67a084f93f841963da1c22be5d4c490f76d710aeeef1
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [5]:
import json
import pandas as pd
import numpy as np
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [10]:
pip install spark-nlp

Collecting spark-nlp
  Downloading spark_nlp-5.1.0-py2.py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.2/531.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-5.1.0


In [14]:
import sparknlp
#from sparknlp.annotator import sparknlp.base
from sparknlp.pretrained import PretrainedPipeline
spark = sparknlp.start()

# If you change the model, re-run all the cells below. # Applicable models: tfhub_use, tfhub_use_lg MODEL_NAME = "tfhub_use" os.environ['MODEL_NAME'] = MODEL_NAME
# To compare the similarity of sentences, enter them as strings in this list.
text_list = [
"Sign up for our mailing list to get free offers and updates about our products!",
"It was raining, so I waited beneath the balcony outside the cafe.",
 "I stayed under the deck of the cafe because it was rainy outside.",
 "I like the cafe down the street because it's not too loud in there.",
"The coffee shop near where I live is quiet, so I like to go there.",
"Web traffic analysis shows that most Internet users browse on mobile nowadays.",
"The analytics show that modern web users mostly use their phone instead of their computers."
]

In [16]:
tokenizer = Tokenizer()
tokenizer.setInputCols(['document']) tokenizer.setOutputCol('token')
# Encodes the text as a single vector representing semantic features. sentence_encoder = UniversalSentenceEncoder.pretrained(name=MODEL_NAME) sentence_encoder.setInputCols(['document', 'token']) sentence_encoder.setOutputCol('sentence_embeddings')
nlp_pipeline = Pipeline(stages=[ document_assembler, tokenizer,
sentence_encoder
])
# Fit the model to an empty data frame so it can be used on inputs. empty_df = spark.createDataFrame([['']]).toDF('text') pipeline_model = nlp_pipeline.fit(empty_df)
light_pipeline = LightPipeline(pipeline_model)

SyntaxError: ignored

In [None]:
def get_similarity(input_list):
df = spark.createDataFrame(pd.DataFrame({'text': input_list})) result = light_pipeline.transform(df)
embeddings = []
for r in result.collect(): embeddings.append(r.sentence_embeddings[0].embeddings)
embeddings_matrix = np.array(embeddings)
return np.matmul(embeddings_matrix, embeddings_matrix.transpose())