**NECESSARY PACKAGES ARE IMPORTED**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda

**DATA IMPORTED**

In [None]:
# Colab-specific step: mount Google Drive at /content/drive so files stored
# under /content/drive/MyDrive can be opened with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the text-pair spreadsheet from the mounted Drive folder.
DATA_PATH = '/content/drive/MyDrive/mlp/Precily_Text_Similarity.xlsx'
dataset = pd.read_excel(DATA_PATH)

In [None]:
# Pull the two text columns out of the frame; row i of texts1 pairs with
# row i of texts2.
texts1, texts2 = (dataset[column].values for column in ('text1', 'text2'))

In [None]:
# Show only the first couple of documents: each entry is a full article, so
# printing the whole array buries the notebook in text.
print(texts1[:2])

['broadband challenges tv viewing the number of europeans with broadband has exploded over the past 12 months  with the web eating into tv viewing habits  research suggests.  just over 54 million people are hooked up to the net via broadband  up from 34 million a year ago  according to market analysts nielsen/netratings. the total number of people online in europe has broken the 100 million mark. the popularity of the net has meant that many are turning away from tv  say analysts jupiter research. it found that a quarter of web users said they spent less time watching tv in favour of the net  the report by nielsen/netratings found that the number of people with fast internet access had risen by 60% over the past year.  the biggest jump was in italy  where it rose by 120%. britain was close behind  with broadband users almost doubling in a year. the growth has been fuelled by lower prices and a wider choice of always-on  fast-net subscription plans.  twelve months ago high speed interne

In [None]:
# Stack both text columns into one corpus for tokenization.
texts = np.concatenate([texts1, texts2], axis=0)

In [None]:
# Fit one tokenizer on the combined corpus, then encode each text column
# with that shared vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences1, sequences2 = (
    tokenizer.texts_to_sequences(column) for column in (texts1, texts2)
)

In [None]:
# Preview the first 20 token ids of the first three documents. Printing the
# complete list of sequences exceeds the notebook's IOPub data rate limit,
# so the full dump is never actually displayed.
print([seq[:20] for seq in sequences1[:3]])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Pad every sequence to the length of the longest one found in either
# column, so both model inputs share the same width.
longest1 = max(map(len, sequences1))
longest2 = max(map(len, sequences2))
max_seq_length = max(longest1, longest2)
padded_sequences1, padded_sequences2 = (
    pad_sequences(encoded, maxlen=max_seq_length)
    for encoded in (sequences1, sequences2)
)

**EMBEDDING METHOD** -- **LSTM**

In [None]:
# Siamese architecture: both inputs pass through the *same* Embedding and
# LSTM layer objects, so the two branches share their weights.
embedding_dim = 100
lstm_units = 64
# +1 because the vocabulary size passed to Embedding is one larger than the
# number of entries in tokenizer.word_index.
vocab_size = len(tokenizer.word_index) + 1

input_a = Input(shape=(max_seq_length,))
input_b = Input(shape=(max_seq_length,))

embedding_layer = Embedding(vocab_size, embedding_dim)
lstm_layer = LSTM(lstm_units)

# Encode each side with the shared layers.
embedded_a = embedding_layer(input_a)
encoded_a = lstm_layer(embedded_a)
embedded_b = embedding_layer(input_b)
encoded_b = lstm_layer(embedded_b)

# Element-wise absolute difference of the two encodings, reduced to a single
# sigmoid output.
distance = Lambda(lambda pair: tf.abs(pair[0] - pair[1]))([encoded_a, encoded_b])
output = Dense(1, activation='sigmoid')(distance)

model = Model(inputs=[input_a, input_b], outputs=output)

**MODEL**

In [None]:
# Compile and train the model.
# NOTE(review): every training target is 0 (np.zeros), so binary
# cross-entropy only ever pushes the sigmoid output toward 0 and the reported
# accuracy is measured against those same all-zero targets. The model gets no
# signal separating similar from dissimilar pairs -- supply real similarity
# labels before relying on its predictions.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
dummy_labels = np.zeros(len(texts1))
model.fit(
    x=[padded_sequences1, padded_sequences2],
    y=dummy_labels,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7c4d4a09dab0>

**TESTING**

In [None]:
# Hand-written sentence pairs for trying out the trained model:
# test_texts1[i] is compared against test_texts2[i].
test_pairs = [
    ("Some text for testing", "Text to compare"),
    ("Another text for testing", "Different text to compare"),
]
test_texts1 = [left for left, _ in test_pairs]
test_texts2 = [right for _, right in test_pairs]

In [None]:
# Encode the test sentences with the tokenizer that was fitted on the
# training corpus.
test_sequences1, test_sequences2 = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (test_texts1, test_texts2)
)

In [None]:
# Pad the test pairs to the same width as the training inputs, score them,
# and print one line per pair.
test_padded_sequences1, test_padded_sequences2 = (
    pad_sequences(encoded, maxlen=max_seq_length)
    for encoded in (test_sequences1, test_sequences2)
)

predictions = model.predict([test_padded_sequences1, test_padded_sequences2])
report_line = "Similarity between '{}' and '{}' is: {:.2f}"
for i, pred in enumerate(predictions):
    print(report_line.format(test_texts1[i], test_texts2[i], pred[0]))

Similarity between 'Some text for testing' and 'Text to compare' is: 0.20
Similarity between 'Another text for testing' and 'Different text to compare' is: 0.13


**COSINE SIMILARITY METHOD**

In [None]:
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def preprocess(text):
    """Lowercase ``text`` and delete every ASCII punctuation character.

    Characters from ``string.punctuation`` are removed outright rather than
    replaced by a space, so words joined by punctuation are fused together
    (``"always-on"`` becomes ``"alwayson"``).
    """
    # Mapping a code point to None tells str.translate to drop that character.
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    return text.lower().translate(punctuation_table)

def cosine_similarity_string(string1, string2):
    """Return the cosine similarity between the word-count vectors of two strings.

    Both strings are normalised with ``preprocess`` and then vectorised
    together by a fresh ``CountVectorizer``, so the vocabulary is the union of
    the tokens found in the pair.

    Args:
        string1: First text.
        string2: Second text.

    Returns:
        The cosine similarity of the two count vectors, or ``0.0`` when
        neither string yields any token (for example two empty strings).
    """
    documents = [preprocess(string1), preprocess(string2)]

    try:
        count_matrix = CountVectorizer().fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when no
        # document contributes a token. cosine_similarity already gives 0 when
        # only one side is empty, so report 0.0 here as well instead of
        # aborting the caller's loop.
        return 0.0

    # Row 0 is string1's counts and row 1 is string2's; the result is a 1x1
    # matrix, flattened to get the scalar.
    return cosine_similarity(count_matrix[0], count_matrix[1]).flatten()[0]

# Score each (text1, text2) row pair of the dataset, in row order.
similarities = []
for text1, text2 in zip(texts1, texts2):
    similarities.append(cosine_similarity_string(text1, text2))

# Report one score per line, in the same order as the dataset rows.
print("Cosine similarities:")
for similarity in similarities:
    print(similarity)

Cosine similarities:
0.6894162915261057
0.43076461504892627
0.5674248327222069
0.6679992495426249
0.5578235171065947
0.459236160601464
0.6028474206840945
0.6595993515841848
0.5628538066971729
0.47004939651364097
0.6447138133267084
0.6205242893332575
0.6986728920894946
0.6506160328191511
0.4824236085884415
0.6782625040983064
0.6627499139571061
0.6947784753289259
0.7149529317134727
0.5722141733935246
0.5838512670028068
0.5148271168731073
0.6903835554414817
0.6103891839705954
0.5321843856483653
0.5816718488337415
0.5337355443475531
0.7032160181130305
0.5669917869525173
0.602043375324418
0.5236848156484795
0.6159772143834233
0.5481816521291049
0.6431463209845588
0.7331726211199778
0.6434344499842248
0.709276041191502
0.7107441187651616
0.5640815163800302
0.5624145805611797
0.4979016729524317
0.5990017820518388
0.6547175355940278
0.6769351400483687
0.5715931596953494
0.47636972574965925
0.7336329351915737
0.5783648965492372
0.5486937586844544
0.5466026414306305
0.6676767305312233
0.60012222