## Installing Libraries

In [None]:
!pip install nltk
!pip install gensim
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


## Importing Libraries

In [None]:
import pandas as pd
import gensim
import numpy as np
import nltk
nltk.download('stopwords')
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.metrics import edit_distance
from fuzzywuzzy import process

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Downloading the Pretrained Word2Vec vectors

In [None]:
# Download the pretrained vectors by Google Drive file ID. The archive is
# saved as GoogleNews-vectors-negative300.bin.gz, which the loading cell reads.
!gdown 0B7XkCwpI5KDYNlNUTTlSS21pQmM

Downloading...
From: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /content/GoogleNews-vectors-negative300.bin.gz
100% 1.65G/1.65G [00:19<00:00, 85.4MB/s]


## Loading the first 1 Million Vectors

In [None]:
# Read only the first 1,000,000 entries of the binary word2vec archive, then
# write them back out in word2vec text format as 'vectors.txt'.
location = 'GoogleNews-vectors-negative300.bin.gz'
output_file = 'vectors.txt'

wv = KeyedVectors.load_word2vec_format(
    location,
    binary=True,
    limit=1_000_000,
)
wv.save_word2vec_format(output_file, binary=False)


## Init Pipeline

In [None]:
# Load the text-format vectors written earlier; `word_vectors` is the
# word -> vector lookup used when embedding phrases.
vectors_file = 'vectors.txt'
word_vectors = KeyedVectors.load_word2vec_format(
    vectors_file,
    binary=False,
)

## Processing the Phrases

In [None]:
# Read the phrase sheet (decoded as Latin-1) and pull the 'Phrases' column
# out as a plain Python list.
phrases_data = pd.read_csv(
    'phrases (1).csv',
    encoding="latin1",
)
phrase_column = phrases_data['Phrases']
phrases = phrase_column.tolist()

# def clean_phrase(phrase):
#     cleaned_phrase = " ".join(set(phrase.split()))
#     words = cleaned_phrase.split()
#     stop_words = set(nltk_stopwords.words('english'))
#     cleaned_words = [word for word in words if word.lower() not in stop_words]
#     return " ".join(cleaned_words)

def calculate_phrase_embedding(phrase, vectors=None):
    """Return the mean word vector of a phrase, or None if no word is known.

    The phrase is split on whitespace. Each token is looked up as written;
    if that fails it is looked up again with leading/trailing punctuation
    removed, so "peers?" can still match "peers". Tokens that match neither
    way are ignored.

    Args:
        phrase: Text to embed. Anything that is not a string (for example a
            missing value read from a CSV) yields None.
        vectors: Optional word -> vector mapping supporting `in` and `[]`.
            Defaults to the notebook-level `word_vectors`.

    Returns:
        The element-wise mean of the matched word vectors, or None when no
        token could be matched.
    """
    import string  # local import: only needed for the punctuation fallback

    if vectors is None:
        vectors = word_vectors
    if not isinstance(phrase, str):
        return None

    matched = []
    for token in phrase.split():
        if token in vectors:
            matched.append(vectors[token])
            continue
        # Retry without surrounding punctuation before giving up on the token.
        stripped = token.strip(string.punctuation)
        if stripped and stripped in vectors:
            matched.append(vectors[stripped])

    if not matched:
        return None
    return np.mean(matched, axis=0)

# cleaned_phrases = [clean_phrase(phrase) for phrase in phrases]
# Map each phrase's position in `phrases` to (phrase text, embedding),
# leaving out phrases for which no embedding could be computed.
_embedded_phrases = (
    (position, text, calculate_phrase_embedding(text))
    for position, text in enumerate(phrases)
)
phrase_embeddings = {
    position: (text, vector)
    for position, text, vector in _embedded_phrases
    if vector is not None
}

# Calculate the cosine distance between every unordered pair of phrases.
# All embeddings are stacked into one matrix so the distances come from a
# single cosine_distances call instead of one call per pair.
similarity_results = {}
_pair_ids = list(phrase_embeddings)
if len(_pair_ids) > 1:
    _pair_matrix = np.vstack([phrase_embeddings[idx][1] for idx in _pair_ids])
    _pair_distances = cosine_distances(_pair_matrix)
    # Keys are (earlier index, later index); each pair is stored exactly once.
    similarity_results = {
        (_pair_ids[row], _pair_ids[col]): _pair_distances[row, col]
        for row in range(len(_pair_ids))
        for col in range(row + 1, len(_pair_ids))
    }

## Calculating the Cosine Distance

In [None]:
def find_closest_match(user_input):
    """Find the stored phrase whose embedding is closest to the input's.

    Args:
        user_input: Phrase to look up.

    Returns:
        (phrase, distance) for the stored phrase with the smallest cosine
        distance to the input, or (message, None) when the input has no
        embedding or there are no stored phrases to compare against.
    """
    user_embedding = calculate_phrase_embedding(user_input)
    if user_embedding is None:
        return "No valid embedding found for the input phrase.", None
    if not phrase_embeddings:
        # Searching an empty collection would raise; report it instead.
        return "No phrases available to compare against.", None

    candidates = list(phrase_embeddings.values())
    candidate_matrix = np.vstack([embedding for _, embedding in candidates])
    # One call scores every candidate; row 0 holds the input's distances.
    distances = cosine_distances([user_embedding], candidate_matrix)[0]
    # argmin returns the first minimum, the same tie-break min() applies.
    best = int(np.argmin(distances))
    return candidates[best][0], distances[best]

user_input = input("Enter the Phrase you wanna check ?")
closest_match, distance = find_closest_match(user_input)
if distance is None:
    # On failure find_closest_match returns (message, None); show the message
    # on its own rather than labelling it as a match.
    print(closest_match)
else:
    print(f"Closest match: {closest_match} | Distance: {distance}")

Enter the Phrase you wanna check ?how company compares to its peers
Closest match: how company compares to its peers? | Distance: 0.08219975233078003


## Checking for the User Input

In [None]:
user_input = input("Enter the Phrase you wanna check ?")
closest_match, distance = find_closest_match(user_input)
if distance is None:
    # On failure find_closest_match returns (message, None); show the message
    # on its own rather than labelling it as a match.
    print(closest_match)
else:
    print(f"Closest match: {closest_match} | Distance: {distance}")