In [None]:
from google.colab import drive
import os

# Mount Google Drive at /gdrive; later cells read their inputs from '/gdrive/My Drive/...'.
# Colab asks for permission when mounting: allow access and, if an authorization
# key is shown, paste it into the input field on the Colab side and press Enter.
drive.mount('/gdrive')

In [None]:
! pip install fasttext

In [None]:
#Importing necessary libraries
import pandas as pd
import string
import contextlib
import re
import scipy
import fasttext 
import numpy as np
import nltk
# Fetch NLTK's stop-word corpus so stopwords.words('english') is available to clean_doc.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Rebind fastText's eprint to print, i.e. to stdout. cosine_similarity_score
# redirects stdout to os.devnull while it loads the model, so anything fastText
# emits through eprint at that point is hidden.
# NOTE(review): presumably eprint is fastText's internal message printer - confirm.
fasttext.FastText.eprint = print

In [4]:
#Function to open and read the files and return text outputs
def load_doc(filename, encoding=None):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default, exactly as when the function took
            no encoding argument.

    Returns:
        The full text of the file.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

In [5]:
#Preprocessing and Cleaning the files
def clean_doc(doc):
    """Turn raw text into a list of cleaned word tokens.

    The text is split on whitespace and every piece has its
    ``string.punctuation`` characters removed. A piece is kept only when what
    remains is purely alphabetic, is not in NLTK's English stop-word list, and
    is longer than one character. Tokens keep their original casing, and the
    stop-word comparison is an exact (case-sensitive) match.

    Args:
        doc: The text to clean.

    Returns:
        The surviving tokens, in their original order.
    """
    pieces = doc.split()
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    ignored = set(stopwords.words('english'))

    cleaned = []
    for piece in pieces:
        candidate = piece.translate(strip_punctuation)
        # isalpha() is False for the empty string, so pieces that were
        # nothing but punctuation are dropped here as well.
        if not candidate.isalpha():
            continue
        if candidate in ignored or len(candidate) <= 1:
            continue
        cleaned.append(candidate)
    return cleaned

In [6]:
#Calculation of similarity score between the base_document and query

# Location of the pre-trained fastText vectors on the mounted Drive.
FASTTEXT_MODEL_PATH = '/gdrive/My Drive/cc.en.300.bin'

# Models already loaded in this kernel, keyed by path, so the binary is read
# from disk once instead of once per scored query.
_FASTTEXT_MODELS = {}


def _get_fasttext_model(path=FASTTEXT_MODEL_PATH):
    """Return the fastText model stored at ``path``, loading it on first use."""
    if path not in _FASTTEXT_MODELS:
        # stdout is sent to os.devnull during the load so that whatever
        # fastText prints while loading does not clutter the cell output.
        with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
            _FASTTEXT_MODELS[path] = fasttext.load_model(path)
    return _FASTTEXT_MODELS[path]


def cosine_similarity_score(base_document, query, model=None):
    """Score how similar two token lists are, as a percentage.

    Each token list is represented by the mean of its word vectors; the score
    is the cosine similarity of the two mean vectors, times 100.

    Args:
        base_document: Tokens of the reference document.
        query: Tokens of the text compared against the reference.
        model: Optional object supporting ``model[word] -> vector``. When
            omitted, the fastText model at ``FASTTEXT_MODEL_PATH`` is loaded
            once and reused on later calls.

    Returns:
        A tuple ``(score, '%')`` where ``score`` is rounded to two decimals.
        If either token list is empty there is nothing to average, and
        ``(0.0, '%')`` is returned.
    """
    # Imported here because a bare ``import scipy`` is not guaranteed to make
    # the ``scipy.spatial.distance`` submodule available as an attribute.
    from scipy.spatial.distance import cosine

    # Checked before touching the model so an empty input never triggers a load.
    if len(base_document) == 0 or len(query) == 0:
        return (0.0, '%')

    if model is None:
        model = _get_fasttext_model()

    base_vector = np.mean([model[word] for word in base_document], axis=0)
    query_vector = np.mean([model[word] for word in query], axis=0)
    # scipy returns the cosine *distance*; similarity is its complement.
    distance = cosine(base_vector, query_vector)
    return (round((1 - distance) * 100, 2), '%')

In [7]:
# Load the queries to score; the file must provide a 'text' column.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the data - confirm it matches the file's real encoding.
queries = pd.read_csv('/gdrive/My Drive/Queries.csv', encoding='unicode_escape')
# read_csv already returns a DataFrame, so no re-wrapping is needed. Take an
# explicit copy so the 'score' column added later leaves `queries` as loaded.
queries_df = queries.copy()

# One cleaned token list per query, in row order.
tokenized_queries = [clean_doc(text) for text in queries['text']]

In [8]:
# Read the base document that every query is compared against, then reduce it
# to cleaned tokens with the same clean_doc step that is applied to the queries.
doc = load_doc('/gdrive/My Drive/S1.txt')
base_document = clean_doc(doc)

In [None]:
# Score every query against the base document; results keep the order of
# tokenized_queries.
score = [
    cosine_similarity_score(base_document, tokens_query)
    for tokens_query in tokenized_queries
]

In [None]:
# Attach the scores to the dataframe as a new 'score' column. `score` was built
# with one entry per element of tokenized_queries, i.e. one per query row.
queries_df['score'] = score

In [None]:
# Save the queries together with their scores as a CSV file on the mounted Drive.
# NOTE(review): no `index` argument is passed, so pandas' default index handling
# applies - confirm an index column in the output file is wanted.
queries_df.to_csv('/gdrive/My Drive/semantic_similarity.csv')