# Doc2Vec Approach

I am using Google Colab (as it is free and fast) to train the gensim Doc2Vec model. It can also be trained locally, but that might take more time and can run into main-memory constraints.

**Importing the dataset from google drive**

In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive/')
# os.chdir('drive/My Drive/Datasets')

# **Assigning DatasetName**

To run this locally, ignore the Google Drive code above and keep the dataset file, under the same name, in the local folder.


In [None]:
# Location of the dataset CSV file.
dataset_name = "/content/Text_Similarity_Dataset.csv"

# **Importing** Python Libraries

In [None]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import gensim
import re
import os
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Loading data to dataframe**


In [None]:
# Read the dataset CSV into a DataFrame.
df = pd.read_csv(dataset_name)

# **Function to preprocess**

This function removes stopwords and punctuation, and stems the words.


In [None]:
def preprocess_text(text, remove_stopwords=True):
  """Lowercase a sentence, optionally drop stopwords, clean it up and stem it.

  Args:
    text: Raw sentence to normalise.
    remove_stopwords: When true, NLTK's English stopwords are filtered out
      before the punctuation clean-up runs.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  tokens = text.lower().split()
  if remove_stopwords:
    english_stops = set(stopwords.words('english'))
    tokens = [tok for tok in tokens if tok not in english_stops]

  # Ordered (pattern, replacement) rewrites. The order matters: each rule
  # operates on the output of the rules before it.
  rewrites = (
    (r"[^A-Za-z0-9(),!.?\'\`]", " "),
    (r"\'s", " 's "),
    (r"\'ve", " 've "),
    (r"n\'t", " 't "),
    (r"\'re", " 're "),
    (r"\'d", " 'd "),
    (r"\'ll", " 'll "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " "),
    (r"\(", " ( "),
    (r"\)", " ) "),
    (r"\?", " "),
    (r"\s{2,}", " "),
  )
  cleaned = " ".join(tokens)
  for pattern, replacement in rewrites:
    cleaned = re.sub(pattern, replacement, cleaned)

  # Reduce every remaining token to its stem.
  snowball = SnowballStemmer('english')
  return " ".join(snowball.stem(tok) for tok in cleaned.split())

  

**Preprocess all sentences and make a document array**

In [None]:
# Clean both sentences of every row; the two sentences of a row end up at
# adjacent positions (2k, 2k + 1) of the list.
documents = []
for row in df.values:
  documents.extend(preprocess_text(row[col], True) for col in (1, 2))

**Import the Gensim Doc2Vec model and the TaggedDocument class, which provides the data format needed to train Doc2Vec**

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Wrap every cleaned document in a TaggedDocument: its tokens plus a tag made
# from the document's position, which Doc2Vec uses as an extra input when
# learning the document vector.
tagged_documents = []
for doc_id, doc_text in enumerate(documents):
  doc_tokens = word_tokenize(doc_text.lower())
  tagged_documents.append(TaggedDocument(words=doc_tokens, tags=[str(doc_id)]))

# Training the model



1.   You can define the number of epochs
2.   The vector size is the size of the vector space required to represent a document
3. alpha is the learning rate for the network
4. The Doc2Vec class is used to create the doc2vec model; this model gives the vector representation for a sentence
5. The model is saved so that it can be reused whenever required

The learning rate is decayed to avoid large updates




In [None]:
def train_and_save(force_train=False, saved_model_name="d2v.model"):
  """Train a Doc2Vec model on ``tagged_documents`` and save it to disk.

  Training is skipped when a file already exists at ``saved_model_name``,
  unless ``force_train`` is true.

  Args:
    force_train: Retrain and overwrite even if a saved model file is present.
    saved_model_name: Path that is checked for an existing model and that the
      trained model is written to.

  Returns:
    None.
  """
  if os.path.exists(saved_model_name) and not force_train:
    return
  max_epochs = 10
  vec_size = 100
  alpha = 0.025

  model = Doc2Vec(vector_size=vec_size,
                  alpha=alpha,
                  min_alpha=0.00025,
                  min_count=1,
                  dm=1)

  model.build_vocab(tagged_documents)

  # NOTE(review): gensim's docs describe a single train() call as the common,
  # recommended usage; the repeated calls are kept so the existing training
  # schedule is unchanged.
  for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    # `model.epochs` replaces the deprecated `model.iter` attribute.
    model.train(tagged_documents,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # Lower the learning rate by a fixed step and pin min_alpha to it, so the
    # next train() call runs at a constant rate.
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

  # Save under the requested path (not a hard-coded literal) so that the
  # existence check at the top can find the file on the next run.
  model.save(saved_model_name)
  print("Model Saved")

# The model is later loaded from the path "saved_model_name", so that path is
# passed explicitly here.
train_and_save(saved_model_name="saved_model_name")

iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
Model Saved


# Load the model to demonstrate reusability


In [None]:
# Reload the trained model from the path it was saved under.
model = Doc2Vec.load("saved_model_name")

# Scoring



1.   Generate a score for each pair of sentences in the dataset (based on a threshold value, a pair can be assigned the label 0 (highly similar) or 1 (not similar))
2.   Put the scores back into the dataset
3. **I have used 70% as the threshold for the similarity score**






In [None]:
# Score every sentence pair: documents holds the two sentences of a pair at
# adjacent positions, so walk the even and odd entries in lockstep.
scores = []
threshold = 0.70  # defined here but not read by the scoring loop below
for first_doc, second_doc in zip(documents[::2], documents[1::2]):
  # n_similarity lives on the model's word vectors (`model.wv`); calling it on
  # the model itself is deprecated.
  score = model.wv.n_similarity(word_tokenize(first_doc), word_tokenize(second_doc))
  scores.append(score)


# Save the result to a new file


In [None]:
# Pair each row's identifier with its similarity score.
data = {
  'Unique_ID': df['Unique_ID'],
  'Similarity_Score': scores,
}
df_to_save = pd.DataFrame(data)

In [None]:
# Write the scores to a CSV file without the DataFrame index column.
df_to_save.to_csv("final_text_similarity_scores.csv",index=False)