In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from scipy import spatial
from sklearn.externals import joblib
stopword = stopwords.words('english')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




True

In [0]:
data=pd.read_csv("/content/drive/My Drive/Colab Notebooks/precily/Text_Similarity_Dataset.csv")

In [0]:
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


# Data Cleaning

In [0]:
def clean(col):
  data[col]=data[col].map(lambda x:x.lower())
  data[col]=data[col].map(lambda x:re.sub(r'\d+','',x))
  data[col]=data[col].map(lambda x:x.translate(str.maketrans('', '', string.punctuation)))
  data[col]=data[col].map(lambda x:x.strip())
  data[col]=data[col].map(lambda x:nltk.word_tokenize(x))
  data[col]=data[col].map(lambda x:' '.join([word for word in x if word not in stopword]))

In [0]:
clean('text1')
clean('text2')

In [0]:
data.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail spot ads internet search ...,newcastle bolton kieron dyer smashed home winn...
1,1,millions miss net uk population still without ...,nasdaq planning share sale owner technologydom...
2,2,young debut cut short ginepri fifteenyearold d...,ruddock backs yapp credentials wales coach mik...
3,3,diageo buy us wine firm diageo world biggest s...,mci shares climb takeover bid shares us phone ...
4,4,careful code new european directive could put ...,media gadgets get moving pocketsized devices l...


## Using Doc2Vec

In [0]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument

In [0]:
d=[]
for i in range(data.shape[0]):
  d.append(TaggedDocument(data["text1"][i].split(),tags=data['Unique_ID']))
  d.append(TaggedDocument(data['text2'][i].split(),tags=data['Unique_ID']))

In [0]:
max_epochs = 100
vec_size = 20
alpha = 0.025
model = Doc2Vec(size=vec_size,alpha=alpha,min_alpha=0.00025,min_count=1,dm =1)



In [0]:
model.build_vocab(d)

In [0]:
for epoch in range(10):
    model.train(d,epochs=model.iter,total_examples=model.corpus_count)
    print("Epoch #{} is complete.".format(epoch+1))

  


Epoch #1 is complete.
Epoch #2 is complete.
Epoch #3 is complete.
Epoch #4 is complete.
Epoch #5 is complete.
Epoch #6 is complete.
Epoch #7 is complete.
Epoch #8 is complete.
Epoch #9 is complete.
Epoch #10 is complete.


# Storing Model

In [0]:
#filename="/content/drive/My Drive/Colab Notebooks/precily/model.sav"
joblib.dump(model, filename)



['/content/drive/My Drive/Colab Notebooks/precily/model.sav']

In [0]:
model1=joblib.load("/content/drive/My Drive/Colab Notebooks/precily/model.sav")

In [0]:
model1.most_similar("firm")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('buy', 0.9624813199043274),
 ('buyer', 0.9541382789611816),
 ('telecoms', 0.9523208141326904),
 ('sina', 0.9468142986297607),
 ('giant', 0.9437302350997925),
 ('purchase', 0.9434924125671387),
 ('company', 0.9430593252182007),
 ('ownership', 0.9392997622489929),
 ('remittance', 0.9391176104545593),
 ('disposal', 0.933521032333374)]

In [0]:
score=[]
for i in range(data.shape[0]):
  vec1=model1.infer_vector(data['text1'][i])
  vec2=model1.infer_vector(data['text2'][i])
  score.append(spatial.distance.cosine(vec1,vec2))

In [0]:
df=pd.DataFrame(score,data['Unique_ID'],columns=['Similarity_Score'])
df.to_csv("/content/drive/My Drive/Colab Notebooks/precily/result.txt")

# Using Tf-Idf

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(data["text1"])
X_test=vectorizer.transform(data['text2'])

In [0]:
res=X.dot(X_test.T)
r=res.todense()
l=[]
for i in range(res.shape[0]):
  l.append(r[i,i])

In [0]:
df=pd.DataFrame(l,data['Unique_ID'],columns=['Similarity_Score'])
df.to_csv("/content/drive/My Drive/Colab Notebooks/precily/result.txt")