In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from nltk import word_tokenize

In [2]:
df = pd.read_csv('Precily_Text_Similarity.csv')
df.head()

Unnamed: 0,text1,text2
0,broadband challenges tv viewing the number of ...,gardener wins double in glasgow britain s jaso...
1,rap boss arrested over drug find rap mogul mar...,amnesty chief laments war failure the lack of ...
2,player burn-out worries robinson england coach...,hanks greeted at wintry premiere hollywood sta...
3,hearts of oak 3-2 cotonsport hearts of oak set...,redford s vision of sundance despite sporting ...
4,sir paul rocks super bowl crowds sir paul mcca...,mauresmo opens with victory in la amelie maure...


In [3]:
stemmer = PorterStemmer()
def stem_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])
df['text1'] = df['text1'].apply(lambda x: stem_words(x))
df['text2'] = df['text2'].apply(lambda x: stem_words(x))

In [4]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
df['text1'] = df['text1'].apply(lambda x: lemmatize_words(x))
df['text2'] = df['text2'].apply(lambda x: lemmatize_words(x))

In [5]:
def text_process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return ' '.join([word for word in nopunc.split() if word.lower() not in stopwords.words('english') and not word.isdigit()])

In [6]:
df.shape

(3000, 2)

In [7]:
df['text1'] = df['text1'].apply(lambda x: text_process(x))
df['text2'] = df['text2'].apply(lambda x: text_process(x))

In [8]:
saved_df = df.copy()

In [9]:
def jaccard_similarity(a,b):
    intersection = set(a).intersection(set(b))
    union = set(a).union(set(b))
    return len(intersection)/len(union)

In [10]:
jaccard_similarity(df['text1'][0],df['text2'][0])

0.7222222222222222

In [11]:
bow_transformer1 = CountVectorizer(analyzer=text_process).fit(df['text1'])
bow_transformer2 = CountVectorizer(analyzer=text_process).fit(df['text2'])
print(bow_transformer1)
print(bow_transformer2)

CountVectorizer(analyzer=<function text_process at 0x000001F31510F0D0>)
CountVectorizer(analyzer=<function text_process at 0x000001F31510F0D0>)


In [12]:
len(bow_transformer1.vocabulary_), len(bow_transformer2.vocabulary_)

(38, 38)

In [13]:
text4 = df['text1'][3]
bow4 = bow_transformer1.transform([text4])
print(bow4)

  (0, 0)	167
  (0, 4)	1
  (0, 6)	1
  (0, 9)	1
  (0, 10)	1
  (0, 11)	95
  (0, 12)	17
  (0, 13)	32
  (0, 14)	23
  (0, 15)	88
  (0, 16)	24
  (0, 17)	22
  (0, 18)	23
  (0, 19)	56
  (0, 20)	3
  (0, 21)	19
  (0, 22)	39
  (0, 23)	31
  (0, 24)	55
  (0, 25)	70
  (0, 26)	12
  (0, 27)	4
  (0, 28)	55
  (0, 29)	35
  (0, 30)	72
  (0, 31)	29
  (0, 32)	3
  (0, 33)	13
  (0, 34)	1
  (0, 35)	6
  (0, 36)	2


In [14]:
bow_text1 = bow_transformer1.transform(df['text1'])
print(bow_text1.shape)

(3000, 38)


In [15]:
bow_text2 = bow_transformer2.transform(df['text2'])
print(bow_text2.shape)

(3000, 38)


In [16]:
tfidf_transformer1 = TfidfTransformer().fit(bow_text1)
tfidf_transformer2 = TfidfTransformer().fit(bow_text2)

In [17]:
tfidf4 = tfidf_transformer1.transform(bow4)
print(tfidf4)

  (0, 36)	0.013055846780139978
  (0, 35)	0.022250077435161244
  (0, 34)	0.004138953140290266
  (0, 33)	0.04820850110951602
  (0, 32)	0.011128746445899422
  (0, 31)	0.10754204093661267
  (0, 30)	0.2670009292219349
  (0, 29)	0.1297921183717739
  (0, 28)	0.20395904315564473
  (0, 27)	0.0214428335884421
  (0, 26)	0.04450015487032249
  (0, 25)	0.2595842367435478
  (0, 24)	0.20395904315564473
  (0, 23)	0.11495873341499975
  (0, 22)	0.14462550332854807
  (0, 21)	0.07088246164593452
  (0, 20)	0.012989252523926654
  (0, 19)	0.20766738939483825
  (0, 18)	0.08529196350145143
  (0, 17)	0.0815836172622579
  (0, 16)	0.08900030974064498
  (0, 15)	0.3263344690490316
  (0, 14)	0.08529196350145143
  (0, 13)	0.11866707965419329
  (0, 12)	0.06304188606629019
  (0, 11)	0.3522928927233863
  (0, 10)	0.011089971430165385
  (0, 9)	0.010339242840822743
  (0, 6)	0.008505999327078297
  (0, 4)	0.009006977779888707
  (0, 0)	0.6192938219453212


In [18]:
tfidf4.get_shape()

(1, 38)

In [19]:
tfidf_text1 = tfidf_transformer1.transform(bow_text1)
tfidf_text2 = tfidf_transformer2.transform(bow_text2)

In [20]:
print("Amount of non-zero values in TFIDF of text 1:",tfidf_text1.nnz)
print("Amount of non-zero values in TFIDF of text 2:",tfidf_text2.nnz)

Amount of non-zero values in TFIDF of text 1: 85549
Amount of non-zero values in TFIDF of text 2: 85711


In [21]:
print("Shape of TFIDF of text 1:",tfidf_text1.shape)
print("Shape of TFIDF of text 2:",tfidf_text2.shape)

Shape of TFIDF of text 1: (3000, 38)
Shape of TFIDF of text 2: (3000, 38)


In [22]:
print("Sparsity of text 1:",str(np.round((tfidf_text1.nnz/(tfidf_text1.shape[0]*tfidf_text1.shape[1]))*100,2)) + '%')
print("Sparsity of text 2:",str(np.round((tfidf_text2.nnz/(tfidf_text2.shape[0]*tfidf_text2.shape[1]))*100,2)) + '%')

Sparsity of text 1: 75.04%
Sparsity of text 2: 75.19%


In [24]:
cos_similarity = [[]]
for i in range(tfidf_text1.shape[0]):
    cos_similarity.append(cosine_similarity(tfidf_text1[i],tfidf_text2))
cos_similarity = pd.DataFrame(cos_similarity)
cos_similarity.drop(index=cos_similarity.index[0], 
        axis=0, 
        inplace=True)
cos_similarity.head()

Unnamed: 0,0
1,"[0.9785293861463167, 0.9887675564206015, 0.988..."
2,"[0.970073371768313, 0.9818744094402776, 0.9865..."
3,"[0.9643083804078563, 0.9701381455232274, 0.962..."
4,"[0.9759967531076594, 0.982251639784895, 0.9806..."
5,"[0.9834646778322373, 0.9882872871600276, 0.994..."


In [25]:
cos_similarity.shape

(3000, 1)

In [26]:
cosine_similarity_matrix = pd.DataFrame()
for i in range(cos_similarity.shape[0]):
    cosine_similarity_matrix = pd.concat([cosine_similarity_matrix,(pd.DataFrame(cos_similarity.iloc[i].values.tolist()).T)],axis=1)
cosine_similarity_matrix.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.10,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19
0,0.978529,0.970073,0.964308,0.975997,0.983465,0.974899,0.975446,0.981171,0.974635,0.971818,...,0.976139,0.97735,0.98267,0.970279,0.969073,0.979111,0.982001,0.961637,0.97926,0.975768
1,0.988768,0.981874,0.970138,0.982252,0.988287,0.98137,0.97891,0.989982,0.980058,0.979575,...,0.981408,0.985446,0.983493,0.978564,0.971599,0.977041,0.990116,0.969929,0.981814,0.987708
2,0.988882,0.986596,0.962266,0.980612,0.994628,0.977163,0.977388,0.986011,0.980424,0.985989,...,0.98222,0.980308,0.975918,0.981216,0.977847,0.976986,0.986111,0.960132,0.980092,0.977371
3,0.984609,0.973072,0.960169,0.976659,0.985454,0.973015,0.982949,0.994004,0.984009,0.9824,...,0.988786,0.983359,0.991103,0.984215,0.97829,0.985069,0.983268,0.969183,0.984444,0.984504
4,0.991294,0.981296,0.95634,0.986461,0.989491,0.980954,0.970051,0.984327,0.974304,0.979598,...,0.974879,0.974234,0.98127,0.97189,0.969613,0.974051,0.984597,0.962306,0.977571,0.974417


In [27]:
cosine_similarity_matrix.shape

(3000, 3000)

In [28]:
cosine_similarity_matrix.idxmin()

0    707
0    707
0    707
0    614
0    707
    ... 
0    586
0    707
0    614
0    707
0    707
Length: 3000, dtype: int64

In [29]:
cosine_similarity_matrix.iloc[707].min()

0.881484386605357

In [30]:
wv = api.load('word2vec-google-news-300')

In [31]:
similarity = []

for idx in df.index:
    t1 = df['text1'][idx]
    t2 = df['text2'][idx]
    
    if t1 == t2:
        similarity.append(1)
    else:
        t1_words = word_tokenize(t1)
        t2_words = word_tokenize(t2)
        vocab = wv.vocab
        
        if len(t1_words and t2_words) == 0:
            similarity.append(0)
        else:
            for word in t1_words.copy():
                if word not in vocab:
                    t1_words.remove(word)
            for word in t2_words.copy():
                if word not in vocab:
                    t2_words.remove(word)
            similarity.append(wv.n_similarity(t1_words,t2_words))

In [32]:
similarity = pd.DataFrame(similarity)
similarity.head()

Unnamed: 0,0
0,0.73864
1,0.667202
2,0.77582
3,0.658866
4,0.865341


In [33]:
df = pd.concat([df,similarity],axis=1)
df.head()

Unnamed: 0,text1,text2,0
0,broadband challeng tv view number european bro...,garden win doubl glasgow britain jason garden ...,0.73864
1,rap bos arrest drug find rap mogul marion suge...,amnesti chief lament war failur lack public ou...,0.667202
2,player burnout worri robinson england coach an...,hank greet wintri premier hollywood star tom h...,0.77582
3,heart oak cotonsport heart oak set ghanaian co...,redford vision sundanc despit sport corduroy c...,0.658866
4,sir paul rock super bowl crowd sir paul mccart...,mauresmo open victori la ameli mauresmo maria ...,0.865341


In [35]:
df.columns = ['text1','text2','Similarity Score']

In [36]:
df.head()

Unnamed: 0,text1,text2,Similarity Score
0,broadband challeng tv view number european bro...,garden win doubl glasgow britain jason garden ...,0.73864
1,rap bos arrest drug find rap mogul marion suge...,amnesti chief lament war failur lack public ou...,0.667202
2,player burnout worri robinson england coach an...,hank greet wintri premier hollywood star tom h...,0.77582
3,heart oak cotonsport heart oak set ghanaian co...,redford vision sundanc despit sport corduroy c...,0.658866
4,sir paul rock super bowl crowd sir paul mccart...,mauresmo open victori la ameli mauresmo maria ...,0.865341


In [37]:
df.to_csv('similarity_scores.csv')

In [38]:
pip install voila

Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.0.0 requires pyqt5<5.13, which is not installed.
spyder 5.0.0 requires pyqtwebengine<5.13, which is not installed.
dash 2.0.0 requires dash-table==5.0.0, which is not installed.
anaconda-project 0.10.2 requires ruamel-yaml, which is not installed.
spyder 5.0.0 requires jedi==0.17.2, but you have jedi 0.18.1 which is incompatible.
spyder 5.0.0 requires parso==0.7.0, but you have parso 0.8.2 which is incompatible.


Collecting voila
  Downloading voila-0.3.5-py3-none-any.whl (1.7 MB)
Collecting websockets>=9.0
  Downloading websockets-10.2-cp38-cp38-win_amd64.whl (97 kB)
Collecting nbconvert<7,>=6.4.5
  Downloading nbconvert-6.5.0-py3-none-any.whl (561 kB)
Collecting pywinpty<2
  Downloading pywinpty-1.1.6-cp38-none-win_amd64.whl (1.4 MB)
Collecting tinycss2
  Downloading tinycss2-1.1.1-py3-none-any.whl (21 kB)
Collecting MarkupSafe>=0.23
  Downloading MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl (17 kB)
Collecting jinja2
  Downloading Jinja2-3.1.1-py3-none-any.whl (132 kB)
Installing collected packages: MarkupSafe, tinycss2, pywinpty, jinja2, nbconvert, websockets, voila
  Attempting uninstall: MarkupSafe
    Found existing installation: MarkupSafe 1.1.1
    Uninstalling MarkupSafe-1.1.1:
      Successfully uninstalled MarkupSafe-1.1.1
  Attempting uninstall: pywinpty
    Found existing installation: pywinpty 2.0.2
    Uninstalling pywinpty-2.0.2:
      Successfully uninstalled pywinpty-2.0.2
  Attem

cookiecutter 1.7.2 requires Jinja2<3.0.0, but you have jinja2 3.1.1 which is incompatible.
cookiecutter 1.7.2 requires MarkupSafe<2.0.0, but you have markupsafe 2.1.1 which is incompatible.
