In [5]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the documents
corpus = ["I'd like an apple",
          "An apple a day keeps the doctor away",
          "Never compare an apple to an orange",
          "I prefer scikit-learn to Orange",
          "The scikit-learn docs are Orange and Blue"]

# Create a document term matrix
count_vectorizer = CountVectorizer(stop_words ='english')
sparse_matrix = count_vectorizer.fit_transform(corpus)

doc_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names_out())
df

Unnamed: 0,apple,away,blue,compare,day,docs,doctor,keeps,learn,like,orange,prefer,scikit
0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,1,1,0,0,1,0,1,1,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,1,1,1
4,0,0,1,0,0,1,0,0,1,0,1,0,1


In [22]:
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity(df,df))

[[1.         0.31622777 0.40824829 0.         0.        ]
 [0.31622777 1.         0.25819889 0.         0.        ]
 [0.40824829 0.25819889 1.         0.28867513 0.25819889]
 [0.         0.         0.28867513 1.         0.67082039]
 [0.         0.         0.25819889 0.67082039 1.        ]]


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vect = TfidfVectorizer( stop_words='english')
tfidf = vect.fit_transform(corpus)
pairwise_similarity = tfidf * tfidf.T
pairwise_similarity.toarray()


array([[1.        , 0.17668795, 0.27056873, 0.        , 0.        ],
       [0.17668795, 1.        , 0.15439436, 0.        , 0.        ],
       [0.27056873, 0.15439436, 1.        , 0.19635649, 0.16815247],
       [0.        , 0.        , 0.19635649, 1.        , 0.54499756],
       [0.        , 0.        , 0.16815247, 0.54499756, 1.        ]])

In [30]:
arr = pairwise_similarity.toarray()
np.fill_diagonal(arr, np.nan)

input_doc = "The scikit-learn docs are Orange and Blue"
input_idx = corpus.index(input_doc)

# [0.         0.         0.16815247 0.54499756        nan] arr[4] 

[0.         0.         0.16815247 0.54499756        nan]


In [31]:
result_idx =  np.nanargmax(arr[input_idx]) 
# result_idx = 3
corpus[result_idx]

'I prefer scikit-learn to Orange'

In [33]:
import itertools
import numpy as np
from scipy.spatial.distance import cityblock
for idx_1, idx_2 in itertools.combinations(range(tfidf.shape[0]),2):
    v1, v2 = map(lambda idx: tfidf.toarray()[idx],(idx_1,idx_2))
    print(f"{(idx_1,idx_2)}"\
            f" - Euclidien: {np.linalg.norm(v1-v2):.4f}"\
            f" - Manhattan: {cityblock(v1,v2):.4f}")

(0, 1) - Euclidien: 1.2832 - Manhattan: 2.9663
(0, 2) - Euclidien: 1.2078 - Manhattan: 2.1134
(0, 3) - Euclidien: 1.4142 - Manhattan: 3.3671
(0, 4) - Euclidien: 1.4142 - Manhattan: 3.5991
(1, 2) - Euclidien: 1.3005 - Manhattan: 3.2775
(1, 3) - Euclidien: 1.4142 - Manhattan: 4.1938
(1, 4) - Euclidien: 1.4142 - Manhattan: 4.4258
(2, 3) - Euclidien: 1.2678 - Manhattan: 2.8707
(2, 4) - Euclidien: 1.2898 - Manhattan: 3.2187
(3, 4) - Euclidien: 0.9539 - Manhattan: 1.8335


In [34]:
from scipy import stats

formater = lambda t:', '.join(('%.5f' % f) for f in t)
for idx_1, idx_2 in itertools.combinations(range(tfidf.shape[0]),2):
    v1, v2 = map(lambda idx: tfidf.toarray()[idx], (idx_1,idx_2))
    print(f"{(idx_1, idx_2)}"\
         f" - Pearson: {formater(stats.pearsonr(v1, v2))}")

(0, 1) - Pearson: -0.08180, 0.79051
(0, 2) - Pearson: 0.10969, 0.72130
(0, 3) - Pearson: -0.27388, 0.36521
(0, 4) - Pearson: -0.32381, 0.28046
(1, 2) - Pearson: -0.19374, 0.52594
(1, 3) - Pearson: -0.51116, 0.07422
(1, 4) - Pearson: -0.60433, 0.02869
(2, 3) - Pearson: -0.08453, 0.78367
(2, 4) - Pearson: -0.17345, 0.57092
(3, 4) - Pearson: 0.31538, 0.29388


In [35]:
from scipy import stats

formater = lambda t:', '.join(('%.5f' % f) for f in t)
for idx_1, idx_2 in itertools.combinations(range(tfidf.shape[0]),2):
    v1, v2 = map(lambda idx: tfidf.toarray()[idx], (idx_1,idx_2))
    print(f"{(idx_1, idx_2)}"\
         f" - Spearman: {formater(stats.spearmanr(v1, v2))}")

(0, 1) - Spearman: -0.05579, 0.85636
(0, 2) - Spearman: 0.19543, 0.52226
(0, 3) - Spearman: -0.27798, 0.35778
(0, 4) - Spearman: -0.32487, 0.27879
(1, 2) - Spearman: -0.18814, 0.53821
(1, 3) - Spearman: -0.50753, 0.07665
(1, 4) - Spearman: -0.59313, 0.03263
(2, 3) - Spearman: -0.08206, 0.78985
(2, 4) - Spearman: -0.18541, 0.54422
(3, 4) - Spearman: 0.27317, 0.36651


In [43]:
data = pd.read_csv('Text_Similarity_Dataset.csv')
data


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...
...,...,...,...
4018,4018,labour plans maternity pay rise maternity pay ...,no seasonal lift for house market a swathe of ...
4019,4019,high fuel costs hit us airlines two of the lar...,new media battle for bafta awards the bbc lead...
4020,4020,britons growing digitally obese gadget lover...,film star fox behind theatre bid leading actor...
4021,4021,holmes is hit by hamstring injury kelly holmes...,tsunami to hit sri lanka banks sri lanka s b...


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import itertools
from pandas import DataFrame, merge
from sklearn.metrics.pairwise import cosine_similarity
cart_df = pd.read_csv('Text_Similarity_Dataset.csv')

# cart_product = list(itertools.product(df['text1'],df['text2']))

# cart_df = pd.DataFrame(cart_product, columns=['text1','text2'])

In [6]:
df = cart_df.copy()

# Count_Vectorizer

In [7]:
cos_similarity = []
for i in range(len(df)):
    doc1=df['text1'][i]
    doc2=df['text2'][i]
    docs=(doc1,doc2)
    matrix = CountVectorizer().fit_transform(docs)
    cosine_sim = cosine_similarity(matrix[0], matrix[1])
    cos_similarity.append(cosine_sim)
df['Count_Vectorizer'] = cos_similarity

# TF-IDF Vectorizer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
sim_list = []
vect = TfidfVectorizer(stop_words='english')

for i in range(len(df)):
    doc1=df['text1'][i]
    doc2=df['text2'][i]
    docs=(doc1,doc2)
    tfidf_matrix = vect.fit_transform(docs)
    sim = cosine_similarity(tfidf_matrix[0],tfidf_matrix[1])
    sim_list.append(sim)
df['TF_IDF_Similarity'] = sim_list

In [9]:
df

Unnamed: 0,Unique_ID,text1,text2,Count_Vectorizer,TF_IDF_Similarity
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...,[[0.5121263855362397]],[[0.018400340744745478]]
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...,[[0.6867717040468696]],[[0.033814752892231334]]
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...,[[0.6365378822840906]],[[0.05273885434484543]]
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...,[[0.4020487524843295]],[[0.04086775571951435]]
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...,[[0.7648826352194371]],[[0.05168140016827307]]
...,...,...,...,...,...
4018,4018,labour plans maternity pay rise maternity pay ...,no seasonal lift for house market a swathe of ...,[[0.6487492098922014]],[[0.07389229110279227]]
4019,4019,high fuel costs hit us airlines two of the lar...,new media battle for bafta awards the bbc lead...,[[0.6605570566273307]],[[0.05014187171483796]]
4020,4020,britons growing digitally obese gadget lover...,film star fox behind theatre bid leading actor...,[[0.6799340870442956]],[[0.0719717480291427]]
4021,4021,holmes is hit by hamstring injury kelly holmes...,tsunami to hit sri lanka banks sri lanka s b...,[[0.5454415845071072]],[[0.02894578397832087]]


In [10]:
df_distance = cart_df.copy()

In [18]:
from scipy.spatial.distance import cityblock
import numpy as np
Vectorizer = TfidfVectorizer(stop_words = 'english')
euclidean_distance = []
manhattan_distance = []
for i in range(len(df)):
    doc1=df['text1'][i]
    doc2=df['text2'][i]
    docs=(doc1,doc2)
    tfidf_matrix = Vectorizer.fit_transform(docs)
    euclid_distance = np.linalg.norm(tfidf_matrix.toarray()[0]-tfidf_matrix.toarray()[1])
    manh_distance = cityblock(tfidf_matrix.toarray()[0],tfidf_matrix.toarray()[1])
    euclidean_distance.append(euclid_distance)
    manhattan_distance.append(manh_distance)
df_distance['euclid_distance'] = euclidean_distance
df_distance['manhattan_distance'] = manhattan_distance
df_distance
    # euclid_distance = np.linalg.norm(tfidf_matrix[0]-tfidf_matrix[1]) 
    # Manhattan_distance = cityblock(tfidf_matrix[0],tfidf_matrix[1]) 

Unnamed: 0,Unique_ID,text1,text2,euclid_distance,manhattan_distance
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...,1.401142,20.889044
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...,1.390097,18.716633
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...,1.376416,19.894704
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...,1.385014,14.744520
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...,1.377185,24.002366
...,...,...,...,...,...
4018,4018,labour plans maternity pay rise maternity pay ...,no seasonal lift for house market a swathe of ...,1.360961,17.934720
4019,4019,high fuel costs hit us airlines two of the lar...,new media battle for bafta awards the bbc lead...,1.378302,16.993757
4020,4020,britons growing digitally obese gadget lover...,film star fox behind theatre bid leading actor...,1.362372,21.289230
4021,4021,holmes is hit by hamstring injury kelly holmes...,tsunami to hit sri lanka banks sri lanka s b...,1.393596,16.081390


In [19]:
df_distance.to_csv('df_distance.csv', index = False)

In [21]:
df.to_csv('df_cosines_similarity.csv',index = False)