In [1]:
# import 

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

In [2]:
# compute cosine_similarity

def cosine_similarity(list_corpus: list) -> object:
    """Return the pairwise TF-IDF cosine similarity of the given documents.

    The documents are vectorised with ``TfidfVectorizer`` (English stop words
    removed, ``min_df=1``). The vectorizer is fitted on ``list_corpus`` itself,
    so the vocabulary and IDF weights - and therefore the scores - depend on
    which documents are passed together.

    Parameters
    ----------
    list_corpus : list
        Documents (strings) to compare.

    Returns
    -------
    Sparse square matrix with one row and one column per document; entry
    ``[i, j]`` is the similarity between document ``i`` and document ``j``.
    Call ``.toarray()`` on it to get a dense array.
    """

    vect = TfidfVectorizer(min_df=1, stop_words="english")
    tfidf = vect.fit_transform(list_corpus)

    # TfidfVectorizer L2-normalises each row by default (norm="l2"), so the
    # matrix product of the TF-IDF matrix with its transpose is the cosine
    # similarity. `@` is used rather than `*` because `*` is element-wise on
    # SciPy sparse arrays, while `@` is a matrix product for both sparse
    # matrices and sparse arrays.
    pairwise_similarity = tfidf @ tfidf.T

    return pairwise_similarity

In [3]:
# small test corpus: six short sentences sharing a few words (apple, orange, scikit-learn)

corpus_0 = [
    "I'd like to eat an apple",
    "Never compare an apple to an orange",
    "I prefer scikit-learn to Orange",
    "The scikit-learn docs are Orange and Blue",
    "An apple a day keeps the doctor away",
    "Apple is good for health",
]

In [4]:
# pairwise cosine similarity matrix of the test corpus

cos_0 = cosine_similarity(corpus_0)

In [5]:
# display the similarity matrix as a DataFrame; row i and column j refer to
# the i-th and j-th strings of corpus_0

df_0 = pd.DataFrame(cos_0.toarray())
df_0

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.16959,0.0,0.0,0.11001,0.149644
1,0.16959,1.0,0.210759,0.181119,0.124673,0.16959
2,0.0,0.210759,1.0,0.555071,0.0,0.0
3,0.0,0.181119,0.555071,1.0,0.0,0.0
4,0.11001,0.124673,0.0,0.0,1.0,0.11001
5,0.149644,0.16959,0.0,0.0,0.11001,1.0


In [6]:
# two variants of the same company name, with and without the "LLC" suffix

corpus_1 = [
    "allied market",
    "allied market LLC",
]

In [7]:
# similarity matrix of the two company-name variants, displayed as a DataFrame

cos_1 = cosine_similarity(corpus_1)
df_1 = pd.DataFrame(cos_1.toarray())
df_1

Unnamed: 0,0,1
0,1.0,0.709297
1,0.709297,1.0


# Hypothesis

In [25]:
# ground truth (y), an exact prediction (pred) and a voluntarily degraded prediction (pred_mod)
# pred_mod adds or drops tokens ("LLC", "Mr", "Senior", missing "Howard") and misspells the
# last name; the misspelling is the one shown in the recorded outputs of this notebook
# ('chawaaalit woongkhiao senior'), so a fresh run reproduces them.

y = "Allied Markets,Joshua Gilliland,Chawalit Wongkhiao"
pred = "Joshua Gilliland,Chawalit Wongkhiao,Marcia Morales Howard,Allied Markets"
pred_mod = "Allied Markets LLC,Mr Joshua Gilliland,Marcia Morales,Chawaaalit Woongkhiao Senior"

In [9]:
# split each comma-separated string into a list of stripped, lower-cased names

y_list = [name.strip().lower() for name in y.split(",")]
pred_list = [name.strip().lower() for name in pred.split(",")]
pred_list_mod = [name.strip().lower() for name in pred_mod.split(",")]

In [10]:
# print the three name lists, one per line

print(y_list, pred_list, pred_list_mod, sep="\n")

['allied markets', 'joshua gilliland', 'chawalit wongkhiao']
['joshua gilliland', 'chawalit wongkhiao', 'marcia morales howard', 'allied markets']
['allied markets llc', 'mr joshua gilliland', 'marcia morales', 'chawaaalit woongkhiao senior']


In [11]:
# one small corpus per prediction: the prediction first, followed by every true name

y_pred_list = [[candidate, *y_list] for candidate in pred_list]
y_pred_list_mod = [[candidate, *y_list] for candidate in pred_list_mod]
y_pred_list

[['joshua gilliland',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['chawalit wongkhiao',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['marcia morales howard',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['allied markets',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao']]

# Perfect predictions

In [12]:
# one dense 2-D cosine similarity array per [prediction + true names] corpus

cos_y_pred_arrays = [cosine_similarity(docs).toarray() for docs in y_pred_list]
cos_y_pred_arrays

[array([[1., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 1., 0.],
        [0., 0., 0., 1.]]),
 array([[1., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 1.]]),
 array([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]]),
 array([[1., 1., 0., 0.],
        [1., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])]

In [13]:
# keep only the first row of each array (the prediction against everything) and
# drop its first value, which is the prediction compared with itself (always 1 on the diagonal)

cos_y_pred_list = np.array([sim[0, 1:] for sim in cos_y_pred_arrays])
cos_y_pred_list

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [1., 0., 0.]])

In [14]:
# labelled view: one row per prediction, one column per true name
pd.DataFrame(cos_y_pred_list, columns=y_list, index=pred_list)

Unnamed: 0,allied markets,joshua gilliland,chawalit wongkhiao
joshua gilliland,0.0,1.0,0.0
chawalit wongkhiao,0.0,0.0,1.0
marcia morales howard,0.0,0.0,0.0
allied markets,1.0,0.0,0.0


In [15]:
# best similarity of each prediction against the true names (self-comparison already removed)
max_cos_y_pred = np.array([max(row) for row in cos_y_pred_list])
max_cos_y_pred

array([1., 1., 0., 1.])

In [16]:
# best similarity per prediction, rounded to 2 decimals
{name: round(score, 2) for name, score in zip(pred_list, max_cos_y_pred)}

{'joshua gilliland': 1.0,
 'chawalit wongkhiao': 1.0,
 'marcia morales howard': 0.0,
 'allied markets': 1.0}

In [17]:
# mean of the per-prediction best similarities
max_cos_y_pred.mean()

0.7499999999999998

# Modified predictions

In [18]:
# corpora built from the degraded predictions: degraded prediction first, then the true names
y_pred_list_mod

[['allied markets llc',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['mr joshua gilliland',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['marcia morales',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao'],
 ['chawaaalit woongkhiao senior',
  'allied markets',
  'joshua gilliland',
  'chawalit wongkhiao']]

In [19]:
# one dense 2-D cosine similarity array per [degraded prediction + true names] corpus
cos_y_pred_arrays_mod = [cosine_similarity(docs).toarray() for docs in y_pred_list_mod]
cos_y_pred_arrays_mod

[array([[1.       , 0.7444497, 0.       , 0.       ],
        [0.7444497, 1.       , 0.       , 0.       ],
        [0.       , 0.       , 1.       , 0.       ],
        [0.       , 0.       , 0.       , 1.       ]]),
 array([[1.       , 0.       , 0.7444497, 0.       ],
        [0.       , 1.       , 0.       , 0.       ],
        [0.7444497, 0.       , 1.       , 0.       ],
        [0.       , 0.       , 0.       , 1.       ]]),
 array([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]]),
 array([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])]

In [20]:
# first row of each array without its first value (the prediction compared with itself)
cos_y_pred_list_mod = np.array([sim[0, 1:] for sim in cos_y_pred_arrays_mod])
cos_y_pred_list_mod

array([[0.7444497, 0.       , 0.       ],
       [0.       , 0.7444497, 0.       ],
       [0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       ]])

In [21]:
# labelled view: one row per degraded prediction, one column per true name
pd.DataFrame(cos_y_pred_list_mod, columns=y_list, index=pred_list_mod)

Unnamed: 0,allied markets,joshua gilliland,chawalit wongkhiao
allied markets llc,0.74445,0.0,0.0
mr joshua gilliland,0.0,0.74445,0.0
marcia morales,0.0,0.0,0.0
chawaaalit woongkhiao senior,0.0,0.0,0.0


In [22]:
# best similarity of each degraded prediction against the true names
max_cos_y_pred_mod = np.array([max(row) for row in cos_y_pred_list_mod])
max_cos_y_pred_mod

array([0.7444497, 0.7444497, 0.       , 0.       ])

In [23]:
# best similarity per degraded prediction, rounded to 2 decimals
{name: round(score, 2) for name, score in zip(pred_list_mod, max_cos_y_pred_mod)}

{'allied markets llc': 0.74,
 'mr joshua gilliland': 0.74,
 'marcia morales': 0.0,
 'chawaaalit woongkhiao senior': 0.0}

In [24]:
# mean of the per-prediction best similarities for the degraded predictions
max_cos_y_pred_mod.mean()

0.3722248517590162

# Difference between 'alexandre' and 'alexxandre' -> 0.9

In [None]:
# single-word pair differing by one doubled letter
# NOTE(review): this rebinds `pred` and `y`, which the hypothesis section also
# defines; re-running the string-to-list cell after this one would use these values.
pred = "alexandre"
y = "alexxandre"