#**Cosine Similarity for Data Mapping**


In [1427]:
!pip install pandas scikit-learn



In [1428]:
import pandas as pd

In [1429]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [1430]:
# We can load csv files or json or input data manually.
# column 1 = pd.read_csv('filename.csv')
# column 2 = pd.read_csv('secondfilename.csv')
#data = pd.read_json('data.json')
data1 =  {'my_format' : ['B id type', 'B id code', 'B name', 'B dob', 'B cob', 'B extra field']}
data2 =  {'other_format' : ['B id type', 'B id code',  'B first name', 'B date', 'B country']}

In [1431]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

In [1432]:
df1
#len(df1)

Unnamed: 0,my_format
0,B id type
1,B id code
2,B name
3,B dob
4,B cob
5,B extra field


In [1433]:
df2
#len(df2)

Unnamed: 0,other_format
0,B id type
1,B id code
2,B first name
3,B date
4,B country


In [1434]:
combined_text = df1['my_format'].tolist() + df2['other_format'].tolist()
combined_text

['B id type',
 'B id code',
 'B name',
 'B dob',
 'B cob',
 'B extra field',
 'B id type',
 'B id code',
 'B first name',
 'B date',
 'B country']

In [1435]:

# Now use the combined list with TfidfVectorizer
vectorizer = TfidfVectorizer().fit_transform(combined_text)

In [1436]:
vectors = vectorizer.toarray()

In [1437]:
#len(vectors)
vectors

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.78623459],
       [0.        , 0.78623459, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.70710678, 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.

In [1438]:
vec1 = vectors[:len(df1)]
vec2 = vectors[len(df1):]

In [1439]:
vec1

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.78623459],
       [0.        , 0.78623459, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.70710678, 0.70710678, 0.        , 0.        , 0.        ,
        0.        ]])

In [1440]:
vec2

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.78623459],
       [0.        , 0.78623459, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61792812, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.76014955, 0.        , 0.64974816,
        0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [1441]:
cosine_similarity = cosine_similarity(vec1, vec2)

In [1442]:
matches = []
for i in range(len(df1)):
  best_match_index = cosine_similarity[i].argmax()
  best_match_score = cosine_similarity[i].max()
  matches.append((i, best_match_index, best_match_score))

In [1443]:
matches

[(0, 0, 1.0),
 (1, 1, 1.0000000000000002),
 (2, 2, 0.649748158161005),
 (3, 0, 0.0),
 (4, 0, 0.0),
 (5, 0, 0.0)]

In [1444]:
for match in matches:
  print(f"table 1 column {match[0] } matches with table 2 column {match[1]} with similarity {match[2]:.2f}")

table 1 column 0 matches with table 2 column 0 with similarity 1.00
table 1 column 1 matches with table 2 column 1 with similarity 1.00
table 1 column 2 matches with table 2 column 2 with similarity 0.65
table 1 column 3 matches with table 2 column 0 with similarity 0.00
table 1 column 4 matches with table 2 column 0 with similarity 0.00
table 1 column 5 matches with table 2 column 0 with similarity 0.00


#**XGB model**


In [1445]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [1446]:
df1

Unnamed: 0,my_format
0,B id type
1,B id code
2,B name
3,B dob
4,B cob
5,B extra field


In [1447]:
df1 = df1.drop(5) #for model development data has to be of equal length.

In [1448]:
df1 #view the data after dropping an extra field

Unnamed: 0,my_format
0,B id type
1,B id code
2,B name
3,B dob
4,B cob


In [1449]:
df2

Unnamed: 0,other_format
0,B id type
1,B id code
2,B first name
3,B date
4,B country


In [1450]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [1451]:
# Compute embeddings for both columns
embeddings1 = model.encode(df1['my_format'].tolist())
embeddings2 = model.encode(df2['other_format'].tolist())


In [1452]:
embeddings1

array([[-0.04094651,  0.01629302, -0.04280036, ...,  0.05585571,
        -0.01917019, -0.03783623],
       [-0.06923592,  0.03613158, -0.08476153, ...,  0.11149269,
        -0.05108549, -0.06808497],
       [-0.08776868,  0.04008351, -0.08625089, ..., -0.01336384,
        -0.01711275, -0.00095323],
       [-0.03333949, -0.00131321, -0.04713215, ..., -0.03108094,
        -0.06822322, -0.06418302],
       [-0.00790826,  0.01315571, -0.10467219, ..., -0.03033795,
        -0.01722167, -0.03376266]], dtype=float32)

In [1453]:
embeddings2

array([[-0.04094651,  0.01629302, -0.04280036, ...,  0.05585571,
        -0.01917019, -0.03783623],
       [-0.06923592,  0.03613158, -0.08476153, ...,  0.11149269,
        -0.05108549, -0.06808497],
       [-0.07606318,  0.02691524, -0.05429286, ..., -0.00535081,
         0.03006143, -0.01896352],
       [-0.03380917,  0.04730963,  0.01403578, ..., -0.01873691,
        -0.0139486 , -0.03578657],
       [ 0.04337719,  0.01816363, -0.04434117, ...,  0.00360834,
        -0.07644355, -0.03095439]], dtype=float32)

In [1454]:
# Compute cosine similarities between embeddings
similarities = util.pytorch_cos_sim(embeddings1, embeddings2)

In [1455]:
similarities

tensor([[1.0000, 0.8366, 0.4887, 0.3809, 0.4250],
        [0.8366, 1.0000, 0.4411, 0.3403, 0.3957],
        [0.5812, 0.5128, 0.8775, 0.5413, 0.5617],
        [0.4589, 0.4032, 0.5293, 0.4963, 0.5730],
        [0.4814, 0.4124, 0.5604, 0.5051, 0.5515]])

In [1456]:
similarity_df = pd.DataFrame(similarities.numpy(), index=df1['my_format'], columns=df2['other_format'])
similarity_df

other_format,B id type,B id code,B first name,B date,B country
my_format,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B id type,1.0,0.836633,0.488713,0.38085,0.424966
B id code,0.836633,1.0,0.441107,0.340319,0.395716
B name,0.581152,0.512795,0.877452,0.541335,0.561669
B dob,0.458923,0.403151,0.529277,0.496291,0.572975
B cob,0.48141,0.412413,0.560436,0.505092,0.551474


In [1457]:
#Prepare training data
X = []
y = []

In [1458]:
for col1 in df1['my_format']:
  for col2 in df2['other_format']:
    X.append(similarity_df.loc[col1, col2])
    y.append(0 if similarity_df.loc[col1, col2] < 0.5 else 1)

In [1459]:
X = pd.DataFrame(X, columns=['similarity'])
X

Unnamed: 0,similarity
0,1.0
1,0.836633
2,0.488713
3,0.38085
4,0.424966
5,0.836633
6,1.0
7,0.441107
8,0.340319
9,0.395716


In [1460]:
y = pd.Series(y)
y

Unnamed: 0,0
0,1
1,1
2,0
3,0
4,0
5,1
6,1
7,0
8,0
9,0


In [1461]:
#Split data into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [1462]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [1463]:
X_test
#y_test

Unnamed: 0,similarity
8,0.340319
16,0.403151
0,1.0
23,0.505092
11,0.512795
9,0.395716
13,0.541335
1,0.836633
22,0.560436
5,0.836633


In [1464]:
predictions = xgb.predict(X_test)
predictions

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1])

In [1465]:
y_test #expected outputs

Unnamed: 0,0
8,0
16,0
0,1
23,1
11,1
9,0
13,1
1,1
22,1
5,1


In [1466]:
accuracy =(predictions == y_test).mean()
accuracy

0.8

In [1467]:
print (f'Accuracy : {accuracy *100}')

Accuracy : 80.0


#**KNN Classifier**

In [1468]:
from sklearn.neighbors import KNeighborsClassifier

In [1469]:
#knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn = KNeighborsClassifier(n_neighbors = 5, metric='euclidean')

In [1470]:
knn.fit(X_train, y_train)

In [1471]:
y_pred = knn.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1])

In [1472]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 80.0


#**Decision Tree**

In [1473]:
from sklearn.tree import DecisionTreeClassifier

In [1474]:
dt = DecisionTreeClassifier()
# Train Decision Tree Classiferifier()
dt.fit(X_train, y_train)

In [1475]:
y_pred = dt.predict(X_test)

In [1476]:
y_pred

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 1])

In [1477]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100)

Accuracy: 90.0


#**Random Forest**

In [1478]:
from sklearn.ensemble import RandomForestClassifier

In [1489]:
clf = RandomForestClassifier(n_estimators = 100)

# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

In [1490]:
y_pred=clf.predict(X_test)

In [1491]:
X_test

Unnamed: 0,similarity
8,0.340319
16,0.403151
0,1.0
23,0.505092
11,0.512795
9,0.395716
13,0.541335
1,0.836633
22,0.560436
5,0.836633


In [1492]:
y_pred

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 1])

In [1493]:
print("accuracy:", metrics.accuracy_score(y_test, y_pred)*100)

accuracy: 90.0
