In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs 

In [2]:
#let's create dataset from male_blobs sklearn
def make_forge():
    # a carefully hand-designed dataset lol
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    y[np.array([7, 27])] = 0
    mask = np.ones(len(X), dtype=np.bool)
    mask[np.array([0, 1, 5, 26])] = 0
    X, y = X[mask], y[mask]
    return X, y
X, y = make_forge()

In [20]:
df = pd.DataFrame(data = X,  columns=["Feature1", "Feature2"])
df["Class"] = y
df

Unnamed: 0,Feature1,Feature2,Class
0,9.963466,4.596765,1
1,11.032954,-0.168167,0
2,11.541558,5.211161,1
3,8.69289,1.54322,0
4,8.106227,4.28696,0
5,8.309889,4.80624,1
6,11.930271,4.648663,1
7,9.672847,-0.202832,0
8,8.348103,5.134156,1
9,8.674947,4.475731,1


In [4]:
indices=np.arange(0, len(df))
np.random.shuffle(indices)
print(indices)

[ 2  9 17 14  3 22 23 24 10 19  1 20 16  8 15 21 12 25 18 11 13  7  0  6
  5  4]


In [5]:
test_indices=indices[:round(0.25*len(df))]
train_indices=indices[round(0.25*len(df)):]

In [6]:
test_indices

array([ 2,  9, 17, 14,  3, 22])

In [7]:
train_indices

array([23, 24, 10, 19,  1, 20, 16,  8, 15, 21, 12, 25, 18, 11, 13,  7,  0,
        6,  5,  4])

In [8]:
test_dataset=df.iloc[test_indices,:2]
train_dataset=df.iloc[train_indices,:]

In [9]:
test_dataset

Unnamed: 0,Feature1,Feature2
2,11.541558,5.211161
9,8.674947,4.475731
17,8.183781,1.295642
14,9.491235,4.332248
3,8.69289,1.54322
22,8.344688,1.638243


In [10]:
train_dataset

Unnamed: 0,Feature1,Feature2,Class
23,9.501693,1.938246,0
24,9.150723,5.498322,1
10,9.177484,5.092832,1
19,9.322983,5.098406,1
1,11.032954,-0.168167,0
20,10.063938,0.990781,0
16,7.998153,4.852505,1
8,8.348103,5.134156,1
15,9.256942,5.132849,1
21,9.50049,-0.264303,0


In [11]:
def eucladian_dist(first, second):
    dist=((first[0]-second[0])**2 + (first[1]-second[1])**2)**0.5
    return dist

In [12]:
dist_df=pd.DataFrame(columns=test_indices, index=train_indices)
for i in train_indices:
    for j in test_indices:
        dist_df.loc[i][j]=eucladian_dist(train_dataset.loc[i][:2], test_dataset.loc[j])
dist_df

Unnamed: 0,2,9,17,14,3,22
23,3.85656,2.66877,1.46623,2.39402,0.900116,1.19527
24,2.40802,1.12786,4.31248,1.21477,3.98151,3.94334
10,2.36703,0.795837,3.92506,0.822756,3.58254,3.55355
19,2.22144,0.898707,3.96974,0.784416,3.61059,3.5958
1,5.40332,5.20826,3.20321,4.75717,2.89909,3.23881
20,4.47157,3.75156,1.90471,3.39019,1.47816,1.83713
16,3.56151,0.774603,3.5617,1.58113,3.38142,3.23289
8,3.19438,0.735086,3.84203,1.39635,3.60745,3.49591
15,2.28596,0.877794,3.98445,0.834179,3.63367,3.61171
21,5.84351,4.81139,2.04136,4.59656,1.97974,2.22611


In [13]:
col=[dist_df[i].sort_values().head(3).index.tolist() for i in dist_df.columns]
col
  

[[6, 0, 19], [5, 4, 8], [12, 18, 23], [0, 19, 10], [12, 23, 18], [12, 18, 23]]

In [14]:
clas=[[train_dataset.loc[col[i][j]]['Class'].astype(int) for j in range(len(col[0]))] for i in range(len(col))]
clas

[[1, 1, 1], [1, 0, 1], [0, 0, 0], [1, 1, 1], [0, 0, 0], [0, 0, 0]]

In [15]:
clas_df=pd.DataFrame(clas, index=test_indices)
clas_df

Unnamed: 0,0,1,2
2,1,1,1
9,1,0,1
17,0,0,0
14,1,1,1
3,0,0,0
22,0,0,0


In [16]:
def res(df):
    res_list=[]
    for i in range(len(df)):
        x=round(df.iloc[i].sum()/len(df.iloc[i])*100, 2)
        if x>50:
            res_list.append(1)
        '''
        elif x==50:
            
        '''
        else:
            res_list.append(0)
    x=pd.Series(res_list, index=df.index)
    return x

In [17]:
test_dataset['pred']=res(clas_df)
test_dataset

Unnamed: 0,Feature1,Feature2,pred
2,11.541558,5.211161,1
9,8.674947,4.475731,1
17,8.183781,1.295642,0
14,9.491235,4.332248,1
3,8.69289,1.54322,0
22,8.344688,1.638243,0


In [18]:
result=(df.loc[test_indices]['Class']==test_dataset['pred']).sum()/len(test_dataset)*100
print(f'Similarity is {result:.2f} %')

Similarity is 100.00 %


In [19]:
df.loc[test_indices]['Class']

2     1
9     1
17    0
14    1
3     0
22    0
Name: Class, dtype: int32