# IMPORT LIBRARIES

In [66]:
from sklearn.datasets import load_breast_cancer
import numpy as np
import pandas as pd
from scipy.spatial import distance

# READ DATASET

In [3]:
# Load the breast-cancer dataset; as_frame=True requests pandas objects
# rather than bare NumPy arrays.
data = load_breast_cancer(as_frame=True)
# Bare expression: shows the loaded object as the cell's output.
data

{'data':      mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
 0          17.99         10.38          122.80     1001.0          0.11840   
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
      mean compactness  mean concavity  me

# create dataframe

In [19]:
# Feature table and label column as delivered by the loader.
features = data.data
target = data.target

# Column labels: every feature name, then 'class' for the label column.
feature_names = data.feature_names.tolist() + ['class']

# Append the labels as the last column and wrap everything in one DataFrame.
whole_data = np.column_stack((features, target))
df = pd.DataFrame(whole_data, columns=feature_names)

In [21]:
# Preview the first rows, transposed so that each feature becomes a row.
df.head().T

Unnamed: 0,0,1,2,3,4
mean radius,17.99,20.57,19.69,11.42,20.29
mean texture,10.38,17.77,21.25,20.38,14.34
mean perimeter,122.8,132.9,130.0,77.58,135.1
mean area,1001.0,1326.0,1203.0,386.1,1297.0
mean smoothness,0.1184,0.08474,0.1096,0.1425,0.1003
mean compactness,0.2776,0.07864,0.1599,0.2839,0.1328
mean concavity,0.3001,0.0869,0.1974,0.2414,0.198
mean concave points,0.1471,0.07017,0.1279,0.1052,0.1043
mean symmetry,0.2419,0.1812,0.2069,0.2597,0.1809
mean fractal dimension,0.07871,0.05667,0.05999,0.09744,0.05883


# select 3 random samples from each class

In [24]:
# Draw 3 random rows whose 'class' label is 0 (no random_state is passed, so
# the selection changes on every run).
# NOTE(review): sklearn documents this dataset's target_names as
# ['malignant', 'benign'], which would make class 0 malignant rather than
# benign — confirm whether this name and the class-1 variable are swapped.
benign_samples = df[df['class'] == 0].sample(3)
benign_samples.T

Unnamed: 0,4,441,400
mean radius,20.29,17.27,17.91
mean texture,14.34,25.42,21.02
mean perimeter,135.1,112.4,124.4
mean area,1297.0,928.8,994.0
mean smoothness,0.1003,0.08331,0.123
mean compactness,0.1328,0.1109,0.2576
mean concavity,0.198,0.1204,0.3189
mean concave points,0.1043,0.05736,0.1198
mean symmetry,0.1809,0.1467,0.2113
mean fractal dimension,0.05883,0.05407,0.07115


In [28]:
# Draw 3 random rows whose 'class' label is 1 (no random_state is passed, so
# the selection changes on every run).
# NOTE(review): sklearn documents this dataset's target_names as
# ['malignant', 'benign'], which would make class 1 benign rather than
# malignant — confirm whether this name and the class-0 variable are swapped.
malignant_samples = df[df['class'] == 1].sample(3)
malignant_samples.T

Unnamed: 0,20,46,364
mean radius,13.08,8.196,13.4
mean texture,15.71,16.84,16.95
mean perimeter,85.63,51.71,85.48
mean area,520.0,201.9,552.4
mean smoothness,0.1075,0.086,0.07937
mean compactness,0.127,0.05943,0.05696
mean concavity,0.04568,0.01588,0.02181
mean concave points,0.0311,0.005917,0.01473
mean symmetry,0.1967,0.1769,0.165
mean fractal dimension,0.06811,0.06503,0.05701


# calculate distances between sample pairs

In [32]:
# Convert both sample tables to NumPy arrays, dropping the last column
# (the 'class' label) so only feature values remain.
benign_samples_array = benign_samples.iloc[:, :-1].to_numpy()
malignant_samples_array = malignant_samples.iloc[:, :-1].to_numpy()

# Report the resulting shapes as a quick sanity check.
print(f"malignant_samples_array.shape: {malignant_samples_array.shape}")
print(f"benign_samples_array.shape: {benign_samples_array.shape}")

malignant_samples_array.shape: (3, 30)
benign_samples_array.shape: (3, 30)


# define functions

In [79]:
def euclidean(v, u):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        v: First vector; any array-like of numbers.
        u: Second vector; must be broadcast-compatible with ``v``.

    Returns:
        The norm of ``u - v`` as computed by ``np.linalg.norm``.
    """
    # Coerce first so plain lists and tuples work, not only ndarrays.
    return np.linalg.norm(np.asarray(u) - np.asarray(v))

def manhattan(u, v):
    """Return the Manhattan (L1) distance: the sum of absolute differences."""
    gaps = np.abs(u - v)
    return gaps.sum()

def chebyshev(u, v):
    """Return the Chebyshev distance: the largest absolute difference."""
    gaps = np.abs(u - v)
    return gaps.max()

def cosine(u, v):
    """Return the cosine distance, i.e. one minus the cosine similarity."""
    inner = np.dot(u, v)
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    similarity = inner / scale
    return 1 - similarity

def mahalanobis(u, v, S):
    """Return the Mahalanobis distance between ``u`` and ``v``.

    Args:
        u: First vector.
        v: Second vector.
        S: Covariance matrix (not its inverse); it is inverted here.

    Returns:
        The distance computed by ``scipy.spatial.distance.mahalanobis``
        using the inverse of ``S``.

    Raises:
        numpy.linalg.LinAlgError: If ``S`` is singular and cannot be inverted.
    """
    # SciPy expects the inverse covariance matrix, so invert before delegating.
    S_inv = np.linalg.inv(S)
    return distance.mahalanobis(u, v, S_inv)

In [45]:
# Broadcasting pairs every benign row with every malignant row; the norm over
# the last (feature) axis collapses each pair to a single distance.
euclidean_distances1 = np.linalg.norm(
    benign_samples_array[:, np.newaxis, :] - malignant_samples_array[np.newaxis, :, :],
    axis=-1,
)
euclidean_distances1

array([[1227.97315837, 1731.87415221, 1182.32649939],
       [ 773.64704688, 1275.1910549 ,  728.90325522],
       [ 826.74859225, 1330.43319321,  781.53701372]])

In [47]:
# Pairwise Euclidean distances via the scalar helper: row i is the i-th benign
# sample, column j the j-th malignant sample.  The matrix is sized from the
# inputs so it stays correct if the number of samples drawn changes.
euclidean_distances2 = np.zeros(
    (len(benign_samples_array), len(malignant_samples_array))
)
for i, u in enumerate(benign_samples_array):
    for j, v in enumerate(malignant_samples_array):
        euclidean_distances2[i, j] = euclidean(u, v)

euclidean_distances2

array([[1227.97315837, 1731.87415221, 1182.32649939],
       [ 773.64704688, 1275.1910549 ,  728.90325522],
       [ 826.74859225, 1330.43319321,  781.53701372]])

In [42]:
# Broadcasting pairs every benign row with every malignant row; summing the
# absolute differences over the last (feature) axis gives the L1 distance.
manhattan_distances1 = np.abs(
    benign_samples_array[:, np.newaxis, :] - malignant_samples_array[np.newaxis, :, :]
).sum(axis=-1)
manhattan_distances1

array([[1932.749473, 2732.120841, 1873.800973],
       [1208.308506, 2002.214072, 1144.507226],
       [1294.186196, 2088.599202, 1230.644416]])

In [43]:
# Pairwise Manhattan distances via the scalar helper: row i is the i-th benign
# sample, column j the j-th malignant sample.  The matrix is sized from the
# inputs so it stays correct if the number of samples drawn changes.
manhattan_distances2 = np.zeros(
    (len(benign_samples_array), len(malignant_samples_array))
)
for i, u in enumerate(benign_samples_array):
    for j, v in enumerate(malignant_samples_array):
        manhattan_distances2[i, j] = manhattan(u, v)

manhattan_distances2

array([[1932.749473, 2732.120841, 1873.800973],
       [1208.308506, 2002.214072, 1144.507226],
       [1294.186196, 2088.599202, 1230.644416]])

In [49]:
# Chebyshev distance is the largest *absolute* per-feature difference.  The
# np.abs is essential: without it a pair whose biggest gap is negative
# (benign value below malignant value) would report the wrong distance.
chebyshev_distances1 = np.max(
    np.abs(benign_samples_array[:, None, :] - malignant_samples_array[None, :, :]),
    axis=2,
)
chebyshev_distances1

array([[ 944.5, 1332.8,  911.5],
       [ 653.5, 1041.8,  620.5],
       [ 673.5, 1061.8,  640.5]])

In [50]:
# Pairwise Chebyshev distances via the scalar helper: row i is the i-th benign
# sample, column j the j-th malignant sample.  The matrix is sized from the
# inputs so it stays correct if the number of samples drawn changes.
chebyshev_distances2 = np.zeros(
    (len(benign_samples_array), len(malignant_samples_array))
)
for i, u in enumerate(benign_samples_array):
    for j, v in enumerate(malignant_samples_array):
        chebyshev_distances2[i, j] = chebyshev(u, v)

chebyshev_distances2

array([[ 944.5, 1332.8,  911.5],
       [ 653.5, 1041.8,  620.5],
       [ 673.5, 1061.8,  640.5]])

In [55]:
# Scale every sample (row) to unit L2 length; keepdims keeps the norms as a
# column so the division broadcasts row by row.
benign_samples_array_norm = benign_samples_array / np.linalg.norm(
    benign_samples_array, axis=1, keepdims=True
)
malignant_samples_array_norm = malignant_samples_array / np.linalg.norm(
    malignant_samples_array, axis=1, keepdims=True
)

# For unit vectors the dot product is the cosine similarity, so one matrix
# product yields every benign/malignant pair at once.
cosine_sim = benign_samples_array_norm @ malignant_samples_array_norm.T

# Distance is the complement of similarity.
cosine_distances1 = 1 - cosine_sim
cosine_distances1

array([[0.00225834, 0.01305153, 0.00182084],
       [0.00327944, 0.01268518, 0.00320738],
       [0.0015577 , 0.0105701 , 0.00143334]])

In [51]:
# Pairwise cosine distances via the scalar helper: row i is the i-th benign
# sample, column j the j-th malignant sample.  The matrix is sized from the
# inputs so it stays correct if the number of samples drawn changes.
cosine_distances2 = np.zeros(
    (len(benign_samples_array), len(malignant_samples_array))
)
for i, u in enumerate(benign_samples_array):
    for j, v in enumerate(malignant_samples_array):
        cosine_distances2[i, j] = cosine(u, v)

cosine_distances2

array([[0.00225834, 0.01305153, 0.00182084],
       [0.00327944, 0.01268518, 0.00320738],
       [0.0015577 , 0.0105701 , 0.00143334]])

In [80]:
# Stack the selected samples under their own name only.  `data` must not be
# rebound here: it holds the loaded dataset that earlier cells read from.
new_data = np.vstack([benign_samples_array, malignant_samples_array])

# Covariance is estimated from every row of df (label column excluded);
# rowvar=False treats columns as the variables.
cov = np.cov(df.values[:, :-1], rowvar=False)
# Invert once up front: the inverse is the same for every pair.
cov_inv = np.linalg.inv(cov)

# Row i is the i-th benign sample, column j the j-th malignant sample; the
# matrix is sized from the inputs rather than a fixed 3x3.
mahalanobis_distances2 = np.zeros(
    (len(benign_samples_array), len(malignant_samples_array))
)
for i, u in enumerate(benign_samples_array):
    for j, v in enumerate(malignant_samples_array):
        mahalanobis_distances2[i, j] = distance.mahalanobis(u, v, cov_inv)

mahalanobis_distances2

array([[7.0446778 , 7.01233209, 5.83360965],
       [7.28855813, 6.7284842 , 5.33828531],
       [9.22825148, 9.78861805, 9.14323833]])

In [81]:
# Labels for the three samples drawn from each class.
names = ["1", "2", "3"]
# Every ordered (first, second) combination, first label varying slowest.
pair_names = ["-".join((first, second)) for first in names for second in names]
pair_names

['1-1', '1-2', '1-3', '2-1', '2-2', '2-3', '3-1', '3-2', '3-3']

In [100]:
# Empty results table: one label column plus one column per distance metric.
distances_df = pd.DataFrame(
    columns=[
        "pairs",
        "Euclidean",
        "Manhattan",
        "Chebyshev",
        "Cosine",
        "Mahalanobis",
    ]
)
distances_df

Unnamed: 0,pairs,Euclidean,Manhattan,Chebyshev,Cosine,Mahalanobis


In [104]:
# Build the summary table in one step, one row per benign-malignant pair.
# Constructing it fresh (instead of assigning columns through attribute
# access on an existing frame) keeps this cell safe to re-run: attribute
# assignment only works while the column already exists, and "pairs" stops
# being a column once it has been moved into the index.
# flatten() walks each matrix row by row, matching the order of pair_names.
distances_df = pd.DataFrame(
    {
        "pairs": pair_names,
        "Euclidean": euclidean_distances2.flatten(),
        "Manhattan": manhattan_distances2.flatten(),
        "Chebyshev": chebyshev_distances2.flatten(),
        "Cosine": cosine_distances2.flatten(),
        "Mahalanobis": mahalanobis_distances2.flatten(),
    }
).set_index("pairs")
distances_df

Unnamed: 0_level_0,Euclidean,Manhattan,Chebyshev,Cosine,Mahalanobis
pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1-1,1227.973158,1932.749473,944.5,0.002258,7.044678
1-2,1731.874152,2732.120841,1332.8,0.013052,7.012332
1-3,1182.326499,1873.800973,911.5,0.001821,5.83361
2-1,773.647047,1208.308506,653.5,0.003279,7.288558
2-2,1275.191055,2002.214072,1041.8,0.012685,6.728484
2-3,728.903255,1144.507226,620.5,0.003207,5.338285
3-1,826.748592,1294.186196,673.5,0.001558,9.228251
3-2,1330.433193,2088.599202,1061.8,0.01057,9.788618
3-3,781.537014,1230.644416,640.5,0.001433,9.143238


1. فاصله اقلیدسی فرض میکند همه ویژگیها یک مقیاس و معیار دارند، و تاثیر ویژگیهایی با مقیاس بزرگتر در آن بیشتر است. از طرفی اوتلایرها اثر بزرگی روی آن دارند، مگر اینکه داده ها را نرمالایز کنیم و این اثرات را از بین ببریم.
2. موارد بالا برای فاصله منهتن نیز صدق میکنند، منتها اثر اوتلایر در این فاصله کمتر است.
3. فاصله چبیشف بیشترین فاصله از یک ویژگی را در نظر میگیرد و مابقی را کنار میگذارد. به نوعی فکر میکند همه ویژگیها مثل هم هستند و از میان این ویژگیهای شبیه بزرگترین را برمیدارد.
4. فاصله کوسینوسی تنها زاویه را در نظر میگیرد و اندازه را کنار میگذارد. این یک نقطه ضعف است، چون ممکن است اندازه در تعیین سالم یا بدخیم بودن سلول خیلی تاثیرگذار باشد.
5. فاصله ماهالانوبیس دو عمل مهم را انجام میدهد. یکی اینکه مقیاسها را یکی میکند و چالش تفاوت معیار در فاصله های فوق را حل میکند، و دیگر اینکه ستونهایی که با هم همبسته هستند را پیدا کرده و اثرشان را کم میکند. مثلا به ستونهای محیط، مساحت و شعاع وزن کمتری داده و اهمیت شعاع را برخلاف معیارهای قبلی سه بار تکرار نمیکند.

فاصله هر نمونه جدید را از مجموعه نمونه های دارای برچسب محاسبه کرده و از مثلا 3 نمونه با فاصله نزدیکتر رای گیری میکنیم و رای بیشتر به معنی تعلق نمونه به آن دسته خواهد بود. برای این مورد از فاصله ماهالانوبیس استفاده میکنیم که هم اثر مقیاس اندازه را کم کند و هم اثر ستونهای همبسته را.

فاصله اقلیدسی حساس به اوتلایر است. یعنی کنار اهمیت زیاد دادن به مقیاسها، چنانچه یک فیچر مقدار نادرستی داشته باشد وزن بیشتری به آن داده و اثر بیشتری روی فاصله میگذارد.

فاصله کوسینوسی اندازه را در نظر نمیگیرد و فقط زاویه را در محاسبات خود دخالت میدهد. مثلا دو بردار را در نظر بگیریم یکی با اندازه 2 و دیگری با 1. چنانچه بردار جدید زاویه برابر با هر دو تشکیل دهد، به هر دو اهمیت یکسان خواهد داد. در جدول بالا با وجود افزایش فاصله مابقی معیارها، فاصله کوسینوسی کاهش مییابد.