In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings

warnings.filterwarnings('ignore') # ignore pandas future warnings

# Load data

In [7]:
data = pd.read_csv('..\\task_data.csv')
print(len(data))
data.head()

37


Unnamed: 0,ID,Cardiomegaly,Heart width,Lung width,CTR - Cardiothoracic Ratio,xx,yy,xy,normalized_diff,Inscribed circle radius,Polygon Area Ratio,Heart perimeter,Heart area,Lung area
0,1,0,172,405,424691358,1682.360871,3153.67188,-638.531109,-0.304239,688186,0.213446,6794873689,24898,75419
1,2,1,159,391,4066496164,1526.66096,5102.159054,-889.678405,-0.539387,7392564,0.203652,7886589419,29851,94494
2,5,0,208,400,52,2465.903392,5376.834707,-1755.344699,-0.371163,6933974,0.320787,8623229369,33653,66666
3,7,1,226,435,5195402299,2509.063593,6129.82127,-1025.079806,-0.419123,8414868,0.317545,906724959,42018,82596
4,8,1,211,420,5023809524,2368.770135,5441.767075,-1493.040062,-0.393442,7378347,0.263542,8642396777,35346,85631


### Preprocessing
Since "xx", "yy", "xy", "normalized_diff" apply to photos which are not provided, and "ID" is not relevant we will be dropping these columns as they are useless in the given task

In [8]:
# drop useless data
data.drop('xx', axis=1, inplace=True)
data.drop('yy', axis=1, inplace=True)
data.drop('xy', axis=1, inplace=True)
data.drop('ID', axis=1, inplace=True)
data.drop('normalized_diff', axis=1, inplace=True)

# replace ',' with '.' and convert everything to floats 
for index, row in data.iterrows():
    for idx, j in enumerate(row):
        if isinstance(j, str):
            j = float(j.replace(',', '.'))
        j = np.float64(j) 
        row.iloc[idx] = j
    data.iloc[index] = row   
data.head()
data, test_data = train_test_split(data, test_size=0.3)
data.head()

Unnamed: 0,Cardiomegaly,Heart width,Lung width,CTR - Cardiothoracic Ratio,Inscribed circle radius,Polygon Area Ratio,Heart perimeter,Heart area,Lung area
4,1,211,420,0.502381,73.78347,0.263542,864.239678,35346,85631
20,1,201,396,0.507576,72.33948,0.264131,721.327991,27874,66066
5,1,222,405,0.548148,83.86298,0.328101,1001.068103,46381,92755
21,1,213,406,0.524631,81.21576,0.346032,863.126978,37038,61689
32,0,239,447,0.534676,81.49233,0.31521,890.1808,39986,86438


# K-nearest neighbours
Since a record can be described as a vector (in our case 8 dimentional: v = [*Heart width*, *Lung width*, *CTR - Cardiothoracic Ratio*, *Inscribed circle radius*,	*Polygon Area Ratio*, *Heart perimeter*, *Heart area*, *Lung area*]) we can think of a record as a point in n-dimentional space. For a new given point 'A' we can calculate distances to other points  and then choose k nearest of them. If most of the points close to A represent sick patient then the patient represented by A is most likely sick as well.  

In [9]:
class KNN_classifier:
    def __init__(self, data: pd.DataFrame):
        self.data = data
    
    def knn(self, record: np.ndarray|pd.Series, k: int = 5):
        """Calculate distances of the points in dataFrame to the given record (point)\n
        Since sqrt(x) is increasing we can just calculate sqared distance and it won't matter in sorting\n
        We skip 'Cardiomegaly' column in calculating distance and add it as a label next to distance
        """
        # calculate all distances and add labels
        distances = []
        for _, row in self.data.iterrows():
            distances.append( (np.sum((row[1:] - record)**2), row[0]) )
        # Sort distances
        distances.sort(key=lambda x:x[0])
        # return the average of values of 'Cardiomegaly'
        # float value of the average represents certainty of patient being ill/healthy 
        return np.average(distances[:k], axis=0)[1]


### Evaluation of the model

In [13]:
knn = KNN_classifier(data.iloc[:30])
# eval uation metrics
TP, TN, FP, FN = 0,0,0,0
Accuracy=0
Precision = 0
Recall = 0
F1_Score = 0
print('Evaluation metrics:')
for idx, (_, row) in enumerate(test_data.iterrows()):
    prediction = round(knn.knn(row[1:], k=4)) # row of test_data include 'Cardiomegaly' column so we don't pass it to knn
    ground_truth = row[0]
    
    if prediction == 0 and ground_truth == 0:
        TN += 1
    elif prediction == 1 and ground_truth == 1:
        TP += 1
    elif prediction == 0 and ground_truth == 1:
        FN += 1
    else:
        FP += 1
    
    try:
        Accuracy=(TN+TP)/(TN+TP+FP+FN)
    except ZeroDivisionError:
        pass
    try:
        Precision = TP/(TP+FP)
    except ZeroDivisionError:
        pass
    try:
        Recall = TP/(TP+FN)
    except ZeroDivisionError:
        pass
    try:
        F1_Score = 2*Recall*Precision/(Recall+Precision)
    except:
        pass
                        
    print(f'{idx:>5} | {Accuracy=:.2f} | {Precision=:.2f} | {Recall=:.2f} | {F1_Score=:.2f}')
print(f'\nFinal: \n{Accuracy=:.2f} | {Precision=:.2f} | {Recall=:.2f} | {F1_Score=:.2f}')


Evaluation metrics:
    0 | Accuracy=0.00 | Precision=0.00 | Recall=0.00 | F1_Score=0.00
    1 | Accuracy=0.00 | Precision=0.00 | Recall=0.00 | F1_Score=0.00
    2 | Accuracy=0.33 | Precision=0.00 | Recall=0.00 | F1_Score=0.00
    3 | Accuracy=0.50 | Precision=1.00 | Recall=0.33 | F1_Score=0.50
    4 | Accuracy=0.60 | Precision=1.00 | Recall=0.50 | F1_Score=0.67
    5 | Accuracy=0.67 | Precision=1.00 | Recall=0.60 | F1_Score=0.75
    6 | Accuracy=0.71 | Precision=1.00 | Recall=0.67 | F1_Score=0.80
    7 | Accuracy=0.75 | Precision=1.00 | Recall=0.71 | F1_Score=0.83
    8 | Accuracy=0.78 | Precision=1.00 | Recall=0.75 | F1_Score=0.86
    9 | Accuracy=0.80 | Precision=1.00 | Recall=0.78 | F1_Score=0.88
   10 | Accuracy=0.82 | Precision=1.00 | Recall=0.80 | F1_Score=0.89
   11 | Accuracy=0.75 | Precision=0.89 | Recall=0.80 | F1_Score=0.84

Final: 
Accuracy=0.75 | Precision=0.89 | Recall=0.80 | F1_Score=0.84


### Conclusion
The model works well but dataset is not big enough to provide accurate predictions.