In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings

from Evaluator import Evaluator

# Silence only FutureWarning (the category pandas uses to announce upcoming API
# changes). A blanket 'ignore' would also hide warnings that point at real
# problems in later cells.
warnings.filterwarnings('ignore', category=FutureWarning)
# Final metrics of every evaluated model, keyed by a short model name; the
# evaluation cells below store their results here.
evaluation_results = {}

# Load data

In [2]:
# Plain relative path: the previous '.\\task_data.csv' only resolves on Windows,
# while this form resolves to the same file on Windows, Linux and macOS.
data = pd.read_csv('task_data.csv')
# Number of records in the data set.
print(len(data))
# Last expression of the cell, so the first rows are rendered as a table.
data.head()

37


Unnamed: 0,ID,Cardiomegaly,Heart width,Lung width,CTR - Cardiothoracic Ratio,xx,yy,xy,normalized_diff,Inscribed circle radius,Polygon Area Ratio,Heart perimeter,Heart area,Lung area
0,1,0,172,405,424691358,1682.360871,3153.67188,-638.531109,-0.304239,688186,0.213446,6794873689,24898,75419
1,2,1,159,391,4066496164,1526.66096,5102.159054,-889.678405,-0.539387,7392564,0.203652,7886589419,29851,94494
2,5,0,208,400,52,2465.903392,5376.834707,-1755.344699,-0.371163,6933974,0.320787,8623229369,33653,66666
3,7,1,226,435,5195402299,2509.063593,6129.82127,-1025.079806,-0.419123,8414868,0.317545,906724959,42018,82596
4,8,1,211,420,5023809524,2368.770135,5441.767075,-1493.040062,-0.393442,7378347,0.263542,8642396777,35346,85631


### Preprocessing
Since "xx", "yy", "xy" and "normalized_diff" describe photos that are not provided, and "ID" carries no predictive information, we drop these columns: they are of no use for the given task.

In [3]:
# Fixed seed for the train/test split so that "Restart & Run All" reproduces
# the same partition (and therefore the same evaluation numbers).
RANDOM_STATE = 42

# Drop columns that are useless for this task: the image-derived values
# (xx, yy, xy, normalized_diff) and the patient ID.
data = data.drop(columns=['xx', 'yy', 'xy', 'ID', 'normalized_diff'])

# Values that use ',' as the decimal separator are read in as strings.
# Convert them column by column: replace ',' with '.' in every string value,
# leave non-string values untouched, then let pandas parse the column as numeric.
for column in data.select_dtypes(exclude='number').columns:
    data[column] = pd.to_numeric(
        data[column].map(
            lambda value: value.replace(',', '.') if isinstance(value, str) else value
        )
    )

# Separate the target from the features before splitting, so no in-place
# edits are needed on the frames returned by train_test_split.
features = data.drop(columns='Cardiomegaly')
labels = data['Cardiomegaly']

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=RANDOM_STATE
)

# The models below are fed plain numpy arrays.
X_train = np.asarray(X_train, dtype=np.float32)
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=int)
y_train = np.asarray(y_train, dtype=int)

# The frames are no longer needed once the arrays exist.
del data, features, labels

# K-nearest neighbours
Since a record can be described as a vector (in our case 8-dimensional: v = [*Heart width*, *Lung width*, *CTR - Cardiothoracic Ratio*, *Inscribed circle radius*, *Polygon Area Ratio*, *Heart perimeter*, *Heart area*, *Lung area*]), we can think of a record as a point in n-dimensional space. For a new point 'A' we can calculate its distances to the other points and then choose the k nearest of them. If most of the points close to A represent sick patients, then the patient represented by A is most likely sick as well.

In [4]:
# KNN_classifier comes from the project-local ML module. It is created with its
# default settings; what those are (e.g. the value of k) is not visible here.
from ML import KNN_classifier
knn = KNN_classifier()
# Fit on the training arrays produced by the preprocessing cell.
knn.fit(X_train, y_train)

### Evaluation of the model

In [5]:
# Score the KNN model one held-out sample at a time. After every sample the
# evaluator is asked to print, which produces one table row per sample.
evaluator = Evaluator()
for sample, true_label in zip(X_test, y_test):
    evaluator.judge(knn.predict(sample), true_label)
    evaluator.print()
# Keep what print_final() returns so the models can be compared later.
evaluation_results['knn'] = evaluator.print_final()
    
    
   

Evaluation metrics:
 num  | Accuracy | Precision | Recall | F1_Score |
  1   |   1.00   |   1.00    |  1.00  |   1.00   |
  2   |   0.50   |   0.50    |  1.00  |   0.67   |
  3   |   0.67   |   0.67    |  1.00  |   0.80   |
  4   |   0.50   |   0.50    |  1.00  |   0.67   |
  5   |   0.60   |   0.60    |  1.00  |   0.75   |
  6   |   0.67   |   0.67    |  1.00  |   0.80   |
  7   |   0.71   |   0.71    |  1.00  |   0.83   |
  8   |   0.62   |   0.62    |  1.00  |   0.77   |
  9   |   0.67   |   0.67    |  1.00  |   0.80   |
  10  |   0.60   |   0.60    |  1.00  |   0.75   |
  11  |   0.64   |   0.64    |  1.00  |   0.78   |
  12  |   0.67   |   0.67    |  1.00  |   0.80   |

Final: 
accuracy=0.67 | precision=0.67 | recall=1.00 | f1_score=0.80


### Conclusion
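The model does not perform badly, but the dataset is not big enough to provide accurate predictions. Note that in the run shown above recall stays at 1.00 while precision equals accuracy at every step, which means the classifier labelled every test sample as positive; the scores therefore mostly reflect the share of positive cases in the test set.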

In [6]:
# Remove the KNN model and its evaluator from the namespace before the next model.
del knn, evaluator

# Decision Tree
We can split the data into two subsets based on a specific feature. If we keep splitting again and again, we'll get many subsets, each containing some data points. If, under the imposed conditions, a new sample falls into one of these subsets, its label is most likely the same as the most common label in that subset.

In [7]:
# DecisionTree comes from the project-local ML module. It is created with its
# default settings; what those are is not visible here.
from ML import DecisionTree
dt = DecisionTree()
# Fit on the same training arrays that were used for the KNN model.
dt.fit(X_train, y_train)

### Evaluation of the model

In [8]:
# Score the decision tree one held-out sample at a time. After every sample the
# evaluator is asked to print, which produces one table row per sample.
evaluator = Evaluator()
for sample, true_label in zip(X_test, y_test):
    evaluator.judge(dt.predict(sample), true_label)
    evaluator.print()
# Keep what print_final() returns so the models can be compared later.
evaluation_results['dt'] = evaluator.print_final()

Evaluation metrics:
 num  | Accuracy | Precision | Recall | F1_Score |
  1   |   1.00   |   1.00    |  1.00  |   1.00   |
  2   |   1.00   |   1.00    |  1.00  |   1.00   |
  3   |   1.00   |   1.00    |  1.00  |   1.00   |
  4   |   1.00   |   1.00    |  1.00  |   1.00   |
  5   |   0.80   |   1.00    |  0.67  |   0.80   |
  6   |   0.83   |   1.00    |  0.75  |   0.86   |
  7   |   0.86   |   1.00    |  0.80  |   0.89   |
  8   |   0.75   |   0.80    |  0.80  |   0.80   |
  9   |   0.67   |   0.80    |  0.67  |   0.73   |
  10  |   0.70   |   0.80    |  0.67  |   0.73   |
  11  |   0.73   |   0.83    |  0.71  |   0.77   |
  12  |   0.75   |   0.86    |  0.75  |   0.80   |

Final: 
accuracy=0.75 | precision=0.86 | recall=0.75 | f1_score=0.80
