In [None]:
#Importing libraries 
import pandas as pd
import numpy as np
import pydicom 

import os
import random

#Visualisation 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and chained-assignment warnings raised by the libraries used below.
warnings.filterwarnings("ignore");
# IPython magic: render matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [None]:
INPUT_PATH = '../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/'

In [None]:
os.listdir(INPUT_PATH)

# **Exploratory analysis**

Let us try exploring this dataset, to get some insights and understand our data better

In [None]:
sub = pd.read_csv(INPUT_PATH+"stage_2_sample_submission.csv")
sub.head(10)

So the IDs have the form `<image ID>_<subtype>`, which means we have to make a prediction for each subtype under every image ID. 
`any` --> indicates that at least one subtype is present, telling us whether the patient has IH or not
`Label` --> indicates the probability of presence

In [None]:
train_df = pd.read_csv(INPUT_PATH+"stage_2_train.csv")
# train_df.head(10)
labels =  train_df.Label.values

The training dataset is provided as a set of image `Id`s with **multiple labels**, one for each subtype of hemorrhage, along with an additional label for `any` (which is true if any of the subtype labels is true). So this is a **multilabel classification task**.

Lets split the ID into columns of images and the corresponding diagnosis(subtype)

In [None]:
# Split each ID at its last underscore into two columns (0 and 1).
# NOTE: this rebinds `train_df`, so the cell cannot be re-run without first
# re-running the cell that reads the CSV (the `ID` column no longer exists afterwards).
train_df = train_df.ID.str.rsplit("_",n=1,expand=True)
# `labels` was taken from the same frame before the split, so row order still matches.
train_df.loc[:, "label"] = labels
train_df.head()

In [None]:
train_df = train_df.rename({0 : "image",1 : "subtype"}, axis=1)

In [None]:
train_df.label.unique()

## Look into target distribution

In [None]:
# Count the label values within each subtype; `unstack` turns the label values
# into columns, giving one row per subtype and one column per label value.
subtype_count = train_df.groupby("subtype").label.value_counts().unstack()
subtype_count

In [None]:
# Percentage of rows labelled 1 within each subtype:
# (count of label == 1) / (number of rows of that subtype) * 100
subtype_count_per =  subtype_count.loc[:,1]/train_df.groupby("subtype").size() *100

# Sum of all labels per image (this includes the `any` row of each image).
# NOTE(review): not referenced again in the visible part of the notebook.
multi_target_count = train_df.groupby("image").label.sum()

In [None]:
#Helper function
def random_colors(num_colors : int):
    """Return `num_colors` random colours as '#RRGGBB' hex strings.

    Each colour is built from six hex digits drawn with `random.choice`, so the
    result depends on the state of the global `random` generator.
    """
    hex_digits = '0123456789ABCDEF'
    return [
        '#' + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(num_colors)
    ]

Lets ask questions and try finding answers to it through visualisations :
1. How many positive and negative targets do we see in the training dataset?
2. What is the target distribution across each of the labels ?
3. Is the dataset imbalanced ?

In [None]:
# Three stacked panels: overall label counts, label counts per subtype,
# and the percentage of positive labels per subtype.
fig, ax = plt.subplots(3,1, figsize=(30,50))

# Data vectors are passed by keyword (x=/y=): newer seaborn releases no longer
# accept them positionally.
sns.countplot(x=train_df.label, ax=ax[0], palette=random_colors(2))
ax[0].set_xlabel("Target", fontsize=40)
ax[0].tick_params(axis='x', labelsize=25 )
ax[0].tick_params(axis='y', labelsize=25 )
ax[0].set_title("Number of positive and negative targets",fontsize=40)


sns.countplot(x="subtype", hue="label", data=train_df, ax=ax[1], palette=random_colors(6))
# The x axis of this panel shows the subtype categories, so label it as such.
ax[1].set_xlabel("Subtype",fontsize=40)
ax[1].set_ylabel("Frequency",fontsize=40)
ax[1].tick_params(axis='x', labelsize=25 )
ax[1].tick_params(axis='y', labelsize=25 )
ax[1].set_title("Target distrubution",fontsize=40)

sns.barplot(x=subtype_count_per.index, y=subtype_count_per.values, ax=ax[2], palette=random_colors(6))
# Rotate the tick labels on this axes explicitly rather than relying on
# pyplot's implicit "current axes".
ax[2].tick_params(axis='x', labelrotation=45)
ax[2].set_ylabel("% of positive(1) occurences",fontsize=40)
ax[2].tick_params(axis='x', labelsize=25 )
ax[2].tick_params(axis='y', labelsize=25 )
ax[2].set_title("Imbalance in target distrubution",fontsize=40)

So what do we make of it ?

- Less no of positive target values
- Epidural type has very few positive occurences(<1%)
- Highly Imbalanced

### Basic Checks before exploring images..

Compare the count of images listed in the training dataset with the training files provided, to check that
everything is fine before proceeding..

In [None]:
#Count of images in training dataset
train_df.image.nunique()

In [None]:
train_files = os.listdir(INPUT_PATH+"stage_2_train")

In [None]:
#Actual no. of image files 
len(train_files)

Ok. So no issues in training set.

In [None]:
test_files = os.listdir(INPUT_PATH+"stage_2_test")
len(test_files)

In [None]:
len(train_files)/len(test_files)

So there are 6.2 times more images in the train dataset than in the test dataset

## Lets look into DICOM files

### **What is a DICOM ?**
Dicom is a format that has metadata, as well as Pixeldata attached to it. Below I extract some basic info with an image. You will know about the gender and age of the patient, as well as info how the image is sampled and generated. 

So lets look at some samples from our dataset

In [None]:
train_files[:5]

Below are some of the slices from CT scans that are stored as pixel data in DICOM files

In [None]:
# Show the first rows*column training slices in a grid.
# Subplot positions are 1-based while the file list is 0-based, hence
# enumerate(..., start=1): the first file is no longer skipped, and a
# directory with fewer files than grid cells no longer raises IndexError.
fig = plt.figure(figsize=(40,20))
column= 5; rows = 4
for i, fname in enumerate(train_files[:column*rows], start=1):
    dcm = pydicom.dcmread(INPUT_PATH+"stage_2_train/"+fname)
    grid_ax = fig.add_subplot(rows, column, i)
    grid_ax.imshow(dcm.pixel_array, cmap=plt.cm.bone)
# `dcm` intentionally stays bound to the last dataset read; later cells inspect it.

Look into the meta data that comes with the DICOM File, look for insights that can help us during processing 

In [None]:
print(dcm)

As we see there are details about the sampling along with the patients details. 
Some of it(like Window center, Window width, Rescale Intercept) can help at better pre-processing of these DICOM files.

Let's look at the pixel data of one sample and find out the shape of these images

In [None]:
# Pixel data of the last dataset read in the grid cell above.
image = dcm.pixel_array
print(type(image)) # container type in which the pixel data is returned
print(image.dtype) # datatype of the pixel values
print(image.shape) # shape of the image array: (rows, columns)

So images are of 512x512, we'll downsample them later to deal with the large training set

In [None]:
plt.imshow(image, cmap=plt.cm.bone)

DICOM images typically contain between 12–16 bits/pixel, which corresponds to approximately 4,096 to 65,536 shades of gray. But most regular computer screens are often limited to 8 bits or 256 shades of gray. 

Most images like the one above **display a wide range of tissue densities**(ranging from -1000HU(air) to +1000HU(bone)), but as mentioned above a **computer screen  can only display 256 shades of gray with our eye detecting only about a 6% change** in grayscale 

**Math around it :**
 The eye can detect only about a 6% change in grayscale, so it can tell apart roughly `100/6 ≈ 17 shades of gray`
 To display a DICOM image (with a range of approx. 2000HU) on a computer screen (which can only display 256 shades of gray): `2000/256 ≈ 8 --> each shade of gray would cover a difference of 8HU`
 
 Therefore, each perceptible step in gray would span about `256/17*8 ≈ 120HU`
 
 BUT, the difference between normal and pathologically altered tissue is usually a lot less than 120 HU 
 

So what to do? **Windowing !**

# **Getting into data preprocessing**

The point of applying windows is **to focus the 256 shades of grey onto a narrow region of HU (Hounsfield units) that contains the relevant tissue densities we are interested in** while diagnosing.

In [None]:
def dcm_correction(dcm_img):
    """Correct the pixel data and rescale intercept of a 12-bit DICOM dataset, in place.

    Adds 1000 to every stored pixel value, wraps values that reach 4096 (the
    12-bit range) back around by subtracting 4096, writes the result back to
    `PixelData`, and sets `RescaleIntercept` to -1000 so that all corrected
    images share a common intercept.

    The dataset is modified in place and nothing is returned. The function is
    not idempotent: every call applies the +1000 shift again.
    """
    wrap_at = 4096
    shifted = dcm_img.pixel_array + 1000
    overflow = shifted >= wrap_at  # values that no longer fit in 12 bits
    shifted[overflow] = shifted[overflow] - wrap_at
    dcm_img.PixelData = shifted.tobytes()
    dcm_img.RescaleIntercept = -1000

In [None]:
# Collect the datasets stored with 12 bits among the scanned training files.
# The previous loop bound was the length of the directory *path string*, not a
# number of files. Iterate over the file list itself, capped so the cell stays
# fast and the fully loaded datasets fit in memory.
SCAN_LIMIT = 1000  # number of training files to inspect
diff_size = []
for fname in train_files[:SCAN_LIMIT]:
    dicom = pydicom.dcmread(INPUT_PATH+"stage_2_train/"+fname)

    if dicom.BitsStored == 12:
        diff_size.append(dicom)

In [None]:
len(diff_size)

In [None]:
diff_size[1]

In [None]:
diff_size[1].pixel_array

In [None]:
dcm_correction(diff_size[1])
diff_size[1]

In [None]:
# Collect the datasets whose rescale intercept differs from -1024 among the
# scanned training files.
# The previous loop bound was the length of the directory *path string*, not a
# number of files. Iterate over the file list itself, capped so the cell stays
# fast and the fully loaded datasets fit in memory.
SCAN_LIMIT = 1000  # number of training files to inspect
diff_rescale = []
for fname in train_files[:SCAN_LIMIT]:
    dicom = pydicom.dcmread(INPUT_PATH+"stage_2_train/"+fname)

    if (int(dicom.RescaleIntercept) != -1024):
        diff_rescale.append(dicom)

In [None]:
diff_rescale[0].pixel_array

In [None]:
dcm_correction(diff_rescale[0])

In [None]:
diff_rescale[0].pixel_array

In [None]:
diff_rescale

In [None]:
# Collect the datasets that are both 12-bit and unsigned (PixelRepresentation 0)
# among the scanned training files.
# The previous loop bound was the length of the directory *path string*, not a
# number of files. Iterate over the file list itself, capped so the cell stays
# fast and the fully loaded datasets fit in memory.
SCAN_LIMIT = 1000  # number of training files to inspect
need_correct = []
for fname in train_files[:SCAN_LIMIT]:
    dicom = pydicom.dcmread(INPUT_PATH+"stage_2_train/"+fname)

    if (dicom.BitsStored == 12) and (dicom.PixelRepresentation == 0):
        need_correct.append(dicom)

In [None]:
len(need_correct)

## **Windowing**

Windowing, also known as grey-level mapping or contrast enhancement is the process in which the CT image greyscale component of an image is manipulated via the CT numbers; doing this will change the appearance of the picture **to highlight particular structures**

Here's where some of the DICOM meta data comes to help -

`Window width` also known as the contrast --> is the measure of the range of CT numbers that an image contains

`Window center` also known as brightness -->  is the midpoint of the range of the CT numbers displayed; when the window level is decreased the CT image will be brighter and vice versa.


These two could be used to calculate the upper and lower grey levels, to produce different kinds of windows based on the kind of diagnosis.


In [None]:
#Systemic/linear windowing
def window_image(dcm, window_center, window_width):
    """Return the pixel data of `dcm` rescaled and clipped to a display window.

    Parameters
    ----------
    dcm : DICOM dataset
        Must expose `BitsStored`, `PixelRepresentation`, `RescaleSlope`,
        `RescaleIntercept` and `pixel_array`. It may be modified in place by
        `dcm_correction` (see below).
    window_center : number
        Midpoint of the visible value range.
    window_width : number
        Width of the visible value range.

    Returns
    -------
    numpy.ndarray
        `pixel_array * RescaleSlope + RescaleIntercept`, clipped to
        `[window_center - window_width // 2, window_center + window_width // 2]`.
    """
    # Only 12-bit unsigned images whose intercept has not been normalised yet
    # are corrected. `dcm_correction` sets the intercept to -1000, so the
    # intercept check keeps the correction from being applied again when the
    # same dataset is windowed several times (one call per window), and leaves
    # files that already carry a standard negative intercept untouched.
    needs_correction = (
        dcm.BitsStored == 12
        and dcm.PixelRepresentation == 0
        and int(dcm.RescaleIntercept) > -100
    )
    if needs_correction:
        dcm_correction(dcm)

    # Apply the linear rescale stored in the DICOM header to the raw pixels.
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2  # lowest visible value
    img_max = window_center + window_width // 2  # highest visible value
    return np.clip(img, img_min, img_max)

**So what happening above ?**

If the DICOM file is of 12-bit type(41 outliers), then we correct them before generating our windows.

We then clip the pixel intensities between the lowest and highest visible values, to focus only on a narrow region where the abnormality might be present. This means that every pixel value greater than `img_max` will show up as **white** and every value below `img_min` will show up as **black**

Since each window highlights particular ranges, it makes it easier for a radiologist (the DL system in our case) to see if there are any changes between normal and pathologically altered tissue. So based on the diagnosis, the model would learn to look at only certain windows of tissue densities (features). 

Lets now try to get a picture of how windows would help us

In [None]:
TRAIN_PATH = INPUT_PATH+"stage_2_train/"
TEST_PATH = INPUT_PATH+"stage_2_test/"

def view_images(image, title='', index=3):
    """Show one training scan as raw pixels and under three CT windows.

    Reads ``image[index] + '.dcm'`` from ``TRAIN_PATH`` and draws four stacked
    panels: the raw pixel array, then the brain (center 40, width 80),
    subdural (80, 200) and soft-tissue (40, 380) windows produced by
    ``window_image``.

    Parameters
    ----------
    image : sequence of str
        Image IDs without the ``.dcm`` extension. Only one entry is displayed.
    title : str, optional
        Figure-level title.
    index : int, optional
        Position in ``image`` of the scan to display. Defaults to 3, the
        position that was previously hard-coded.

    Raises
    ------
    IndexError
        If ``image`` has no element at ``index``.
    """
    dcm = pydicom.dcmread(os.path.join(TRAIN_PATH,image[index]+'.dcm'))

    fig, (ax1, ax2, ax3, ax4) = plt.subplots(4,1, figsize=(10,24))

    ax1.set_title("Default window")
    ax1.imshow(dcm.pixel_array, cmap=plt.cm.bone)

    ax2.set_title("Brain window")
    brain_img = window_image(dcm, 40, 80)
    ax2.imshow(brain_img, cmap=plt.cm.bone)

    ax3.set_title("Subdural window")
    subdural_img = window_image(dcm, 80, 200)
    ax3.imshow(subdural_img, cmap=plt.cm.bone)

    ax4.set_title("Soft Tissue window")
    soft_img = window_image(dcm, 40, 380)
    ax4.imshow(soft_img, cmap=plt.cm.bone)

    # Hide the axis ticks and frames on every panel.
    for ax in fig.axes:
        ax.axis("off")

    fig.suptitle(title)
    plt.show()
    

**Too much info! what are these different windows?**

[radiopaedia.org](https://radiopaedia.org/articles/ct-head-an-approach?lang=gb) shows a typical workflow and the well-thought-out process that a radiologist follows when given the task of detecting any abnormalities on a CT scan of the brain.

For head CT, bone window and brain window are two important window settings.However, the details of soft tissues such as brain, that shows density lower than that of bones, are lost in the bone window setting. Brain window is the most frequently used setting, and the majority of evaluations of brain abnormality are done using this window setting.

What we understand is that, while the brain matter window is able to pick up most abnormalities, it might cause us to miss some diagnoses. So while diagnosing something like a hemorrhage we need to look into other windows, like the subdural window, which focuses more on subdural hematomas.

#### **Subdurals could be tricky..** ###

If you check their definition, they are usually right next to the skull, longish in shape, and follow the curvature of the skull. Hence, if you look through a brain window you might miss them, so it is advised to incorporate a subdural window.


In [None]:
train_df.head()

In [None]:
view_images(train_df[(train_df["subtype"] == 'epidural') & (train_df['label'] == 1)][:10].image.values, title='Images wth epidural')

In [None]:
view_images(train_df[(train_df["subtype"] == 'subdural') & (train_df['label'] == 1)][:10].image.values, title='Images wth subdural')

In [None]:
view_images(train_df[(train_df["subtype"] == 'subarachnoid') & (train_df['label'] == 1)][:10].image.values, title='Images wth subarachnoid')

In [None]:
view_images(train_df[(train_df["subtype"] == 'intraventricular') & (train_df['label'] == 1)][:10].image.values, title='Images wth intraventricular')

In [None]:
view_images(train_df[(train_df["subtype"] == 'intraparenchymal') & (train_df['label'] == 1)][:10].image.values, title='Images wth intraparenchymal')

### Combining the windows 

Let's take a random sample from the dataset to show what the entire preprocessing would look like

In [None]:
# Load one fixed training scan and show its raw pixel data.
test_case = os.path.join(TRAIN_PATH,'ID_12cadc6af.dcm')

# Use dcmread, as the rest of the notebook does; read_file is a deprecated alias.
test_data = pydicom.dcmread(test_case)
plt.imshow(test_data.pixel_array, cmap=plt.cm.bone)

Next, create the brain, subdural and soft tissue windows

In [None]:
# All three windows must be cut from the same scan (`test_data`). The subdural
# and soft-tissue windows previously used `dcm`, a leftover global bound to a
# different file, so the channels combined below did not belong together.
brain_img = window_image(test_data, 40, 80)
subdural_img = window_image(test_data, 80, 200)
soft_img = window_image(test_data, 40, 380)

Before concatenating them, we need to make sure that all of their values fall within the same range, so we standardise their values

In [None]:
# Min-max scale the brain window (center 40, width 80 -> values in [0, 80]) to [0, 1].
# NOTE: rebinding `brain_img` makes this cell non-idempotent; re-running it scales again.
brain_img = (brain_img - 0) / 80
# print(brain_img)
plt.imshow(brain_img, cmap=plt.cm.bone)

In [None]:
print(subdural_img)
plt.imshow(subdural_img, cmap=plt.cm.bone)

In [None]:
# Min-max scale the subdural window (center 80, width 200 -> values in [-20, 180]) to [0, 1].
# NOTE: rebinding `subdural_img` makes this cell non-idempotent; re-running it scales again.
subdural_img = (subdural_img - (-20))/200
# print(subdural_img)
plt.imshow(subdural_img, cmap=plt.cm.bone)

In [None]:
print(soft_img)
plt.imshow(soft_img, cmap=plt.cm.bone)

In [None]:
# Min-max scale the soft-tissue window (center 40, width 380 -> values in [-150, 230]) to [0, 1].
# NOTE: rebinding `soft_img` makes this cell non-idempotent; re-running it scales again.
soft_img = (soft_img - (-150))/380
# print(soft_img)
plt.imshow(soft_img, cmap=plt.cm.bone)

So we brought all three windows to same scale, now we will combine them into a single 3-channel image

In [None]:
bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)

In [None]:
plt.imshow(bsb_img, cmap=plt.cm.bone)

# REFERENCES
[1. For DICOM related info ]( https://dicomiseasy.blogspot.com/2012/08/chapter-12-pixel-data.html)

[2. To understand CT scans and workflow of radiologists](https://radiopaedia.org/articles/ct-head-an-approach?lang=gb)

[3. Paper referred](https://arxiv.org/pdf/2008.00302.pdf)

# Task

In [None]:
train_df

In [None]:
### To shorten running times, only the first 1,000 unique image IDs are used.
### All subtype rows belonging to those IDs are kept.

my_train_df = train_df[train_df['image'].isin(train_df.image.unique()[:1000])]

In [None]:
# Empty working frame with one row per sampled image ID. The columns are filled
# in by the following cells: the binary label, the three windowed images and
# the combined 3-channel image.
# NOTE: despite its name, this frame is built from the *training* sample.
test_df = pd.DataFrame(index=my_train_df.image.unique(), columns=['label',
                                                               'brain_img',
                                                               'subdural_img',
                                                               'soft_img',
                                                                  'bsb_img'])

In [None]:
test_df

In [None]:
### find which id's have any type of hemorrhage

# IDs whose `any` row is positive.
temp_list = list(my_train_df[(my_train_df['subtype']=='any') & (my_train_df['label']==1)].image)

# Assign the whole column in one step. The previous chained form
# `test_df.loc[uid]['label'] = ...` wrote to a temporary row object, which is
# not guaranteed to update `test_df`, and did a linear list lookup per ID.
test_df['label'] = test_df.index.isin(temp_list).astype(int)

In [None]:
test_df

In [None]:
### applying windowing functions

for uid in test_df.index:
    test_case = os.path.join(TRAIN_PATH,uid + '.dcm')
    # Use dcmread, as the rest of the notebook does; read_file is a deprecated alias.
    test_data = pydicom.dcmread(test_case)

    brain_img = window_image(test_data, 40, 80)
    subdural_img = window_image(test_data, 80, 200)
    soft_img = window_image(test_data, 40, 380)

    # `.at` stores the array object in exactly one cell of the frame. The
    # previous chained form `test_df.loc[uid][col] = ...` wrote to a temporary
    # row object, which is not guaranteed to update `test_df`.
    test_df.at[uid, 'brain_img'] = brain_img
    test_df.at[uid, 'subdural_img'] = subdural_img
    test_df.at[uid, 'soft_img'] = soft_img

    # Min-max scale each window to [0, 1] using its own lower bound and width.
    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20))/200
    soft_img = (soft_img - (-150))/380

    # Stack the three scaled windows as the channels of one image, channel axis last.
    test_df.at[uid, 'bsb_img'] = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)

In [None]:
test_df

# mean

In [None]:
# Reduce every cell of test_df to its mean: each image cell becomes its mean
# pixel value, while the scalar label keeps its value.
mean_df = test_df.applymap(np.mean)
mean_df

In [None]:
plt.figure(figsize=(16,10), dpi= 80)

# One filled density curve of the per-image mean for each window, on shared axes.
for window_name, curve_color in (("brain_img", "g"),
                                 ("subdural_img", "deeppink"),
                                 ("soft_img", "dodgerblue")):
    sns.kdeplot(mean_df[window_name], shade=True, color=curve_color, label=window_name, alpha=.7)

plt.title('Mean dist', fontsize=22)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image mean, one band of points per class. The number
# of random x positions is taken from the data; the previous hard-coded counts
# (133 / 867) only matched one particular sample.
hemorrhage_vals = mean_df.loc[mean_df['label']==1, 'brain_img']
healthy_vals = mean_df.loc[mean_df['label']==0, 'brain_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('mean comparison brain img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image mean, one band of points per class. The number
# of random x positions is taken from the data; the previous hard-coded counts
# (133 / 867) only matched one particular sample.
hemorrhage_vals = mean_df.loc[mean_df['label']==1, 'subdural_img']
healthy_vals = mean_df.loc[mean_df['label']==0, 'subdural_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('mean comparison subdural img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image mean, one band of points per class. The number
# of random x positions is taken from the data; the previous hard-coded counts
# (133 / 867) only matched one particular sample.
hemorrhage_vals = mean_df.loc[mean_df['label']==1, 'soft_img']
healthy_vals = mean_df.loc[mean_df['label']==0, 'soft_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('mean comparison soft img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image mean, one band of points per class. The number
# of random x positions is taken from the data; the previous hard-coded counts
# (133 / 867) only matched one particular sample.
hemorrhage_vals = mean_df.loc[mean_df['label']==1, 'bsb_img']
healthy_vals = mean_df.loc[mean_df['label']==0, 'bsb_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('mean comparison bsb img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

# STD

In [None]:
# Reduce every cell of test_df to its standard deviation (per-image pixel spread).
std_df = test_df.applymap(np.std)
# np.std of the scalar label is 0, so the label column is restored from mean_df,
# where the mean of a scalar label is the label itself.
std_df['label'] = mean_df['label']
std_df

In [None]:
plt.figure(figsize=(16,10), dpi= 80)

# One filled density curve of the per-image standard deviation for each window, on shared axes.
for window_name, curve_color in (("brain_img", "g"),
                                 ("subdural_img", "deeppink"),
                                 ("soft_img", "dodgerblue")):
    sns.kdeplot(std_df[window_name], shade=True, color=curve_color, label=window_name, alpha=.7)

plt.title('STD dist', fontsize=22)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image standard deviation, one band of points per
# class. The number of random x positions is taken from the data; the previous
# hard-coded counts (133 / 867) only matched one particular sample.
hemorrhage_vals = std_df.loc[std_df['label']==1, 'brain_img']
healthy_vals = std_df.loc[std_df['label']==0, 'brain_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('STD comparison brain img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image standard deviation, one band of points per
# class. The number of random x positions is taken from the data; the previous
# hard-coded counts (133 / 867) only matched one particular sample.
hemorrhage_vals = std_df.loc[std_df['label']==1, 'subdural_img']
healthy_vals = std_df.loc[std_df['label']==0, 'subdural_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('std comparison subdural img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image standard deviation, one band of points per
# class. The number of random x positions is taken from the data; the previous
# hard-coded counts (133 / 867) only matched one particular sample.
hemorrhage_vals = std_df.loc[std_df['label']==1, 'soft_img']
healthy_vals = std_df.loc[std_df['label']==0, 'soft_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('std comparison soft img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

In [None]:
plt.figure(figsize=(8,5), dpi= 80)

# Jitter plot of the per-image standard deviation, one band of points per
# class. The number of random x positions is taken from the data; the previous
# hard-coded counts (133 / 867) only matched one particular sample.
hemorrhage_vals = std_df.loc[std_df['label']==1, 'bsb_img']
healthy_vals = std_df.loc[std_df['label']==0, 'bsb_img']
plt.plot(np.random.random_sample(size=len(hemorrhage_vals)), hemorrhage_vals, 'o', color='blue', label="hemorrhage");
plt.plot(np.random.random_sample(size=len(healthy_vals))+1, healthy_vals, 'o', color='red', label="healthy");

plt.title('std comparison bsb img', fontsize=15)
plt.xticks([0.5, 1.5], ["hemorrhage", "healthy"])
plt.show()

# Statistical Analysis and Modeling

**I've decided to take all the std values and run them through a t-test to build my model**

Below is one example

In [None]:
import scipy.stats as stats

In [None]:
### I'll check which image configurations are significant:

# Independent two-sample t-test per window: per-image std of the hemorrhage
# group (label 1) against the healthy group (label 0).
for window_name in ('brain_img', 'subdural_img', 'soft_img'):
    window_std = std_df[window_name]
    print(stats.ttest_ind(window_std[std_df['label'] == 1],
                          window_std[std_df['label'] == 0]))

In [None]:
### brain_img is the only significant feature, so it is the one chosen for the model.
### The model is simple and uses the distribution of brain_img std values:
### for each new data point, compute its standard score relative to the hemorrhage
### population; if the score exceeds a cut-off, classify the point as different
### from the hemorrhage population, i.e. as healthy.
### Finally, each true hemorrhage data point is added to the hemorrhage population,
### which makes this an online model.

# First 500 rows form the initial population, the remaining rows are streamed as test points.
new_train_std_df = std_df.iloc[:500,:].copy()
new_test_std_df = std_df.iloc[500:,:].copy()
results_df = pd.DataFrame(index=new_test_std_df.index, columns=['label','results'])
results_df['label'] = new_test_std_df['label']

for uid in new_test_std_df.index:
    # Current hemorrhage population with the new point appended as its last row.
    temp = new_train_std_df[new_train_std_df['label']==1].copy()
    temp.loc[uid] = new_test_std_df.loc[uid]
    # z-score of the appended point (last element) within that population;
    # |z| > 3 -> predicted healthy (0), otherwise predicted hemorrhage (1).
    if abs(stats.zscore(np.array(temp['brain_img']))[-1])>3:
        results_df.at[uid,'results'] = 0
    else:
        results_df.at[uid,'results'] = 1

    # Online update: uses the point's TRUE label (not the prediction) to decide
    # whether it joins the population used for subsequent points.
    if new_test_std_df.loc[uid]['label'] == 1:
        new_train_std_df.loc[uid] = new_test_std_df.loc[uid]
        
results_df

In [None]:
### check correlation
results_df['label'].astype(int).corr(results_df['results'].astype(int))

# Another option is to explore sklearn models

we'll explore a few models to check which one suits best

In [None]:
feature_names = ['brain_img', 'subdural_img', 'soft_img', 'bsb_img']
X = std_df[feature_names]
y = std_df['label']

In [None]:
from sklearn.model_selection import train_test_split
# Train/test split with default proportions; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training split only, then apply the same scaling to
# the test split so that no test statistics leak into training.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
### Logistic Regression

from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the scaled training features and report
# its accuracy on both splits.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
train_accuracy = logreg.score(X_train, y_train)
print(f'Accuracy of Logistic regression classifier on training set: {train_accuracy:.2f}')
test_accuracy = logreg.score(X_test, y_test)
print(f'Accuracy of Logistic regression classifier on test set: {test_accuracy:.2f}')

In [None]:
### Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Fit a default decision tree on the scaled training features and report its
# accuracy on both splits.
clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
print(f'Accuracy of Decision Tree classifier on training set: {train_accuracy:.2f}')
test_accuracy = clf.score(X_test, y_test)
print(f'Accuracy of Decision Tree classifier on test set: {test_accuracy:.2f}')

In [None]:
### K-Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier

# Fit a default k-nearest-neighbours classifier on the scaled training
# features and report its accuracy on both splits.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
train_accuracy = knn.score(X_train, y_train)
print(f'Accuracy of K-NN classifier on training set: {train_accuracy:.2f}')
test_accuracy = knn.score(X_test, y_test)
print(f'Accuracy of K-NN classifier on test set: {test_accuracy:.2f}')

# we've received the best results when using a logistic regression classifier