In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#Samples
df = pd.read_csv('Leukemya_data.csv', header=None)

#Labels
labels = pd.read_csv('labels.csv', header=None)
y = labels.add(-1) #transformar as labels em 0 e 1

#Separar as 128 samples com labels das 50 samples sem labels
predict_data = df.iloc[128:,:]
X = df.iloc[:128,:]

In [4]:
#Proproção entre 0 e 1 no dataset
y[0].value_counts()

0    111
1     17
Name: 0, dtype: int64

In [5]:
#Pequena descrição do dataset
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,176,177,178,179,180,181,182,183,184,185
count,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,...,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0,128.0
mean,0.598454,0.199827,-0.314046,-0.279318,0.628314,0.285455,0.551004,0.082255,-0.790809,0.175583,...,0.464284,1.075097,0.333877,0.126951,0.226637,0.101913,1.517887,3.429096,0.185997,0.13288
std,0.072532,0.031099,0.598357,1.070608,0.100354,0.090456,0.036781,0.015182,0.659477,1.32665,...,0.631497,1.247563,0.068351,0.039831,0.038796,0.030314,1.124836,9.670882,0.044779,0.08361
min,0.37677,0.1035,-1.2998,-1.4721,0.33855,0.088063,0.42476,0.029278,-2.0593,-1.6178,...,-1.9795,-1.3319,0.17124,0.059768,0.1492,0.01545,-0.29816,-1.734,0.14028,0.0
25%,0.555222,0.19015,-0.72569,-0.983997,0.58562,0.218935,0.531698,0.075788,-1.256475,-0.784435,...,0.10918,0.294132,0.29796,0.099242,0.205205,0.090534,0.850782,-0.813325,0.16312,0.06272
50%,0.60973,0.20331,-0.45711,-0.54937,0.65215,0.2774,0.56033,0.086056,-0.87909,-0.177425,...,0.51378,0.838475,0.32852,0.11927,0.22083,0.10693,1.40755,0.761485,0.173315,0.107145
75%,0.65252,0.217172,-0.050245,-0.068488,0.701812,0.349807,0.57512,0.091499,-0.486663,0.866645,...,0.831695,1.596775,0.373015,0.145382,0.248677,0.122075,1.85485,2.469725,0.18352,0.20544
max,0.72514,0.26425,1.8783,5.1816,0.79061,0.51174,0.61577,0.10867,1.5148,5.0,...,2.3205,6.6873,0.60431,0.31617,0.33413,0.15602,6.0316,66.292,0.35907,0.31736


## Sampling

In [6]:
#De modo a obter um dataset equilibrado - Fazer oversampling, uma vez que que temos um dataset pequeno
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=42)

X2, y2 = oversample.fit_resample(X, y)
y[0].value_counts()

0    111
1     17
Name: 0, dtype: int64

In [7]:
X2.shape, y2.shape

((222, 186), (222, 1))

In [8]:
#Dividir os dados em dados de treino e dados de teste
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X2, y2, test_size=0.3, random_state=42)

Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape

((155, 186), (67, 186), (155, 1), (67, 1))

## Feature selection

In [9]:
corr = Xtrain.corr() #Matriz de correlação
print(corr)

0         1         2         3         4         5         6    \
0    1.000000  0.571570 -0.928204 -0.535481  0.974411  0.329458  0.708757   
1    0.571570  1.000000 -0.710771 -0.814761  0.609479  0.886626  0.707371   
2   -0.928204 -0.710771  1.000000  0.635170 -0.940009 -0.458372 -0.861759   
3   -0.535481 -0.814761  0.635170  1.000000 -0.529580 -0.836795 -0.555862   
4    0.974411  0.609479 -0.940009 -0.529580  1.000000  0.380068  0.758156   
..        ...       ...       ...       ...       ...       ...       ...   
181  0.289174  0.570656 -0.499443 -0.384363  0.392681  0.465671  0.636138   
182 -0.386378 -0.601743  0.545737  0.453267 -0.465637 -0.516138 -0.671206   
183 -0.403812 -0.630893  0.551787  0.509076 -0.457613 -0.507626 -0.709683   
184  0.160388  0.264356 -0.238240 -0.225057  0.244898  0.327651  0.274931   
185  0.271315  0.415240 -0.398812 -0.319937  0.349145  0.407962  0.428265   

          7         8         9    ...       176       177       178  \
0    0.401059

In [10]:
def feature_drop(df):
    """Esta função elimina certas colunas de um dataset
    com base na sua matriz de correlação"""
    global list
    list=[]
    Xtrain2 = df
    for i in range(186):
        for j in range(i, 186):
            value = corr.loc[i].iat[j]
            if value>0.9 and j!=i:
                print(f'Correlação entre {i} e {j}')
                if j not in list:
                    list.append(j)

    Xtrain2 = Xtrain2.drop(list,axis=1)
    return Xtrain2

Xtrain2 = feature_drop(Xtrain)
Xtest2 = Xtest.drop(list,axis=1)
predict_data2 = predict_data.drop(list,axis=1)

Correlação entre 0 e 4
Correlação entre 2 e 8
Correlação entre 6 e 10
Correlação entre 10 e 40
Correlação entre 12 e 16
Correlação entre 18 e 22
Correlação entre 24 e 28
Correlação entre 24 e 66
Correlação entre 24 e 70
Correlação entre 25 e 67
Correlação entre 26 e 68
Correlação entre 26 e 128
Correlação entre 28 e 66
Correlação entre 28 e 70
Correlação entre 29 e 71
Correlação entre 30 e 34
Correlação entre 36 e 40
Correlação entre 42 e 46
Correlação entre 48 e 52
Correlação entre 54 e 58
Correlação entre 60 e 64
Correlação entre 66 e 70
Correlação entre 72 e 76
Correlação entre 78 e 82
Correlação entre 79 e 83
Correlação entre 84 e 88
Correlação entre 85 e 89
Correlação entre 90 e 94
Correlação entre 91 e 95
Correlação entre 96 e 100
Correlação entre 102 e 106
Correlação entre 104 e 105
Correlação entre 108 e 112
Correlação entre 108 e 113
Correlação entre 110 e 111
Correlação entre 116 e 117
Correlação entre 120 e 121
Correlação entre 120 e 124
Correlação entre 120 e 125
Correlação

In [11]:
Xtrain2

Unnamed: 0,0,1,2,3,5,6,7,9,11,12,...,171,173,174,175,176,177,179,180,182,184
182,0.456121,0.125120,1.325341,3.024584,0.129743,0.487328,0.052456,1.517226,0.055944,0.613929,...,2.359284,0.098970,0.454454,0.096893,-1.187531,1.835709,0.101493,0.168765,3.846218,0.153779
135,0.516983,0.127788,0.903578,2.025405,0.141086,0.486965,0.047446,1.560570,0.051395,0.556035,...,7.398438,0.054606,0.385813,0.091323,-0.712674,0.524339,0.108461,0.165602,3.401321,0.153936
86,0.646320,0.197520,-0.789460,0.076750,0.218200,0.575950,0.078682,0.899230,0.058637,0.546790,...,0.412810,0.075408,0.398300,0.110980,-0.101150,0.874130,0.110890,0.196110,1.926100,0.160500
65,0.666770,0.187610,-0.918730,0.462410,0.203520,0.577310,0.075230,2.347700,0.047963,0.514130,...,0.143510,0.099957,0.332810,0.100640,0.569450,0.406670,0.113540,0.205430,1.521700,0.163120
207,0.664382,0.194045,-0.592446,-0.062782,0.249574,0.537223,0.073950,0.774251,0.076983,0.603659,...,-0.023542,0.121741,0.464572,0.126428,-0.686019,0.251491,0.150212,0.163147,3.466096,0.149029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.617480,0.214380,-0.573730,-0.633010,0.287670,0.577640,0.089599,-0.303220,0.119570,0.528360,...,-0.655820,0.159490,0.374400,0.111910,0.186320,0.287530,0.120540,0.218210,1.691000,0.172270
14,0.622580,0.215920,-0.584100,-0.530650,0.277890,0.568990,0.084487,0.406880,0.070319,0.519760,...,-0.273730,0.129430,0.388160,0.117050,-0.220900,-0.072788,0.131920,0.231230,1.500100,0.179880
92,0.725140,0.198860,-1.156900,0.635540,0.205480,0.594220,0.078664,1.518400,0.049561,0.653880,...,-0.233610,0.151170,0.358910,0.101180,0.276670,0.968420,0.111460,0.287200,0.490770,0.208120
179,0.583320,0.146183,0.284645,0.229740,0.181588,0.497442,0.046786,1.362502,0.052348,0.521454,...,6.579383,0.076159,0.403102,0.095070,-0.590679,0.442113,0.122618,0.163470,3.461385,0.154591


In [12]:
Xtest2

Unnamed: 0,0,1,2,3,5,6,7,9,11,12,...,171,173,174,175,176,177,179,180,182,184
154,0.520785,0.182583,0.233224,-0.453144,0.271421,0.529132,0.071441,-0.304470,0.116314,0.547261,...,-0.444863,0.193442,0.384009,0.122378,0.112193,-0.012783,0.176526,0.208992,2.216455,0.163368
215,0.566613,0.133363,0.110959,0.547993,0.159650,0.472884,0.042155,2.491550,0.051316,0.451933,...,1.069653,0.078536,0.545247,0.090439,-1.707239,4.199853,0.077152,0.158971,2.290402,0.150015
15,0.550020,0.240310,-0.010231,-1.456400,0.446180,0.539660,0.090968,-0.824740,0.149900,0.488440,...,-1.515900,0.273980,0.282010,0.105330,0.727430,0.294820,0.158160,0.281990,0.522950,0.179880
173,0.478191,0.156702,1.094282,1.532995,0.157164,0.482215,0.072827,0.465817,0.078710,0.544307,...,-1.254861,0.265313,0.297690,0.133197,0.894645,0.107229,0.189251,0.180541,2.432443,0.156387
158,0.530019,0.133420,0.539395,0.742605,0.159850,0.487301,0.045643,1.914177,0.048853,0.591275,...,1.310251,0.117574,0.470702,0.102542,-1.126647,2.236755,0.113409,0.165756,4.725240,0.154305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,0.587516,0.143918,0.265428,0.306523,0.177426,0.500745,0.051824,0.991619,0.065863,0.511718,...,8.341588,0.050906,0.397216,0.086311,-0.708833,1.325906,0.098556,0.167347,3.178013,0.156648
206,0.476294,0.152546,1.039258,1.495883,0.143706,0.484604,0.065577,0.883693,0.063087,0.537090,...,-0.814128,0.241584,0.317502,0.123463,0.677838,0.253009,0.171767,0.177702,2.900768,0.154850
113,0.657080,0.196940,-0.953000,0.167970,0.194720,0.566410,0.079175,0.996760,0.051613,0.533290,...,-0.253110,0.113630,0.365760,0.103610,-0.061330,0.778270,0.108360,0.225290,1.223300,0.180450
5,0.592830,0.239070,-0.429870,-1.086400,0.408020,0.562480,0.090431,-0.888550,0.157350,0.525090,...,-1.155100,0.264340,0.363880,0.128590,0.290770,-0.098939,0.165180,0.216160,1.678300,0.178740


In [13]:
#Reduzir o Xtrain2, o Xtest2 e o predict_data2 a apenas 20 features (20 "melhores" features)
from sklearn.feature_selection import SelectKBest, f_classif
SKB = SelectKBest(score_func=f_classif, k=20)
Xtrain_fselected = SKB.fit_transform(Xtrain2, ytrain)
Xtest_fselected = SKB.transform(Xtest2)
predict_data_fselected = SKB.transform(predict_data2)
cols = SKB.get_support(indices=True)

Xtrain_fselected = pd.DataFrame(data=Xtrain_fselected, columns=cols)
Xtest_fselected = pd.DataFrame(data=Xtest_fselected, columns=cols)
predict_data_fselected = pd.DataFrame(data=predict_data_fselected, columns=cols)
cols

array([  1,   2,   5,   6,  11,  29,  39,  41,  43,  68,  69,  89,  90,
        95,  96,  98, 125, 127, 130, 131], dtype=int64)

In [14]:
Xtrain_fselected.shape, Xtest_fselected.shape, predict_data_fselected.shape

((155, 20), (67, 20), (50, 20))

## Classificadores

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

def check_accuracy(y_pred,y_true):
    n=0
    for i in range(len(y_true)):
        if y_pred[i]==y_true[i]:
            n=n+1
    acc = n/len(y_true)
    return acc

def precision(pred,test):
    true_positives = 0
    positive_predictions = (pred == 1).sum()
    for i in range(0,len(pred)):
        if pred[i] == test[i] and test[i] == 1:
            true_positives+=1
    return true_positives/positive_predictions
    
def recall(pred,test):
    true_positives = 0
    positives = (test == 1).sum()
    for i in range(0,len(pred)):
        if pred[i] == test[i] and test[i] == 1:
            true_positives+=1
    return true_positives/positives

def F1score(precision,recall):
    score = 2*(precision*recall)/(precision+recall)
    return score

In [16]:
ytest = ytest.to_numpy()

In [17]:
#KKN - K Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
clf_KNN = KNeighborsClassifier(n_neighbors=2,p=2)
clf_KNN.fit(Xtrain_fselected, ytrain)

y_pred = clf_KNN.predict(Xtest_fselected)
y_pred

array([1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0], dtype=int64)

In [18]:
#Medidas do desempenho
ACC=check_accuracy(y_pred, ytest)
PREC=precision(y_pred, ytest)
REC=recall(y_pred, ytest)
F1=F1score(PREC, REC)

print('K Neighbors Classifier accuracy is: ', ACC)
print('K Neighbors Classifier precision is: ', PREC)
print('K Neighbors Classifier recall is: ', REC)
print('K Neighbors Classifier F1 score is: ', F1)

K Neighbors Classifier accuracy is:  0.9701492537313433
K Neighbors Classifier precision is:  0.9696969696969697
K Neighbors Classifier recall is:  0.9696969696969697
K Neighbors Classifier F1 score is:  0.9696969696969697


In [19]:
#Classification report
print(classification_report(ytest, y_pred))

precision    recall  f1-score   support

           0       0.97      0.97      0.97        34
           1       0.97      0.97      0.97        33

    accuracy                           0.97        67
   macro avg       0.97      0.97      0.97        67
weighted avg       0.97      0.97      0.97        67



In [20]:
#Matriz de confusão
cmatrix = confusion_matrix(ytest, y_pred)
cmatrix

array([[33,  1],
       [ 1, 32]], dtype=int64)

In [21]:
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
clf_DTC = DecisionTreeClassifier(random_state=42)
clf_DTC.fit(Xtrain_fselected, ytrain)

y_pred2 = clf_DTC.predict(Xtest_fselected)

In [22]:
#Medidas do desempenho
ACC2=check_accuracy(y_pred2, ytest)
PREC2=precision(y_pred2, ytest)
REC2=recall(y_pred2, ytest)
F12=F1score(PREC2, REC2)

print('Decision Tree Classifier accuracy is: ', ACC2)
print('Decision Tree Classifier precision is: ', PREC2)
print('Decision Tree Classifier recall is: ', REC2)
print('Decision Tree Classifier F1 score is: ', F12)

Decision Tree Classifier accuracy is:  0.9402985074626866
Decision Tree Classifier precision is:  0.9393939393939394
Decision Tree Classifier recall is:  0.9393939393939394
Decision Tree Classifier F1 score is:  0.9393939393939394


In [23]:
#Classification report
print(classification_report(ytest, y_pred2))

precision    recall  f1-score   support

           0       0.94      0.94      0.94        34
           1       0.94      0.94      0.94        33

    accuracy                           0.94        67
   macro avg       0.94      0.94      0.94        67
weighted avg       0.94      0.94      0.94        67



In [24]:
#Matriz de confusão
cmatrix2 = confusion_matrix(ytest, y_pred2)
cmatrix2

array([[32,  2],
       [ 2, 31]], dtype=int64)

In [25]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
clf_RFC = RandomForestClassifier(random_state=42)
clf_RFC.fit(Xtrain_fselected, ytrain)

y_pred3 = clf_RFC.predict(Xtest_fselected)

In [26]:
#Medidas do desempenho
ACC3=check_accuracy(y_pred3, ytest)
PREC3=precision(y_pred3, ytest)
REC3=recall(y_pred3, ytest)
F13=F1score(PREC3, REC3)

print('Random Forest Classifier accuracy is: ', ACC3)
print('Random Forest Classifier precision is: ', PREC3)
print('Random Forest Classifier recall is: ', REC3)
print('Random Forest Classifier F1 score is: ', F13)

Random Forest Classifier accuracy is:  0.9701492537313433
Random Forest Classifier precision is:  0.9696969696969697
Random Forest Classifier recall is:  0.9696969696969697
Random Forest Classifier F1 score is:  0.9696969696969697


In [27]:
#Classification report
print(classification_report(ytest, y_pred3))

precision    recall  f1-score   support

           0       0.97      0.97      0.97        34
           1       0.97      0.97      0.97        33

    accuracy                           0.97        67
   macro avg       0.97      0.97      0.97        67
weighted avg       0.97      0.97      0.97        67



In [28]:
#Matriz de confusão
cmatrix3 = confusion_matrix(ytest, y_pred3)
cmatrix3

array([[33,  1],
       [ 1, 32]], dtype=int64)

In [29]:
#Teste real dos três classificadores
prediction_KNN = clf_KNN.predict(predict_data_fselected)
prediction_DTC = clf_DTC.predict(predict_data_fselected)
prediction_RFC = clf_RFC.predict(predict_data_fselected)

In [36]:
#Salvar os testes reais em ficheiros csv
import csv
import os
ID = predict_data.index.values
def save(ID, array, filename):
    df = pd.DataFrame(data=array, index=ID, columns=['Predictions'])
    if not os.path.exists('results'):
        os.makedirs('results')
    df.to_csv(f'./results/{filename}')


save(ID,prediction_DTC,'results_DTC.csv'), save(ID,prediction_KNN,'results_KNN.csv'), save(ID,prediction_RFC,'results_RFC.csv') 

(None, None, None)