In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
import sklearn
%matplotlib inline

In [None]:
# Load the raw blood-donation table from disk.
blood = pd.read_csv("/content/drive/MyDrive/Cópia de G4_blood-donation.csv")

# Readable column names, listed in the same order as the raw columns V1..V4 and Class.
labels = [
    'Months since Last Donation',
    'Number of Donations',
    'Total Volume Donated (c.c.)',
    'Months since First Donation',
    'Made Donation in March 2007',
]
raw_columns = ['V1', 'V2', 'V3', 'V4', 'Class']

# Render floats with two decimals in every DataFrame display.
pd.set_option('display.float_format', lambda value: '%.2f' % value)

# errors="raise" makes the rename fail loudly if any expected raw column is missing.
blood = blood.rename(columns=dict(zip(raw_columns, labels)), errors="raise")
blood.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Total Volume Donated (c.c.),Months since First Donation,Made Donation in March 2007
0,2,50,12500,98,2
1,0,13,3250,28,2
2,1,16,4000,35,2
3,2,20,5000,45,2
4,1,24,6000,77,1


In [None]:
# In the rows displayed above, 'Total Volume Donated (c.c.)' is exactly 250 times
# 'Number of Donations', so it carries no extra information and is removed.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# safe to re-run after the column has already been dropped.
blood = blood.drop(columns='Total Volume Donated (c.c.)', errors='ignore')
blood.head()


Unnamed: 0,Months since Last Donation,Number of Donations,Months since First Donation,Made Donation in March 2007
0,2,50,98,2
1,0,13,28,2
2,1,16,35,2
3,2,20,45,2
4,1,24,77,1


In [None]:
# Recode the target to 0/1: values greater than 1 become 1, everything else becomes 0.
# NOTE(review): running this cell a second time on the already recoded column
# turns every value into 0, because neither 0 nor 1 is greater than 1.
target_column = 'Made Donation in March 2007'
blood[target_column] = blood[target_column].gt(1) * 1
blood.head()

Unnamed: 0,Months since Last Donation,Number of Donations,Months since First Donation,Made Donation in March 2007
0,2,50,98,1
1,0,13,28,1
2,1,16,35,1
3,2,20,45,1
4,1,24,77,0


In [None]:
# Separate features and target: the last column is the label, every column before it is a feature.
X = blood.iloc[:, :-1]
y = blood.iloc[:, -1]

# Hold out 15% of the rows as the test set, preserving the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=6
)

# Split the remaining rows into the final training set and a validation set (20%).
# This split is stratified as well, so the validation set keeps the same class
# ratio as the training and test sets instead of drifting by chance.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=6
)

# Fit the scaler on the final training rows only, so that no statistics from the
# validation or test rows leak into the preprocessing.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

# Apply the training-set scaling to the other two splits.
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)




In [None]:
# Baseline decision tree with default hyperparameters.
# random_state is fixed (same seed as the data splits) so the fitted tree and
# the metrics computed from it are reproducible across runs.
# NOTE(review): this rebinds `blood`, which held the DataFrame until here, so the
# earlier data cells cannot be re-run without reloading the CSV first. The name
# is kept because the fit/predict cells call `blood.fit` and `blood.predict`.
blood = DecisionTreeClassifier(random_state=6)

In [None]:
%time
blood.fit(X_train, y_train)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.39 µs


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict class labels for the scaled test features with the fitted baseline tree.
predict = blood.predict(X=X_test)

In [None]:
# Per-class precision, recall and F1 of the baseline predictions on the test set.
baseline_report = classification_report(y_true=y_test, y_pred=predict)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.77      0.70      0.73        86
           1       0.26      0.33      0.29        27

    accuracy                           0.61       113
   macro avg       0.51      0.52      0.51       113
weighted avg       0.65      0.61      0.63       113



In [None]:
# Confusion matrix of the baseline on the test set (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=predict)

array([[60, 26],
       [18,  9]])

In [None]:
# Hyperparameter grid explored for the decision tree. Every other
# DecisionTreeClassifier hyperparameter (max_features, max_leaf_nodes,
# min_impurity_decrease, class_weight, ccp_alpha, ...) is left at its default.
parameters = {
    'criterion': ["gini", "entropy"],   # function used to measure the quality of a split
    'splitter': ["best", "random"],     # strategy used to choose the split at each node
    'max_depth': [60, 80, 100, 110],    # maximum depth of the tree
    'min_samples_split': [10],          # minimum number of samples required to split an internal node
    'min_samples_leaf': [1],            # minimum number of samples required at a leaf node
}

# One independent grid search is run for each of these scoring metrics.
scores = ['roc_auc', 'f1_macro', 'recall_macro', 'precision_macro', 'accuracy']

for score in scores:

  print("#Hyperparametro: %s" % score)

  # The estimator is seeded (same seed as the data splits): without it the
  # selected parameters change from run to run, most visibly with
  # splitter='random'.
  dct = GridSearchCV(
      estimator=DecisionTreeClassifier(random_state=6),
      param_grid=parameters,
      cv=10,
      verbose=True,
      scoring=score,
  )
  dct.fit(X_train, y_train)

  print("Melhores parametros:")
  print()
  print(dct.best_params_)
  print()
  print("Classification report:")
  print()
  print("The model is trained on the full development set.")
  print("The scores are computed on the full evaluation set.")
  print()
  # Evaluate the refitted best estimator on the validation split.
  # Per-candidate cross-validation scores remain available in dct.cv_results_.
  y_true, y_pred = y_val, dct.predict(X_val)
  print(classification_report(y_true, y_pred))

#Hyperparametro: roc_auc
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Melhores parametros:

{'criterion': 'gini', 'max_depth': 60, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}

Classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.75      0.92      0.83        87
           1       0.67      0.35      0.46        40

    accuracy                           0.74       127
   macro avg       0.71      0.63      0.64       127
weighted avg       0.73      0.74      0.71       127

#Hyperparametro: f1_macro
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Melhores parametros:

{'criterion': 'gini', 'max_depth': 80, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}

Classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.71      0.92      0.80        87
           1       0.50      0.17      0.26        40

    accuracy                           0.69       127
   macro avg       0.60      0.55      0.53       127
weighted avg       0.64      0.69      0.63       127

#Hyperparametro: recall_macro
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Melhores parametros:

{'criterion': 'entropy', 'max_depth': 110, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'best'}

Classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.77      0.90      0.83        87
           1       0.65      0.42      0.52        40

    accuracy                           0.75       127
   macro avg       0.71      0.66      0.67       127
weighted avg       0.73      0.75      0.73       127

#Hyperparametro: precision_macro
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Melhores parametros:

{'criterion': 'gini', 'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}

Classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.74      0.94      0.83        87
           1       0.69      0.28      0.39        40

    accuracy                           0.73       127
   macro avg       0.71      0.61      0.61       127
weighted avg       0.72      0.73      0.69       127

#Hyperparametro: accuracy
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Melhores parametros:

{'criterion': 'entropy', 'max_depth': 80, 'min_samples_leaf': 1, 'min_samples_split': 10, 'splitter': 'random'}

Classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

       

[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:    0.4s finished
