In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../alzheimers_disease_data.csv')
df["Diagnosis"] = df["Diagnosis"].astype("bool") 

In [3]:
df["Diagnosis"]

0       False
1       False
2       False
3       False
4       False
        ...  
2144     True
2145     True
2146     True
2147     True
2148    False
Name: Diagnosis, Length: 2149, dtype: bool

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [5]:
df.drop('PatientID', axis=1, inplace=True)

In [6]:
df.drop('DoctorInCharge', axis=1, inplace=True)

In [7]:
x = df.drop('Diagnosis', axis=1)
y = df['Diagnosis']

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

In [9]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1719, 32), (430, 32), (1719,), (430,))

## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
rf = RandomForestClassifier()

In [12]:
rf.fit(x_train, y_train)

In [13]:
preds = rf.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score

In [15]:
accuracy_score(y_test, preds)

0.9279069767441861

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.92      0.98      0.95       282
        True       0.95      0.84      0.89       148

    accuracy                           0.93       430
   macro avg       0.93      0.91      0.92       430
weighted avg       0.93      0.93      0.93       430



- Recall: Nesse cenário, os alarmes falsos (FP) são de baixo custo, e os falsos negativos são muito caros. Portanto, faz sentido maximizar a recuperação ou a probabilidade de detecção. 
    - Essa é a métrica que queremos, pois é melhor priorizar a detecção de alzheimer mesmo dando alguns alarmes falsos (falso positivo) do que não detectar um alzheimer positivo (falso negativo)
    - Mas como da para ver, nosso recall é a métrica mais baixa agora pois está em 84%, isso era esperado visto que há muito mais caso negativo de alzheimer que positivo, logo o modelo está tendendo mais a dizer negativo, oque diminui o recall, por isso temos que mudar isso. 

In [18]:
y_test.value_counts()

Diagnosis
False    282
True     148
Name: count, dtype: int64

In [19]:
pd.Series(preds).value_counts()

False    299
True     131
Name: count, dtype: int64

In [20]:
print(confusion_matrix(y_test, preds))

[[275   7]
 [ 24 124]]


In [21]:
df['Diagnosis'].value_counts()

Diagnosis
False    1389
True      760
Name: count, dtype: int64

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
rfParametros = {"max_features": ["sqrt", "log2", None], "criterion": ["gini", "entropy", "log_loss"]}
gs = GridSearchCV(estimator=rf, param_grid=rfParametros, verbose=3, scoring="recall", cv=5)
gs.fit(x_train, y_train)
print(gs.best_estimator_)
preds2 = gs.best_estimator_.predict(x_test)
print(accuracy_score(y_test, preds2))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END .criterion=gini, max_features=sqrt;, score=0.828 total time=   1.3s
[CV 2/5] END .criterion=gini, max_features=sqrt;, score=0.803 total time=   1.4s
[CV 3/5] END .criterion=gini, max_features=sqrt;, score=0.902 total time=   1.4s
[CV 4/5] END .criterion=gini, max_features=sqrt;, score=0.821 total time=   1.3s
[CV 5/5] END .criterion=gini, max_features=sqrt;, score=0.852 total time=   1.3s
[CV 1/5] END .criterion=gini, max_features=log2;, score=0.844 total time=   1.3s
[CV 2/5] END .criterion=gini, max_features=log2;, score=0.869 total time=   1.3s
[CV 3/5] END .criterion=gini, max_features=log2;, score=0.886 total time=   1.3s
[CV 4/5] END .criterion=gini, max_features=log2;, score=0.813 total time=   1.3s
[CV 5/5] END .criterion=gini, max_features=log2;, score=0.828 total time=   1.3s
[CV 1/5] END .criterion=gini, max_features=None;, score=0.943 total time=   4.6s
[CV 2/5] END .criterion=gini, max_features=None;,

- Utilizado um GridSearchCV, para testar diferentes valores para 2 parâmetros diferentes, o max_features para determinar o número máximo de colunas usadas aleatoriamente, e o criterion que determina o tipo de calculo utilizado no Random Forest

In [24]:
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

       False       0.95      0.97      0.96       282
        True       0.94      0.91      0.93       148

    accuracy                           0.95       430
   macro avg       0.95      0.94      0.95       430
weighted avg       0.95      0.95      0.95       430



### Adicionando Oversampling

Para priorizar o Recall é necessário fazer um oversampling pois o dataset tende a dizer negativo (diagnóstico 0) pela quantidade de majoritária de 0 

In [25]:
# Bloco do Guilherme Cabreira
from imblearn.over_sampling import ADASYN

# Cria o objeto ADASYN
adasyn = ADASYN(random_state=42)

# Aplica o oversampling
X_resampled, y_resampled = adasyn.fit_resample(x, y)

# Verifica a nova distribuição das classes
print(y_resampled.value_counts())


Diagnosis
True     1466
False    1389
Name: count, dtype: int64


Foi testado a técnica de oversampling com SMOTE e ADASYN, para nosso caso o mais efetivo foi ADASYN

In [26]:
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=101)

In [27]:
gs.best_estimator_.fit(x_train_r, y_train_r)

preds3 = gs.best_estimator_.predict(x_test)

print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

       False       0.99      0.97      0.98       282
        True       0.94      0.97      0.96       148

    accuracy                           0.97       430
   macro avg       0.96      0.97      0.97       430
weighted avg       0.97      0.97      0.97       430



E com Oversampling conseguimos aumentar o recall, de 91% para 97%, em consequencia de diminuir um pouco a precisão, de 95% para 93%, justamente o que queriamos.

## KNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
knn = KNeighborsClassifier(n_neighbors=5)

In [30]:
knn.fit(x_train, y_train)

In [31]:
preds3 = knn.predict(x_test)

In [32]:
print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

       False       0.65      0.73      0.69       282
        True       0.33      0.26      0.29       148

    accuracy                           0.57       430
   macro avg       0.49      0.49      0.49       430
weighted avg       0.54      0.57      0.55       430



In [33]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x2 = scaler.fit_transform(x)

In [34]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y, test_size=0.2, random_state=101)

In [35]:
knn = KNeighborsClassifier(n_neighbors=3)

In [36]:
knn.fit(x_train2, y_train2)

In [37]:
preds4 = knn.predict(x_test2)

In [38]:
print(classification_report(y_test2, preds4))

              precision    recall  f1-score   support

       False       0.75      0.87      0.80       282
        True       0.63      0.44      0.52       148

    accuracy                           0.72       430
   macro avg       0.69      0.65      0.66       430
weighted avg       0.71      0.72      0.70       430



- Como da para ver, o KNN mesmo com normalização deu um resultado bem inferior ao random forest.