# Exercícios: Classificação

In [None]:
import os

import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

Para estes exercícios, será usado o conjunto de dados [Credit Card Customers](https://www.kaggle.com/sakshigoyal7/credit-card-customers):

In [None]:
# Prefer a local '../data/' directory when it exists; otherwise fall back to the GitHub-hosted copy.
local_data_dir = '../data/'
remote_data_dir = 'https://raw.githubusercontent.com/TheAwesomeGe/DECD/main/data/'
data_path = local_data_dir if os.path.exists(local_data_dir) else remote_data_dir

# Read the workbook, using the CLIENTNUM column as the row index.
xlsx_file_path = f'{data_path}BankChurners.xlsx'
df = pd.read_excel(xlsx_file_path, index_col='CLIENTNUM')

1. Se não o fez antes, dedique algum tempo a rever e a compreender as várias colunas do conjunto de dados, tendo também em conta a descrição que pode encontrar no [website](https://www.kaggle.com/sakshigoyal7/credit-card-customers). Em seguida, prepare os dados de forma a que estes não incluam valores em falta, o atributo `Attrition_Flag` seja o alvo da classificação, e os restantes atributos sejam numéricos.

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Note: For simplicity, rows with missing values are simply discarded.
# The placeholder string 'Unknown' is mapped to NaN first so that dropna() removes those rows too.
df = df.replace('Unknown', np.nan).dropna()

In [None]:
# Separate the classification target (as a categorical series) from the predictor columns.
# NOTE(review): some distributions of this dataset include precomputed
# `Naive_Bayes_Classifier_*` columns derived from the target — confirm the workbook
# loaded here does not contain them, otherwise they would leak the label into X.
target_column = 'Attrition_Flag'
y = df[target_column].astype('category')
X = df.drop(columns=[target_column])

In [None]:
# Mark the text-valued attributes as categorical.
for column in ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']:
    X[column] = X[column].astype('category')

# Replace every categorical column by integer codes assigned in order of first appearance.
# Not the best approach for ordinal features: the codes do not follow the natural order of the levels.
cat_columns = X.select_dtypes(['category']).columns
for column in cat_columns:
    X[column] = pd.factorize(X[column])[0]

2. Particione o conjunto de dados de forma a que 80% seja para treino e 20% para teste.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

3. Usando uma abordagem de validação cruzada sobre o conjunto de treino, explore o uso de diferentes algoritmos de classificação e analise as diferenças entre os resultados obtidos.

In [None]:
# Candidate classifiers, keyed by the name used in the result tables.
# NOTE: The hyperparameters are not optimized.
# Estimators with internal randomness get a fixed seed so that re-running the notebook
# reproduces the same cross-validation scores (and therefore the same "best" model).
classifiers = {
    'kNN': KNeighborsClassifier(n_neighbors=3),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(16,), random_state=42)
}

# 5-fold cross-validation on the training set only: every training example is predicted by a
# model that did not see it, and the per-classifier report is stored as a nested dict.
cv_results = {}
for c_name, c in classifiers.items():
    pred = cross_val_predict(c, X_train, y_train, cv=5)
    cv_results[c_name] = classification_report(y_train, pred, target_names=y.cat.categories, output_dict=True)

In [None]:
# Keep only the headline figures of each cross-validation report.
selected_metrics = {}
for name, report in cv_results.items():
    selected_metrics[name] = {
        'accuracy': report['accuracy'],
        'macro f1': report['macro avg']['f1-score'],
        'weighted f1': report['weighted avg']['f1-score'],
    }

# One row per classifier, one column per metric.
cv_selected_results = pd.DataFrame.from_dict(selected_metrics, orient='index')
cv_selected_results

4. Treine uma árvore de decisão no conjunto de treino e identifique os atributos mais importantes visualizando a árvore obtida.

In [None]:
# Fit the decision tree from the `classifiers` dict on the full training set
# (fit() returns the estimator itself, so `dt` is that same object).
dt = classifiers['Decision Tree'].fit(X_train, y_train)

# Draw only the top of the tree (max_depth=1): the splits closest to the root are the ones
# the tree considered most informative. Feature and class names are passed as plain lists
# because some scikit-learn releases reject a pandas Index / NumPy array for these
# parameters; an explicit, larger axes keeps the node labels readable.
fig, ax = plt.subplots(figsize=(12, 6))
plot_tree(
    dt,
    feature_names=list(X_train.columns),
    class_names=list(dt.classes_),
    impurity=False,
    rounded=True,
    filled=True,
    max_depth=1,
    ax=ax,
);

5. Entre as abordagens exploradas no ponto 3, selecione a melhor com base na taxa de acerto. Use essa abordagem para treinar um classificador no conjunto de treino. Avalie o desempenho desse classificador no conjunto de teste. A taxa de acerto parece-lhe a métrica mais adequada para este problema?

In [None]:
# Rank the classifiers by cross-validated accuracy and take the name at the top.
accuracy_ranking = cv_selected_results.sort_values(by='accuracy', ascending=False)
best = accuracy_ranking.index[0]
print(best)

# Fit the selected estimator on the whole training set.
best_classifier = classifiers[best]
best_classifier.fit(X_train, y_train)

In [None]:
# Predict the held-out test set with the selected classifier.
test_predictions = best_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test set.
class_labels = y.cat.categories
print(classification_report(y_test, test_predictions, target_names=class_labels))

# Confusion matrix with rows and columns in the order of the target's categories.
cm = confusion_matrix(y_test, test_predictions, labels=class_labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
cm_display.plot()
plt.show()

6. Analise os exemplos do conjunto de teste para os quais o classificador faz previsões incorretas e verifique se existe algum padrão. 

In [None]:
# Test features plus two extra columns: the true label and the predicted label.
df_predictions = X_test.assign(**{y_test.name: y_test, 'Prediction': test_predictions})

# Keep only the rows where the prediction disagrees with the true label.
is_wrong = y_test != test_predictions
wrong_predictions = df_predictions[is_wrong]
wrong_predictions

7. Repare que o conjunto de dados não é balanceado. Faça um balanceamento do conjunto de **treino**, reduzindo (subamostragem) o número de exemplos da classe `Existing Customer` (**Sugestão**: use a função [`sample`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html)). Repita as experiências e analise o impacto do balanceamento dos dados de treino. 

In [None]:
# Undersample the training set so that every class has as many examples as the rarest one.
min_samples = y_train.value_counts().min()

# Draw `min_samples` labels per class without replacement. The fixed seed makes the
# balanced subset (and everything trained on it) reproducible across runs.
balanced_y = pd.concat([
    y_train[y_train == label].sample(min_samples, replace=False, random_state=42)
    for label in y.cat.categories
])

# Select the feature rows that belong to the sampled labels.
# NOTE(review): assumes the index values are unique — confirm, since duplicated labels
# would make .loc return extra rows.
balanced_X = X_train.loc[balanced_y.index]

In [None]:
# Fresh, unfitted estimators for the balanced-data experiment, keyed by display name.
# NOTE: The hyperparameters are not optimized.
# Estimators with internal randomness get a fixed seed so that re-running the notebook
# reproduces the same scores.
balanced_classifiers = {
    'kNN': KNeighborsClassifier(n_neighbors=3),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'SVM': SVC(),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(16,), random_state=42)
}

In [None]:
# 5-fold cross-validation of every candidate on the balanced training data;
# the report of each classifier is stored as a nested dict.
balanced_cv_results = {}
for model_name, model in balanced_classifiers.items():
    fold_predictions = cross_val_predict(model, balanced_X, balanced_y, cv=5)
    balanced_cv_results[model_name] = classification_report(
        balanced_y,
        fold_predictions,
        target_names=y.cat.categories,
        output_dict=True,
    )

In [None]:
# Keep only the headline figures of each balanced cross-validation report.
balanced_selected_metrics = {}
for model_name, model_report in balanced_cv_results.items():
    macro_avg = model_report['macro avg']
    weighted_avg = model_report['weighted avg']
    balanced_selected_metrics[model_name] = {
        'accuracy': model_report['accuracy'],
        'macro f1': macro_avg['f1-score'],
        'weighted f1': weighted_avg['f1-score'],
    }

# One row per classifier, one column per metric.
balanced_cv_selected_results = pd.DataFrame.from_dict(balanced_selected_metrics, orient='index')
balanced_cv_selected_results

In [None]:
# Pick the classifier with the highest cross-validated accuracy on the balanced data.
balanced_best = balanced_cv_selected_results.sort_values('accuracy', ascending=False).index[0]

# Report and use `balanced_best` — not `best`, which is the winner of the earlier
# (unbalanced) experiment and may name a different classifier.
print(balanced_best)
balanced_best_classifier = balanced_classifiers[balanced_best]
balanced_best_classifier.fit(balanced_X, balanced_y)

In [None]:
# Predict the (untouched) test set with the model trained on the balanced data.
balanced_test_predictions = balanced_best_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test set.
class_labels = y.cat.categories
print(classification_report(y_test, balanced_test_predictions, target_names=class_labels))

# Confusion matrix with rows and columns in the order of the target's categories.
cm = confusion_matrix(y_test, balanced_test_predictions, labels=class_labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
cm_display.plot()
plt.show()

In [None]:
# Test features plus two extra columns: the true label and the balanced model's prediction.
balanced_df_predictions = X_test.assign(
    **{y_test.name: y_test, 'Prediction': balanced_test_predictions}
)

# Keep only the rows where the prediction disagrees with the true label.
balanced_is_wrong = y_test != balanced_test_predictions
balanced_wrong_predictions = balanced_df_predictions[balanced_is_wrong]
balanced_wrong_predictions