In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score
)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from google.colab import drive

##a

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

In [None]:
# Location of the loan dataset (CSV) inside the mounted Drive folder.
url = '/content/drive/MyDrive/Ciencia_de_Datos/Tarea3/loan_data.csv'
# Load the raw data; `df` is the untouched frame the rest of the notebook derives from.
df = pd.read_csv(url)

##b

In [None]:
# One-hot encode `df`. No column list is given, so get_dummies chooses the
# columns to encode itself; every generated dummy is named "purpose_<value>".
# All other options are left at their pandas defaults.
ndf = pd.get_dummies(df, prefix='purpose')

In [None]:
# Print a summary of the encoded frame (DataFrame.info) to inspect the result of the encoding.
ndf.info()

In [None]:
# Name of the target column; every other column of `ndf` is used as a feature.
objetivo = 'not.fully.paid'

y = ndf[objetivo]
X = ndf.drop(columns=objetivo)

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=125,
)

##c

In [None]:
# Attach the label to a copy of the training features so rows can be split by class.
copia_train = X_train.assign(clase=y_train)

# Feature rows of each class, with the helper label column removed again.
train0 = copia_train.loc[copia_train['clase'] == 0].drop(columns='clase')
train1 = copia_train.loc[copia_train['clase'] == 1].drop(columns='clase')

# Per-class sample means and covariance matrices (ddof=0: divide by N, not N-1).
mean0, mean1 = train0.mean(), train1.mean()
covariance_matrix0, covariance_matrix1 = train0.cov(ddof=0), train1.cov(ddof=0)

# Fraction of training rows carrying each label.
fraction_class0 = (y_train == 0).mean()
fraction_class1 = (y_train == 1).mean()


def _imprimir_resumen(titulo, media, proporcion):
    """Print the heading, the per-feature sample mean and the sample proportion of one class."""
    print(titulo)
    print(f'La media muestral de todas las variables es:\n\n{media}')
    print(f'Proporción muestral: {proporcion}')


_imprimir_resumen('Clase 1:', mean0, fraction_class0)

print('--------------------------------------------')

_imprimir_resumen('Clase 2:', mean1, fraction_class1)

ACLARACIÓN: No imprimí las matrices de covarianza en la celda anterior porque estas son 18x18.

In [None]:
# Baseline: Gaussian Naive Bayes trained on every feature column.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-set metrics. F1 is weighted across both classes; recall and precision
# refer to the positive class (label 1).
# The name `accuray` is kept as spelled because later cells read it.
accuray = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, pos_label=1)
precision = precision_score(y_test, y_pred, pos_label=1)

for etiqueta, valor in (
    ('Accuracy:', accuray),
    ('F1 Score:', f1),
    ('Recall :', recall),
    ('Precision :', precision),
):
    print(etiqueta, valor)

In [None]:
# Confusion matrix of the current predictions on the test set.
cm = confusion_matrix(y_test, y_pred)

# Tick labels, in the order passed to the display: label 0 first, then label 1.
labels = ["Fully Paid", "Not fully Paid"]
disp = ConfusionMatrixDisplay(cm, display_labels=labels)
disp.plot()

##d

In [None]:
# The attribute list is written out by hand (instead of taken from ndf.columns)
# so that all the purpose dummies are represented by the single entry "purpose";
# this gives the results table one row per original attribute.
# "none" is the baseline row: nothing is removed.
columnas = [
    "none", "credit.policy", "int.rate", "installment", "log.annual.inc", "dti",
    "fico", "days.with.cr.line", "revol.bal", "revol.util", "inq.last.6mths",
    "delinq.2yrs", "pub.rec", "purpose"
]

# Dummy columns that stand for the "purpose" attribute.
# NOTE(review): only six dummies are listed. If the encoded frame contains a
# further purpose dummy (e.g. "purpose_all_other"), the "purpose" row does not
# remove it — confirm against X_train.columns.
purposes = [
    "purpose_credit_card",
    "purpose_debt_consolidation", "purpose_educational", "purpose_home_improvement",
    "purpose_major_purchase", "purpose_small_business"
]


def _metricas_sin_columnas(columnas_eliminadas):
    """Train GaussianNB without the given columns and score it on the test set.

    Parameters
    ----------
    columnas_eliminadas : list of str
        Column names to leave out. Names not present in the data are ignored.

    Returns
    -------
    tuple
        (accuracy, recall for label 1, precision for label 1, weighted F1).
    """
    # Each mask is built from the frame it filters, so train and test are
    # pruned by column name and never by the other frame's column order.
    X_trainModified = X_train.loc[:, ~X_train.columns.isin(columnas_eliminadas)]
    X_testModified = X_test.loc[:, ~X_test.columns.isin(columnas_eliminadas)]

    predicciones = GaussianNB().fit(X_trainModified, y_train).predict(X_testModified)

    return (
        accuracy_score(y_test, predicciones),
        recall_score(y_test, predicciones, pos_label=1),
        precision_score(y_test, predicciones, pos_label=1),
        f1_score(y_test, predicciones, average="weighted"),
    )


accuracys = []
recalls = []
precisions = []
f1_scores = []

for c in columnas:
    # Translate the table entry into the concrete columns to remove.
    if c == "none":
        # Baseline is recomputed here instead of reusing `f1`/`recall`/...,
        # which other cells overwrite; this keeps the cell re-runnable.
        columnas_eliminadas = []
    elif c == "purpose":
        columnas_eliminadas = purposes
    else:
        columnas_eliminadas = [c]

    acc, rec, prec, f1w = _metricas_sin_columnas(columnas_eliminadas)
    accuracys.append(acc)
    recalls.append(rec)
    precisions.append(prec)
    f1_scores.append(f1w)


# One row per entry of `columnas`, in the same order.
data = {
    'col.eliminated' : columnas,
    'accuracy' : accuracys,
    'recall' : recalls,
    'precision' : precisions,
    'f1-score' : f1_scores
}

tabla = pd.DataFrame(data)

In [None]:
# Show the ablation results: one row per removed attribute with its test metrics.
print(tabla)

##e

In [None]:
def ajustar_data(Columnas):
    """Train GaussianNB without the given columns and score it on the test set.

    Reads the notebook-level `X_train`, `X_test`, `y_train`, `y_test` and
    `purposes`.

    Parameters
    ----------
    Columnas : list of str
        Column names to remove. The special entry 'purpose' additionally
        removes every column listed in `purposes`. Names that are not columns
        of the data are ignored.

    Returns
    -------
    tuple
        (recall_score, weighted f1_score) of the predictions on the test set.
    """
    columnas_eliminadas = list(Columnas)
    if 'purpose' in columnas_eliminadas:
        # 'purpose' is shorthand for the whole group of purpose dummies.
        columnas_eliminadas.extend(purposes)

    # Each mask is built from the frame it filters, so train and test are
    # pruned by column name and never by the other frame's column order.
    X_modified = X_train.loc[:, ~X_train.columns.isin(columnas_eliminadas)]
    X_testM = X_test.loc[:, ~X_test.columns.isin(columnas_eliminadas)]

    model = GaussianNB()
    model.fit(X_modified, y_train)
    y_pred = model.predict(X_testM)

    return recall_score(y_test, y_pred), f1_score(y_test, y_pred, average="weighted")

In [None]:
# Candidate sets of attributes to remove. In C1-C10 the entry 'purpose' (when
# present) is shorthand that ajustar_data expands into the columns of `purposes`.
C1 = ['revol.bal', 'revol.util']
C2 = ['revol.bal', 'revol.util', 'inq.last.6mths']
C3 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'purpose']
C4 = ['revol.bal', 'inq.last.6mths', 'purpose']
C5 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'int.rate']
C6 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'log.annual.inc']
C7 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'int.rate', 'delinq.2yrs']
C8 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'int.rate', 'delinq.2yrs', 'dti']
C9 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'int.rate', 'delinq.2yrs', 'dti', 'days.with.cr.line']
C10 = ['revol.bal', 'revol.util', 'inq.last.6mths', 'int.rate', 'delinq.2yrs', 'dti', 'days.with.cr.line', 'fico']

# C11-C16 share a common numeric prefix and name the purpose dummies explicitly.
_comunes = ['revol.bal', 'revol.util', 'delinq.2yrs', 'days.with.cr.line', 'log.annual.inc']
_dummies_purpose = [
    "purpose_credit_card", "purpose_debt_consolidation", "purpose_educational",
    "purpose_home_improvement", "purpose_major_purchase", "purpose_small_business",
]

C11 = _comunes + _dummies_purpose
C12 = _comunes + ['pub.rec'] + _dummies_purpose
C13 = _comunes + ['pub.rec', 'int.rate'] + _dummies_purpose
C14 = _comunes + ['pub.rec', 'installment'] + _dummies_purpose
C15 = _comunes + ['pub.rec', 'installment', 'inq.last.6mths'] + _dummies_purpose
# NOTE(review): 'inq.last.6mths' is listed twice, so C16 removes exactly the
# same columns as C15; presumably a different attribute was intended — confirm.
C16 = _comunes + ['pub.rec', 'installment', 'inq.last.6mths', 'inq.last.6mths'] + _dummies_purpose

Cs = [C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16]
for i in Cs:
    # Count the columns that are really removed from X_train: expand 'purpose'
    # the same way ajustar_data does, and count each column once even if the
    # list repeats it.
    columnas_eliminadas = list(i) + (purposes if 'purpose' in i else [])
    n_eliminadas = int(X_train.columns.isin(columnas_eliminadas).sum())
    print(ajustar_data(i), '# Columnas eliminadas :', n_eliminadas)

In [None]:
# Based on the tests run above, the attributes of the list printed below (C14) can be
# removed while still obtaining a good classification, even improving the recall score.
# NOTE(review): this comment originally named list C9, but the code prints C14 — confirm which list was intended.

print('Los atributos eliminados son : \n', C14)

In [None]:
# Final model: remove the attributes listed in C14 and retrain.
# Each mask is built from the frame it filters, so train and test are pruned
# by column name and never by the other frame's column order.
X_podado = X_train.loc[:, ~X_train.columns.isin(C14)]
X_test_podado = X_test.loc[:, ~X_test.columns.isin(C14)]
print(f'Las columnas que conserva el dataset son: {list(X_podado.columns)} \n')

model = GaussianNB()
model.fit(X_podado, y_train)
y_pred = model.predict(X_test_podado)

# Test-set metrics of the pruned model. F1 is weighted across both classes;
# precision and recall refer to the positive class (label 1).
f1 = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)

print('Las metricas que se obtienen de entrenar al modelo solo con los atributos nombrados arriba son: ')
# Print `accuracy`, the value computed just above for this pruned model
# (not the similarly named baseline variable `accuray`).
print('Accuracy:', accuracy)
print('F1 Score:', f1)
print('Recall :', recall)
print('Precision :', precision)


# Confusion matrix of the pruned model; labels are given in the order label 0, label 1.
labels = ["Fully Paid", "Not fully Paid"]
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)

print('La matriz de confusón es:')

disp.plot()

##f

La conclusión a la que llego con el ítem (e) es que a veces conviene reducir la dimensionalidad de los datos, eliminando atributos redundantes, o incluso entorpecedores, a la hora de entrenar el modelo de clasificación. Lo que más me sorprendió es que, reduciendo la dimensionalidad de los datos, obtuve resultados notoriamente mejores (¡y eso que eliminé la mayoría de los atributos del dataset original!). Antes de esto pensaba que reduciendo la dimensionalidad se podían obtener, como mucho, resultados iguales a los anteriores. Esto me parece súper útil para futuros programas y trabajos.
