# Clasificación

In [1]:
# Tratamiento de datos
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Preprocesado y modelado
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from numpy import std
from sklearn.tree import export_graphviz
from sklearn.tree import export_text
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

# Algoritmos
# ==============================================================================
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Metricas de evaluación
# ==============================================================================
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


# Warnings configuration
# ------------------------------------------------------------------------------
import warnings
# Installs an 'ignore' filter with no category restriction, so warnings raised
# by the cells below are not displayed.
warnings.simplefilter(action='ignore')

In [2]:
# Relative path to the dataset CSV. Forward slashes are used as separators so
# the literal contains no backslash escape sequences (such as '\D') and the
# path also resolves on non-Windows systems.
dataset_dir = 'TG2022_Datos/Departamentos_CSV/Dataset_Final_DANE.csv'
df_dataset = pd.read_csv(dataset_dir, sep=',')  # the file is comma-separated
df_dataset  # display the loaded dataset

Unnamed: 0,Vulnerabilidad_numero,CASA,APTO,TIP_CUARTO,VIV_TRAD_INDIG,VIV_TRAD_ETNICA,OTRA_VIV,1-DOR,2-DOR,3-DOR,...,Tecnica_Profesional,Universitario,Posgrado,Ninguno,No_inf_escolaridad,No Aplica P_NIVEL_ANOSR,No informa ingresos,Recibe ingresos,No recibe ingresos,No Aplica ingresos
0,2.0,355252,514187,22005,70,94,1313,205922,300843,233206,...,245278,280569,82299,54515,35524,122409,38570,1155530,914201,264029
1,1.0,7891,522,92,1,0,7,2292,2451,1035,...,433,304,149,1230,143,923,235,7220,8060,2084
2,1.0,883,118,54,0,0,2,253,356,141,...,107,58,23,99,27,131,32,819,1024,284
3,1.0,1431,690,33,0,0,3,476,555,282,...,214,112,28,166,43,253,66,1456,1768,549
4,1.0,7892,3572,167,1,0,15,2749,3737,1910,...,1600,792,189,1400,128,1455,235,11472,11939,3175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,1.0,350,5,7,36,1,12,275,98,31,...,31,18,6,66,230,200,115,96,947,852
1098,1.0,4767,997,456,533,21,132,2873,2026,718,...,1322,1032,380,1927,767,2099,298,5712,8894,4884
1099,1.0,3096,71,129,13,1,31,1307,953,379,...,393,272,83,699,265,1005,102,2341,4446,2801
1100,1.0,1103,4,15,4,0,16,440,447,142,...,126,145,34,267,44,376,45,981,1847,1153


In [3]:
print(df_dataset.columns) # Print the column labels of the dataset

Index(['Vulnerabilidad_numero', 'CASA', 'APTO', 'TIP_CUARTO', 'VIV_TRAD_INDIG',
       'VIV_TRAD_ETNICA', 'OTRA_VIV', '1-DOR', '2-DOR', '3-DOR', '4-DOR',
       'Mas de 4 dor', 'No informa Num_dormit', '0-FALL', '1-FALL', '2-FALL',
       'Mas de 2 fall', '1-PER', '2-PER', '3-PER', '4-PER', 'Mas de 4 per',
       'Fall_Hombre', 'Fall_Mujer', 'No informa sexo fall',
       'No informa edad fall', 'Fall_P_Infancia', 'Fall_Infancia',
       'Fall_Juv-Adoles', 'Fall_Adultez', 'Fall_Per_Mayor', 'Per_Hombre',
       'Per_Mujer', 'Per_edad_0 a 4', 'Per_edad_5 a 9', 'Per_edad_10 a 14',
       'Per_edad_15 a 19', 'Per_edad_20 a 24', 'Per_edad_25 a 29',
       'Per_edad_30 a 34', 'Per_edad_35 a 39', 'Per_edad_40 a 44',
       'Per_edad_45 a 49', 'Per_edad_50 a 54', 'Per_edad_55 a 59',
       'Per_edad_Mas de 60', 'Indigena', 'Gitano o Rrom',
       'Raizal del Archi SAPSC', 'Palenquero de San Basilio',
       'Negro, mulato, afrodescendiente, afrocolombiano',
       'Ningun grupo etnico', 'No in

In [4]:
# Count the missing values in each column of the dataset.
df_dataset.isna().sum()

Vulnerabilidad_numero      0
CASA                       0
APTO                       0
TIP_CUARTO                 0
VIV_TRAD_INDIG             0
                          ..
No Aplica P_NIVEL_ANOSR    0
No informa ingresos        0
Recibe ingresos            0
No recibe ingresos         0
No Aplica ingresos         0
Length: 86, dtype: int64

In [5]:
# Discard the rows whose 'Vulnerabilidad_numero' value is missing and
# renumber the index from zero.
df_dataset = (
    df_dataset
    .dropna(subset=['Vulnerabilidad_numero'])
    .reset_index(drop=True)
)

In [6]:
# Keep only the rows whose vulnerability number is different from zero; per the
# original author's note, real vulnerability values lie between 1 and 5.
df_dataset = df_dataset[df_dataset["Vulnerabilidad_numero"].ne(0)]
df_dataset

Unnamed: 0,Vulnerabilidad_numero,CASA,APTO,TIP_CUARTO,VIV_TRAD_INDIG,VIV_TRAD_ETNICA,OTRA_VIV,1-DOR,2-DOR,3-DOR,...,Tecnica_Profesional,Universitario,Posgrado,Ninguno,No_inf_escolaridad,No Aplica P_NIVEL_ANOSR,No informa ingresos,Recibe ingresos,No recibe ingresos,No Aplica ingresos
0,2.0,355252,514187,22005,70,94,1313,205922,300843,233206,...,245278,280569,82299,54515,35524,122409,38570,1155530,914201,264029
1,1.0,7891,522,92,1,0,7,2292,2451,1035,...,433,304,149,1230,143,923,235,7220,8060,2084
2,1.0,883,118,54,0,0,2,253,356,141,...,107,58,23,99,27,131,32,819,1024,284
3,1.0,1431,690,33,0,0,3,476,555,282,...,214,112,28,166,43,253,66,1456,1768,549
4,1.0,7892,3572,167,1,0,15,2749,3737,1910,...,1600,792,189,1400,128,1455,235,11472,11939,3175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,1.0,350,5,7,36,1,12,275,98,31,...,31,18,6,66,230,200,115,96,947,852
1098,1.0,4767,997,456,533,21,132,2873,2026,718,...,1322,1032,380,1927,767,2099,298,5712,8894,4884
1099,1.0,3096,71,129,13,1,31,1307,953,379,...,393,272,83,699,265,1005,102,2341,4446,2801
1100,1.0,1103,4,15,4,0,16,440,447,142,...,126,145,34,267,44,376,45,981,1847,1153


In [7]:
# One-column frame holding only the vulnerability label.
df_dataset_v = df_dataset[["Vulnerabilidad_numero"]]
df_dataset_v

Unnamed: 0,Vulnerabilidad_numero
0,2.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
1097,1.0
1098,1.0
1099,1.0
1100,1.0


In [8]:
# Take every column except the first one (the vulnerability label), leaving
# only the feature columns.
df_dataset_norm = df_dataset.iloc[:, 1:]
df_dataset_norm

Unnamed: 0,CASA,APTO,TIP_CUARTO,VIV_TRAD_INDIG,VIV_TRAD_ETNICA,OTRA_VIV,1-DOR,2-DOR,3-DOR,4-DOR,...,Tecnica_Profesional,Universitario,Posgrado,Ninguno,No_inf_escolaridad,No Aplica P_NIVEL_ANOSR,No informa ingresos,Recibe ingresos,No recibe ingresos,No Aplica ingresos
0,355252,514187,22005,70,94,1313,205922,300843,233206,56595,...,245278,280569,82299,54515,35524,122409,38570,1155530,914201,264029
1,7891,522,92,1,0,7,2292,2451,1035,199,...,433,304,149,1230,143,923,235,7220,8060,2084
2,883,118,54,0,0,2,253,356,141,17,...,107,58,23,99,27,131,32,819,1024,284
3,1431,690,33,0,0,3,476,555,282,52,...,214,112,28,166,43,253,66,1456,1768,549
4,7892,3572,167,1,0,15,2749,3737,1910,376,...,1600,792,189,1400,128,1455,235,11472,11939,3175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,350,5,7,36,1,12,275,98,31,8,...,31,18,6,66,230,200,115,96,947,852
1098,4767,997,456,533,21,132,2873,2026,718,142,...,1322,1032,380,1927,767,2099,298,5712,8894,4884
1099,3096,71,129,13,1,31,1307,953,379,86,...,393,272,83,699,265,1005,102,2341,4446,2801
1100,1103,4,15,4,0,16,440,447,142,41,...,126,145,34,267,44,376,45,981,1847,1153


In [9]:
# Feature columns only: everything after the first column of df_dataset.
df = df_dataset.loc[:, df_dataset.columns[1:]]
# The features are normalized because their values are very large compared
# with the vulnerability column.
def minmax_norm(df_input):
    """Rescale every column of ``df_input`` to the [0, 1] range (min-max scaling).

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric frame to normalize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns, where each value is
        ``(x - column_min) / (column_max - column_min)``. A column whose values
        are all equal (zero range) is mapped to 0 rather than NaN.
    """
    # Operate on the argument itself, not on a same-purpose global variable.
    col_min = df_input.min()
    col_range = df_input.max() - col_min
    # Constant columns would produce 0/0 = NaN; dividing by 1 keeps them at 0.
    col_range = col_range.mask(col_range == 0, 1)
    return (df_input - col_min) / col_range

# Rebind df_dataset to the min-max normalized feature frame and display it.
df_dataset = minmax_norm(df_input=df)
df_dataset

Unnamed: 0,CASA,APTO,TIP_CUARTO,VIV_TRAD_INDIG,VIV_TRAD_ETNICA,OTRA_VIV,1-DOR,2-DOR,3-DOR,4-DOR,...,Tecnica_Profesional,Universitario,Posgrado,Ninguno,No_inf_escolaridad,No Aplica P_NIVEL_ANOSR,No informa ingresos,Recibe ingresos,No recibe ingresos,No Aplica ingresos
0,0.427150,0.334258,0.144799,0.002396,0.086477,0.503451,0.270745,0.315196,0.365386,0.509732,...,0.331295,2.311852e-01,0.197622,0.489310,0.219557,0.296936,0.270746,0.302846,0.391738,0.296344
1,0.009076,0.000339,0.000599,0.000034,0.000000,0.002684,0.002851,0.002466,0.001573,0.001756,...,0.000570,2.364986e-04,0.000348,0.010856,0.000853,0.002143,0.001608,0.001867,0.003260,0.002211
2,0.000642,0.000077,0.000349,0.000000,0.000000,0.000767,0.000168,0.000270,0.000172,0.000117,...,0.000130,3.378551e-05,0.000046,0.000700,0.000136,0.000221,0.000183,0.000190,0.000243,0.000190
3,0.001301,0.000449,0.000211,0.000000,0.000000,0.001150,0.000462,0.000479,0.000393,0.000432,...,0.000274,7.828350e-05,0.000058,0.001302,0.000235,0.000517,0.000421,0.000356,0.000562,0.000487
4,0.009077,0.002322,0.001092,0.000034,0.000000,0.005752,0.003452,0.003814,0.002944,0.003351,...,0.002146,6.386285e-04,0.000444,0.012382,0.000760,0.003434,0.001608,0.002982,0.004923,0.003436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,0.000000,0.000003,0.000039,0.001232,0.000920,0.004601,0.000197,0.000000,0.000000,0.000036,...,0.000027,8.240368e-07,0.000005,0.000404,0.001391,0.000388,0.000765,0.000000,0.000210,0.000828
1098,0.005316,0.000648,0.002994,0.018244,0.019319,0.050613,0.003615,0.002021,0.001077,0.001243,...,0.001771,8.363974e-04,0.000903,0.017114,0.004710,0.004996,0.002050,0.001472,0.003617,0.005355
1099,0.003305,0.000046,0.000842,0.000445,0.000920,0.011887,0.001555,0.000896,0.000545,0.000739,...,0.000516,2.101294e-04,0.000190,0.006088,0.001607,0.002342,0.000674,0.000588,0.001710,0.003016
1100,0.000906,0.000003,0.000092,0.000137,0.000000,0.006135,0.000414,0.000366,0.000174,0.000333,...,0.000155,1.054767e-04,0.000072,0.002209,0.000241,0.000815,0.000274,0.000232,0.000596,0.001166


In [10]:
# Build a new dataset: the vulnerability label column followed by the
# normalized feature columns, placed side by side.
df = pd.concat(objs=[df_dataset_v, df_dataset], axis="columns")
df

Unnamed: 0,Vulnerabilidad_numero,CASA,APTO,TIP_CUARTO,VIV_TRAD_INDIG,VIV_TRAD_ETNICA,OTRA_VIV,1-DOR,2-DOR,3-DOR,...,Tecnica_Profesional,Universitario,Posgrado,Ninguno,No_inf_escolaridad,No Aplica P_NIVEL_ANOSR,No informa ingresos,Recibe ingresos,No recibe ingresos,No Aplica ingresos
0,2.0,0.427150,0.334258,0.144799,0.002396,0.086477,0.503451,0.270745,0.315196,0.365386,...,0.331295,2.311852e-01,0.197622,0.489310,0.219557,0.296936,0.270746,0.302846,0.391738,0.296344
1,1.0,0.009076,0.000339,0.000599,0.000034,0.000000,0.002684,0.002851,0.002466,0.001573,...,0.000570,2.364986e-04,0.000348,0.010856,0.000853,0.002143,0.001608,0.001867,0.003260,0.002211
2,1.0,0.000642,0.000077,0.000349,0.000000,0.000000,0.000767,0.000168,0.000270,0.000172,...,0.000130,3.378551e-05,0.000046,0.000700,0.000136,0.000221,0.000183,0.000190,0.000243,0.000190
3,1.0,0.001301,0.000449,0.000211,0.000000,0.000000,0.001150,0.000462,0.000479,0.000393,...,0.000274,7.828350e-05,0.000058,0.001302,0.000235,0.000517,0.000421,0.000356,0.000562,0.000487
4,1.0,0.009077,0.002322,0.001092,0.000034,0.000000,0.005752,0.003452,0.003814,0.002944,...,0.002146,6.386285e-04,0.000444,0.012382,0.000760,0.003434,0.001608,0.002982,0.004923,0.003436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,1.0,0.000000,0.000003,0.000039,0.001232,0.000920,0.004601,0.000197,0.000000,0.000000,...,0.000027,8.240368e-07,0.000005,0.000404,0.001391,0.000388,0.000765,0.000000,0.000210,0.000828
1098,1.0,0.005316,0.000648,0.002994,0.018244,0.019319,0.050613,0.003615,0.002021,0.001077,...,0.001771,8.363974e-04,0.000903,0.017114,0.004710,0.004996,0.002050,0.001472,0.003617,0.005355
1099,1.0,0.003305,0.000046,0.000842,0.000445,0.000920,0.011887,0.001555,0.000896,0.000545,...,0.000516,2.101294e-04,0.000190,0.006088,0.001607,0.002342,0.000674,0.000588,0.001710,0.003016
1100,1.0,0.000906,0.000003,0.000092,0.000137,0.000000,0.006135,0.000414,0.000366,0.000174,...,0.000155,1.054767e-04,0.000072,0.002209,0.000241,0.000815,0.000274,0.000232,0.000596,0.001166


In [11]:
# Dependent variable: the vulnerability label.
y = df.loc[:, "Vulnerabilidad_numero"]
# Independent variables: every column after the first one.
X = df[df.columns[1:]]

In [12]:
# Split the data into train and test sets (could be 80-20 or 70-30; 70% of the
# rows go to training here, with a fixed seed for the shuffle).
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Vulnerabilidad_numero"]),
    df.loc[:, "Vulnerabilidad_numero"],
    train_size=0.7,
    random_state=123,
)

In [13]:
# Model 1: LinearDiscriminantAnalysis, fitted on the training split.
model_1 = LinearDiscriminantAnalysis(solver='svd', n_components=2).fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)           # predictions for the test split
resultado_1 = model_1.score(X_test, y_test)  # estimator's own score on the test split

# Test-split metrics; F1, precision and recall are macro-averaged.
print('F1: ', f1_score(y_test, y_pred_1, average='macro'))
acc = accuracy_score(y_test, y_pred_1)
prec = precision_score(y_test, y_pred_1, average='macro')
recall = recall_score(y_test, y_pred_1, average='macro')

# Validation method: stratified 10-fold cross-validation repeated 3 times,
# run with the same estimator on the complete X, y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_1, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print each evaluation metric; the last one is the mean of the CV accuracies.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("accuracy_score cv : ", scores.mean()),
):
    print(label, value)

F1:  0.23974358974358978
accuracy_score :  0.8972809667673716
precision_score :  0.2267507886435331
recall_score :  0.25950054288816504
accuracy_score cv :  0.8959459459459456


In [14]:
# Model 2: QuadraticDiscriminantAnalysis, fitted on the training split.
model_2 = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred_2 = model_2.predict(X_test)           # predictions for the test split
resultado_2 = model_2.score(X_test, y_test)  # estimator's own score on the test split

# Test-split metrics; F1, precision and recall are macro-averaged.
print('F1: ', f1_score(y_test, y_pred_2, average='macro'))
acc = accuracy_score(y_test, y_pred_2)
prec = precision_score(y_test, y_pred_2, average='macro')
recall = recall_score(y_test, y_pred_2, average='macro')

# Validation method: stratified 10-fold cross-validation repeated 3 times,
# run with the same estimator on the complete X, y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_2, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print each evaluation metric; the last one is the mean of the CV accuracies.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("accuracy_score cv : ", scores.mean()),
):
    print(label, value)

F1:  0.19247648902821318
accuracy_score :  0.9274924471299094
precision_score :  0.18549848942598188
recall_score :  0.2
accuracy_score cv :  0.9228746928746927


In [15]:
# Model 3: KNeighborsClassifier with default settings, fitted on the training split.
model_3 = KNeighborsClassifier().fit(X_train, y_train)
y_pred_3 = model_3.predict(X_test)           # predictions for the test split
resultado_3 = model_3.score(X_test, y_test)  # estimator's own score on the test split

# Test-split metrics; F1, precision and recall are macro-averaged.
print('F1: ', f1_score(y_test, y_pred_3, average='macro'))
acc = accuracy_score(y_test, y_pred_3)
prec = precision_score(y_test, y_pred_3, average='macro')
recall = recall_score(y_test, y_pred_3, average='macro')

# Validation method: stratified 10-fold cross-validation repeated 3 times,
# run with the same estimator on the complete X, y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_3, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print each evaluation metric; the last one is the mean of the CV accuracies.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("accuracy_score cv : ", scores.mean()),
):
    print(label, value)

F1:  0.19247648902821318
accuracy_score :  0.9274924471299094
precision_score :  0.18549848942598188
recall_score :  0.2
accuracy_score cv :  0.9216680316680315


In [16]:
# Model 4: DecisionTreeClassifier.
# A fixed random_state is passed so that the fitted tree, and therefore the
# test-split and cross-validation figures below, are the same on every run;
# without a seed the estimator is free to draw a different random state each time.
model_4 = DecisionTreeClassifier(random_state=123)

model_4.fit(X_train, y_train)                # train the model
y_pred_4 = model_4.predict(X_test)           # predictions for the test split
resultado_4 = model_4.score(X_test, y_test)  # estimator's own score on the test split
# Print the macro-averaged F1 metric on the test split
print('F1: ',f1_score(y_test, y_pred_4, average='macro'))

# Validation method: stratified 10-fold cross-validation repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate the same (seeded) estimator again with cross-validation on the
# complete X, y, and compute the test-split metrics (macro-averaged).
scores = cross_val_score(model_4, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
acc = accuracy_score(y_test, y_pred_4)
prec = precision_score(y_test, y_pred_4,average='macro')
recall = recall_score(y_test, y_pred_4,average='macro')

# Print each evaluation metric; the last one is the mean of the CV accuracies.
print("accuracy_score : ", acc)
print("precision_score : ", prec)
print("recall_score : ", recall)
print("accuracy_score cv :", np.mean(scores))

F1:  0.4318773053775966
accuracy_score :  0.8610271903323263
precision_score :  0.4343497934290773
recall_score :  0.4294692469821805
accuracy_score cv : 0.8545345345345345


In [17]:
# Model 5: GaussianNB, fitted on the training split.
model_5 = GaussianNB().fit(X_train, y_train)
y_pred_5 = model_5.predict(X_test)           # predictions for the test split
resultado_5 = model_5.score(X_test, y_test)  # estimator's own score on the test split

# Test-split metrics; F1, precision and recall are macro-averaged.
print('F1: ', f1_score(y_test, y_pred_5, average='macro'))
acc = accuracy_score(y_test, y_pred_5)
prec = precision_score(y_test, y_pred_5, average='macro')
recall = recall_score(y_test, y_pred_5, average='macro')

# Validation method: stratified 10-fold cross-validation repeated 3 times,
# run with the same estimator on the complete X, y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_5, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print each evaluation metric; the last one is the mean of the CV accuracies.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("accuracy_score cv : ", scores.mean()),
):
    print(label, value)

F1:  0.14151675792099214
accuracy_score :  0.18429003021148035
precision_score :  0.23138007441782685
recall_score :  0.5044772306316664
accuracy_score cv :  0.16573300573300573


In [18]:
# Model 6: SVC with default settings, fitted on the training split.
model_6 = SVC().fit(X_train, y_train)
y_pred_6 = model_6.predict(X_test)           # predictions for the test split
resultado_6 = model_6.score(X_test, y_test)  # estimator's own score on the test split

# Test-split metrics; F1, precision and recall are macro-averaged.
print('F1: ', f1_score(y_test, y_pred_6, average='macro'))
acc = accuracy_score(y_test, y_pred_6)
prec = precision_score(y_test, y_pred_6, average='macro')
recall = recall_score(y_test, y_pred_6, average='macro')

# Validation method: stratified 10-fold cross-validation repeated 3 times,
# run with the same estimator on the complete X, y.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model_6, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Print each evaluation metric; the last one is the mean of the CV accuracies.
for label, value in (
    ("accuracy_score : ", acc),
    ("precision_score : ", prec),
    ("recall_score : ", recall),
    ("accuracy_score cv : ", scores.mean()),
):
    print(label, value)

F1:  0.19247648902821318
accuracy_score :  0.9274924471299094
precision_score :  0.18549848942598188
recall_score :  0.2
accuracy_score cv :  0.9228746928746927
