In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
SEED = 137 # semilla aleatoria para la reproducibilidad del análisis

In [3]:
# dataframe con los resultados de cada modelo
df_results = pd.DataFrame(columns=["model", "accuracy_score",
                                   "precision_score", "recall_score",
                                   "f1_score"])
file_save_data = "data/results_class.csv"

In [4]:
df = pd.read_csv("data/data_clean.csv", index_col=0)

In [5]:
df.head()

Unnamed: 0,numero_aviso,marca,codigo_irs,nombre,accion,accion_modelo,marca.1,linea,grupo,subgrupo,tipo_carroceria,anio_range
0,44023,HYUNDAI,03060003nndn,computadora,cambiar,cambiar,HYUNDAI,accent,3,6,sedan,2012 - 2009
1,44023,HYUNDAI,03030005nndn,cable ground de bateria,cambiar,cambiar,HYUNDAI,accent,3,3,sedan,2012 - 2009
2,17203,NISSAN,02090225nidn,pollera delantero izquierdo,cambiar,cambiar,NISSAN,FRONTIER,2,9,pickup,2016 - 2013
3,37133,KIA,03040016nidn,lampara delantero izquierdo,cambiar,reparar,KIA,rio,3,4,sedan,2020 - 2017
4,17434,HONDA,02090021nddn,bisagra derecho tapa motor,cambiar,cambiar,HONDA,CRV,2,9,camioneta,2016 - 2013


En esta etapa del análisis se descartaran a priori las variables con alta cardinalidad y que su correlación sea baja, teniendo en cuenta que correlación no significa causalidad, se realizaran las transformaciones requeridas en las variables categóricas y se realizaran las particiones de los datos en un 80% – 20% donde el 20% será para validación del modelo esta partición se realizara de manera aleatoria 

### Variables a descartar

In [6]:
df = df.drop(columns=["numero_aviso", # esta variable solo indica el identificador incidente
                      "marca", # esta variable tiene baja correlación y alta cardinalidad, es un identificador
                      "accion_modelo", # variable que vien de un módelo anterior
                      "linea", # es un identificador
                      "nombre" # esta variable tien mucha correlación con el codigo iris
                     ])

In [7]:
df.head()

Unnamed: 0,codigo_irs,accion,marca.1,grupo,subgrupo,tipo_carroceria,anio_range
0,03060003nndn,cambiar,HYUNDAI,3,6,sedan,2012 - 2009
1,03030005nndn,cambiar,HYUNDAI,3,3,sedan,2012 - 2009
2,02090225nidn,cambiar,NISSAN,2,9,pickup,2016 - 2013
3,03040016nidn,cambiar,KIA,3,4,sedan,2020 - 2017
4,02090021nddn,cambiar,HONDA,2,9,camioneta,2016 - 2013


In [8]:
df["grupo"] = df["grupo"].astype(str)
df["subgrupo"] = df["subgrupo"].astype(str)

In [9]:
y_temp = pd.get_dummies(df["accion"])

In [10]:
y = y_temp["cambiar"].values
y

array([1, 1, 1, ..., 1, 1, 0], dtype=uint8)

**Nota:** Se toma como variable objetivo la parte de cambiar donde 1 significa cambiar la pieza y 0 repararla esta será la notación de ahora en adelante para la interpretación del modelo

In [11]:
df = df.drop(columns=["accion"]) # ahora esta variable es denominada y

In [12]:
df_model = pd.get_dummies(df)

In [13]:
df_model.head()

Unnamed: 0,codigo_irs_01020001ennn,codigo_irs_01020003ennn,codigo_irs_01040001ndnn,codigo_irs_01040002ninn,codigo_irs_01040003nntn,codigo_irs_01040004nndn,codigo_irs_02010001nnns,codigo_irs_02010006nndn,codigo_irs_02010009iidn,codigo_irs_02010010iddn,...,tipo_carroceria_coupe,tipo_carroceria_pickup,tipo_carroceria_sedan,tipo_carroceria_utilitario,anio_range_2004 - atrás,anio_range_2008 - 2005,anio_range_2012 - 2009,anio_range_2016 - 2013,anio_range_2020 - 2017,anio_range_2024 - 2021
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


**Nota:** En esta iteración se toman todos los códigos de las partes para evaluar su desempeño como una primera iteración si el resultado supera el desempeño del modelo base se continuará con esta hipótesis de los contrario se realizara un reducción de dimensionalidad

In [14]:
X = df_model.values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=SEED)

In [16]:
# clasificador Gausiano

classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

df_results.loc[0] = ["GaussianNB",
                     accuracy_score(y_test, y_pred),
                     precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred),
                     f1_score(y_test, y_pred)]

df_results.to_csv(file_save_data)
df_results

Unnamed: 0,model,accuracy_score,precision_score,recall_score,f1_score
0,GaussianNB,0.447178,0.96648,0.261209,0.411266


In [17]:
# Regresión Logística
classifier = LogisticRegression(random_state=SEED,
                                max_iter=200)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

df_results.loc[1] = ["LogisticRegression",
                     accuracy_score(y_test, y_pred),
                     precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred),
                     f1_score(y_test, y_pred)]

df_results.to_csv(file_save_data)
df_results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,accuracy_score,precision_score,recall_score,f1_score
0,GaussianNB,0.447178,0.96648,0.261209,0.411266
1,LogisticRegression,0.859562,0.902158,0.908554,0.905344


In [18]:
# KNN
classifier = KNeighborsClassifier(n_neighbors=5,
                                  metric="minkowski",
                                  p=2)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

df_results.loc[2] = ["KNeighborsClassifier",
                     accuracy_score(y_test, y_pred),
                     precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred),
                     f1_score(y_test, y_pred)]

df_results.to_csv(file_save_data)
df_results

Unnamed: 0,model,accuracy_score,precision_score,recall_score,f1_score
0,GaussianNB,0.447178,0.96648,0.261209,0.411266
1,LogisticRegression,0.859562,0.902158,0.908554,0.905344
2,KNeighborsClassifier,0.820401,0.869521,0.890698,0.879982


In [19]:
# SVM k_lineal
classifier = SVC(kernel="linear",
                 random_state=SEED)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

df_results.loc[3] = ["SVM_k_lineal",
                     accuracy_score(y_test, y_pred),
                     precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred),
                     f1_score(y_test, y_pred)]

df_results.to_csv(file_save_data)
df_results

Unnamed: 0,model,accuracy_score,precision_score,recall_score,f1_score
0,GaussianNB,0.447178,0.96648,0.261209,0.411266
1,LogisticRegression,0.859562,0.902158,0.908554,0.905344
2,KNeighborsClassifier,0.820401,0.869521,0.890698,0.879982
3,SVM_k_lineal,0.857136,0.906409,0.899626,0.903005


In [None]:
# SVM k_sigmoid
classifier = SVC(kernel="sigmoid",
                 random_state=SEED)

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

df_results.loc[4] = ["SVM_sigmoid",
                     accuracy_score(y_test, y_pred),
                     precision_score(y_test, y_pred),
                     recall_score(y_test, y_pred),
                     f1_score(y_test, y_pred)]

df_results.to_csv(file_save_data)
df_results