In [31]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, confusion_matrix, accuracy_score, recall_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
import datetime
import os
import json
import csv

In [32]:
# Parámetros
inputFolder = "1-input"
processFolder = "2-process"
outputFolder = "3-output"
logsFolder = "4-logs"

dataVisualizationTopLimit = 20

testSize = 0.25
randomState = 0
partitionsNumber = 0
samplingStrategy = 1
percentileNumber = 95

In [33]:
# Funciones utilitarias
def readData(filePath, delimiter, encoding, header):
  data = pd.read_csv(filePath, delimiter=delimiter,encoding=encoding, header=header)
  return data

def writeJson(data, pathJson, encoding='utf-8'):
  with open(pathJson, 'w', encoding=encoding) as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

def writeCsv(data, pathCsv, encoding='utf-8'):
  with open(pathCsv, 'w', newline='', encoding=encoding) as f:
    if data:
      writer = csv.DictWriter(f, fieldnames=data[0].keys(), lineterminator='\n')
      writer.writeheader()
      writer.writerows(data)
    else:
      f.write("")

In [34]:
def preprocessingData(df):
  # Aca no se realizará ni parseo ni nada, ya vendrá desde la fuente
  # Solo se leerá, se dividirá y se entrenará

  # Balanceo de datos: Sobremuestreo aleatorio (oversampling)
  # # De cada 2 no contratados habrá un contratado
  objectiveColumn = "hired"
  dictResults = dict(df[objectiveColumn].value_counts().sort_index())

  print(dictResults)

  maxKey = max(df)
  maxValue = max(dictResults.values())

  dfClassMaxKey = df[df[objectiveColumn] == maxKey]

  for key, value in dictResults.items():
    if key != maxKey:
      dfClass = df[df[objectiveColumn] == key]
      dfClassSampled = dfClass.sample(int(maxValue * samplingStrategy), random_state=randomState, replace=True)
      dfClassMaxKey = pd.concat([dfClassMaxKey, dfClassSampled],axis=0)

  df = dfClassMaxKey
  print(dict(df[objectiveColumn].value_counts().sort_index()))

  # Desordenando la data para evitar sesgos(filas)
  df = df.sample(frac = 1, random_state=randomState).reset_index(drop=True)  

  # Aplicando OrdinalEncoding a las variables categóricas ordinales()
  categoricalColumns = [columnName for columnName, columnType in df.dtypes.to_dict().items() if columnName not in [ "hired" ] and columnType == "object" ]
  categoricalOrdinalColumns = [columnName for columnName in categoricalColumns if columnName in [ "lastEducationStatus", "lastEducationDegree" ]]

  # OrdinalEncoder para la columna lastEducationStatus
  encoder = OrdinalEncoder(categories=[[ "Abandonado", "En Curso", "Graduado" ]])
  encoder.fit(df[["lastEducationStatus"]])
  df["lastEducationStatus"] = encoder.transform(df[["lastEducationStatus"]])

  # OrdinalEncoder para la columna lastEducationDegree
  encoder = OrdinalEncoder(categories=[[ "Otro", "Secundario", "Terciario/Tecnico", "Universitario", "Posgrado", "Master", "Doctorado" ]])
  encoder.fit(df[["lastEducationDegree"]])
  df["lastEducationDegree"] = encoder.transform(df[["lastEducationDegree"]])

  #display(df)

  # Aplicando OneHotEncoding a las variables categóricas cardinales (transformación a numéricas mediante columnas)
  categoricalCardinalColumns = [columnName for columnName in categoricalColumns if columnName not in [ "lastEducationStatus", "lastEducationDegree" ]]
  for column in categoricalCardinalColumns:
    dummies = pd.get_dummies(df[[column]], prefix=column, dummy_na=True)
    df = pd.concat([df, dummies], axis = 1)
    df = df.drop(columns=[column])

  #display(df.dtypes)

  # Aplicando MinMaxScaler a las variables numéricas (normalización) (esto tambien incluye a lastEducationStatus y lastEducationDegree, ya numéricas)
  # Algunas quedaran en 0.9999, esto porque no todas manejan la misma escala (sin decimales, o solo un decimal)
  numericalColumns = [columnName for columnName, columnType in df.dtypes.to_dict().items() if columnName not in [ "hired" ] and columnType == "float64" ]
  #print(numericalColumns)
  for column in numericalColumns:
    df[column] = df[column].fillna(0.0)

  mms = MinMaxScaler()
  df[numericalColumns] = mms.fit_transform(df[numericalColumns])

  # Leyendo el numero de atributos
  # 19146 filas, 12403 columnas con el drop_first=True
  # 19146 filas, 12415 columnas sin el drop_first=True

  # Revision maximos y minimos (todos si estan entre 0 y 1)
  """df.loc['max'] = df.max()
  df.loc['min']= df.min()
  maxValue, minValue = max(list(df.iloc[len(df)-2])), min(list(df.iloc[len(df)-1]))
  print(maxValue, minValue)
  df = df.drop(['max', 'min'], axis=0)"""

  # Eliminando columnas con varianza cercana a cero (variables no afectan en el resultado del modelo)
  df.loc['std'] = df.std()
  stdArray = df.iloc[len(df)-1]
  ninetyNinthPercentile = np.percentile(stdArray, percentileNumber)
  df = df.transpose()
  df = df[df["std"]>ninetyNinthPercentile]
  df = df.transpose()
  df = df.drop(['std'], axis=0)

  # Lectura de las variables de características y objetivo
  objectiveColumn = "hired"
  X = df.drop([objectiveColumn], axis=1)
  y = df[objectiveColumn]

  return X, y

In [35]:
def splitData(X, y, partitionNumber):
  # Dividiendo los dataframes de entrenamiento y prueba
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = testSize, random_state=randomState)

  if partitionNumber != 0:
    # Obteniendo el total de filas del dataframe de testeo
    X_train, X_test, y_train, y_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train.reset_index(drop=True), y_test.reset_index(drop=True)
    totalRows = len(X_train)

    # Determinando el límite inferior (primera fila) de la partición
    bottomLimit = int(totalRows*testSize*(partitionNumber-1)) + 1

    # Determinando el límite superior (última fila) de la partición
    topLimit = int(totalRows*testSize*partitionNumber)

    # Determinando una lista con todas los numeros de las filas con la partición
    testList = [x for x in range(bottomLimit-1, topLimit)]

    # Determinando los dataframes de las categorías
    X_train, X_test = X_train[~X_train.index.isin(testList)], X_train[X_train.index.isin(testList)]

    # Determinando los dataframes de los objetivos
    y_train, y_test = y_train[~y_train.index.isin(testList)], y_train[y_train.index.isin(testList)]

  return X_train, X_test, y_train, y_test

In [36]:
def createClassifier(modelName):
  if modelName == "KNN":
    return KNeighborsClassifier()
  elif modelName == "LR":
    return LogisticRegression(random_state=randomState)
  elif modelName == "GNB":
    return GaussianNB()
  elif modelName == "DT":
    return DecisionTreeClassifier(random_state=randomState)
  elif modelName == "SVM":
    return SVC(random_state=randomState)
  elif modelName == "RF":
    return RandomForestClassifier(random_state=randomState)
  elif modelName == "GB":
    return GradientBoostingClassifier(random_state=randomState)
  else:
    return KNeighborsClassifier()

In [37]:
def trainModel(X_train, X_test, y_train, y_test, modelName):
  # Creación del clasificador KNN
  clf = createClassifier(modelName)

  # Entrenamiento del clasificador KNN
  clf.fit(X_train, y_train)

  # Calculando la predicción del modelo con la data de prueba
  y_pred = clf.predict(X_test)

  return y_test, y_pred

In [38]:
def getMetrics(y_train, y_test, y_pred, startDate, endDate, partitionNumber, algorithm = "KNN"):
  trainRows = len(y_train)
  testRows = len(y_test)

  # Calculando la exactitud del modelo
  accuracy = "{:.2%}".format(accuracy_score(y_test, y_pred))

  # Calculando la precisión del modelo
  precision = "{:.2%}".format(precision_score(y_test, y_pred, average='micro'))

  # Calculando la sensibilidad del modelo
  recall = "{:.2%}".format(recall_score(y_test, y_pred, average='micro'))

  # Calculando el valor F del modelo (robustez)
  f1Score = "{:.2%}".format(f1_score(y_test, y_pred, average='micro'))

  # Calculando el tiempo de ejecución del modelo
  executionTime = str(int((endDate - startDate).total_seconds() * 1000)) + "ms"

  confussionMatrix = str(confusion_matrix(y_test, y_pred).tolist())
  
  return {
    "algoritmo": algorithm,
    "tipo": "Total de datos" if partitionNumber == 0 else "Particion " + str(partitionNumber),
    "registrosEntrenamiento": trainRows,
    "registrosPrueba": testRows,
    "proporcionSobremuestreo": samplingStrategy,
    "tiempoEjecucion": executionTime,
    "matrizConfusion": confussionMatrix,
    "exactitud": accuracy,
    "precision": precision,
    "sensibilidad": recall,
    "valorF": f1Score,
  }


In [39]:
def main():
  # Definiendo el inicio del proceso
  startTime = datetime.datetime.now()
  print("Inicio: " + str(startTime))
  print("Se inició el procesamiento")
  print()

  # Leyendo la data
  data = readData(os.path.join(inputFolder, "result.csv"), ',', 'utf-8', 0)

  # Determinando los dataframes de las categorías (X) y el objetivo (y)
  X, y = preprocessingData(data)

  # Creando el arreglo de metricas de cada partición
  metricsList = []

  # Iterando sobre cada partición
  for partitionNumber in range(0,partitionsNumber+1):
    # Separando data para el entrenamiento y testeo
    X_train, X_test, y_train, y_test = splitData(X, y, partitionNumber)

    models = ["KNN", "LR", "GNB", "DT", "SVM", "RF", "GB"]

    for model in models:
      print("Calculando para el algoritmo {}".format(model))

      # Inicio de ejecución del modelo
      startDate = datetime.datetime.now()
      print("Inicio: " + str(startDate))
      
      # Realizar entrenamiento del modelo
      y_test, y_pred = trainModel(X_train, X_test, y_train, y_test, model)

      # Fin de ejecución del modelo
      endDate = datetime.datetime.now()
      print("Fin: " + str(endDate))
      print("Tiempo: " + str(endDate-startDate))
      print()

      # Obteniendo las métricas de la partición del modelo
      metrics = getMetrics(y_train, y_test, y_pred, startDate, endDate, partitionNumber, model)

      # Añadiendo la métrica de la partición a la lista de métricas
      metricsList.append(metrics)

  # Ordenando las métricas
  metricsList = sorted(metricsList, key=lambda x: (x["algoritmo"], x["tipo"]))
    
  # Escribiendo las metricas en un archivo de salida
  outputFileDateTime = datetime.datetime.now()
  writeJson(metricsList, os.path.join(outputFolder, "json", outputFileDateTime.strftime("%Y-%m-%d %H-%M-%S") + ".json"))
  writeCsv(metricsList, os.path.join(outputFolder, "csv", outputFileDateTime.strftime("%Y-%m-%d %H-%M-%S") + ".csv"))

  # Definiendo el fin del proceso
  endTime = datetime.datetime.now()
  print("Fin: " + str(endTime))
  print("Tiempo: " + str(endTime-startTime))
  
  # Retornando la lista de métricas
  return metricsList

In [40]:
if __name__ == "__main__":
  main()

Inicio: 2023-05-25 03:11:48.658381
Se inició el procesamiento

{0: 2950, 1: 4015, 2: 2075, 3: 906, 4: 651}
{0: 4015, 1: 4015, 2: 4015, 3: 4015, 4: 4015}
Calculando para el algoritmo KNN
Inicio: 2023-05-25 03:11:52.480716
Fin: 2023-05-25 03:11:52.870136
Tiempo: 0:00:00.389420

Calculando para el algoritmo LR
Inicio: 2023-05-25 03:11:52.883092


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fin: 2023-05-25 03:11:53.642280
Tiempo: 0:00:00.759188

Calculando para el algoritmo GNB
Inicio: 2023-05-25 03:11:53.652249
Fin: 2023-05-25 03:11:53.856228
Tiempo: 0:00:00.203979

Calculando para el algoritmo DT
Inicio: 2023-05-25 03:11:53.866113
Fin: 2023-05-25 03:11:54.947519
Tiempo: 0:00:01.081406

Calculando para el algoritmo SVM
Inicio: 2023-05-25 03:11:54.957489
Fin: 2023-05-25 03:13:07.168525
Tiempo: 0:01:12.211036

Calculando para el algoritmo RF
Inicio: 2023-05-25 03:13:07.178495
Fin: 2023-05-25 03:13:12.198398
Tiempo: 0:00:05.019903

Calculando para el algoritmo GB
Inicio: 2023-05-25 03:13:12.208368
Fin: 2023-05-25 03:13:47.321969
Tiempo: 0:00:35.113601

Fin: 2023-05-25 03:13:47.331936
Tiempo: 0:01:58.673555
