In [188]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, cross_val_predict
from sklearn.metrics import f1_score, precision_score, confusion_matrix, accuracy_score, recall_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
import datetime
import os
import json
import csv
import math

In [189]:
# Parámetros
inputFolder = "1-input"
processFolder = "2-process"
outputFolder = "3-output"
logsFolder = "4-logs"

inputMainFolder = inputFolder + r"\main"
inputAlgorithmsFolder = inputFolder + r"\algorithms"

dataVisualizationTopLimit = 20

testSize = 0.25
randomState = 0
partitionsNumber = 5
samplingStrategy = 0.2
percentileNumberStd = 90
percentileNumberCorrelation = 90

In [190]:
classifiers = {
  "K vecinos más cercanos": KNeighborsClassifier(),
  "Máquina de vectores de soporte": SVC(random_state=randomState),
  "Regresión logística": LogisticRegression(random_state=randomState, max_iter=200),
  "Naive bayes gaussiano": GaussianNB(),
  "Aumento de gradiente": GradientBoostingClassifier(random_state=randomState),
  "Árbol de decisión": DecisionTreeClassifier(random_state=randomState),
  "Bosque aleatorio": RandomForestClassifier(random_state=randomState),
}

In [191]:
# Funciones utilitarias
def readJson(path, encoding='utf-8', errors=None):
  with open (path, "r", encoding=encoding, errors=errors) as f:
    data = json.loads(f.read())
  return data

def writeJson(data, pathJson, encoding='utf-8'):
  with open(pathJson, 'w', encoding=encoding) as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

def readCsvAsDf(filePath, delimiter, encoding, header):
  data = pd.read_csv(filePath, delimiter=delimiter,encoding=encoding, header=header)
  return data

def writeCsv(data, pathCsv, encoding='utf-8'):
  with open(pathCsv, 'w', newline='', encoding=encoding) as f:
    if data:
      writer = csv.DictWriter(f, fieldnames=data[0].keys(), lineterminator='\n')
      writer.writeheader()
      writer.writerows(data)
    else:
      f.write("")

In [192]:
def preprocessingData(inputPath):
  # Leyendo datos
  df = readCsvAsDf(inputPath, ',', 'utf-8', 0)

  # Balanceo de datos: Sobremuestreo aleatorio (oversampling) (1 de cada 5)
  objectiveColumn = "contratado"
  dictResults = dict(df[objectiveColumn].value_counts().sort_index())

  maxKey = max(dictResults, key=dictResults.get)
  maxValue = max(dictResults.values())

  dfClassMaxKey = df[df[objectiveColumn] == maxKey]

  for key, value in dictResults.items():
    if key != maxKey:
      dfClass = df[df[objectiveColumn] == key]
      dfClassSampled = dfClass.sample(int(maxValue * samplingStrategy), random_state=randomState, replace=True)
      dfClassMaxKey = pd.concat([dfClassMaxKey, dfClassSampled],axis=0)

  df = dfClassMaxKey

  # Aleatorizacion del orden de los registros para evitar sesgos(filas)
  df = df.sample(frac = 1, random_state=randomState).reset_index(drop=True)

  # Aplicando OrdinalEncoding a las variables categóricas ordinales()
  encoder = OrdinalEncoder(categories=[[ "Abandonado", "En Curso", "Graduado" ]])
  encoder.fit(df[["estadoUltimoEstudio"]])
  df["estadoUltimoEstudio"] = encoder.transform(df[["estadoUltimoEstudio"]])
  encoder = OrdinalEncoder(categories=[[ "Otro", "Secundario", "Terciario/Tecnico", "Universitario", "Posgrado", "Master", "Doctorado" ]])
  encoder.fit(df[["gradoUltimoEstudio"]])
  df["gradoUltimoEstudio"] = encoder.transform(df[["gradoUltimoEstudio"]])

  # Aplicando OneHotEncoding a las variables categóricas cardinales (transformación a numéricas mediante columnas)
  categoricalColumns = [columnName for columnName, columnType in df.dtypes.to_dict().items() if columnName not in [ "contratado" ] and columnType == "object" ]
  categoricalCardinalColumns = [columnName for columnName in categoricalColumns if columnName not in [ "estadoUltimoEstudio", "gradoUltimoEstudio" ]]
  for column in categoricalCardinalColumns:
    dummies = pd.get_dummies(df[[column]], prefix=column, dummy_na=True)
    df = pd.concat([df, dummies], axis = 1)
    df = df.drop(columns=[column])

  # Aplicando MinMaxScaler a las variables numéricas (normalización) (esto tambien incluye a lastEducationStatus y lastEducationDegree, ya numéricas)
  # Algunas quedaran en 0.9999, esto porque no todas manejan la misma escala (sin decimales, o solo un decimal)
  numericalColumns = [columnName for columnName, columnType in df.dtypes.to_dict().items() if columnName not in [ "contratado" ] and columnType == "float64" ]
  for column in numericalColumns:
    df[column] = df[column].fillna(0.0)
  mms = MinMaxScaler()
  df[numericalColumns] = mms.fit_transform(df[numericalColumns])

  # Eliminando columnas con varianza cercana a cero, dejando el 10% de columnas con mayor varianza (variables no afectan en el resultado del modelo)
  df.loc['std'] = df.std()
  stdArray = df.iloc[len(df)-1]
  nthPercentileStd = np.percentile(stdArray, percentileNumberStd)
  df = df.transpose()
  df = df[df["std"]>nthPercentileStd]
  df = df.transpose()
  df = df.drop(['std'], axis=0)

  # Eliminando columnas con correlación cercana a uno, dejando el 90% de columnas con menor correlación
  correlationMatrix = df.corr().abs()
  correlationMatrix[correlationMatrix == 1.0] = 0.0
  maxCorrelationValues = [max(correlationMatrix[column]) for column in correlationMatrix.columns]
  nthPercentileCorrelation = np.percentile(maxCorrelationValues, percentileNumberCorrelation)
  highCorrelationColumns = [column for column in correlationMatrix.columns if max(correlationMatrix[column]) > nthPercentileCorrelation]
  df = df.drop(highCorrelationColumns, axis=1)

  # Regresando el dataframe a array de dicts
  preprocessedData = df.to_dict('records')

  #writeJson(preprocessedData, os.path.join(processFolder, 'result.json'), 'utf-8') #Pesa mucho y ni se usa
  writeCsv(preprocessedData, os.path.join(processFolder, 'result.csv'), 'utf-8')

  return df

In [193]:
def splitData(X, y):
  # Dividiendo los dataframes de entrenamiento y prueba
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = testSize, random_state=randomState)

  return X_train, X_test, y_train, y_test

In [194]:
def trainModel(X_train, X_test, y_train, y_test, algorithm):
  # Creación del clasificador
  clf = classifiers[algorithm]

  # Entrenamiento del clasificador
  clf.fit(X_train, y_train)

  # Calculando la predicción del modelo con la data de prueba
  y_pred = clf.predict(X_test)

  return y_test, y_pred

In [195]:
def getMetrics(y_train, y_test, y_pred, startDate, endDate, algorithm, partitionNumber):
  trainRows = len(y_train)
  testRows = len(y_test)

  # Calculando la exactitud del modelo
  accuracy = accuracy_score(y_test, y_pred)

  # Calculando la precisión del modelo
  precision = precision_score(y_test, y_pred)

  # Calculando la sensibilidad del modelo
  recall = recall_score(y_test, y_pred)

  # Calculando el valor F del modelo (robustez)
  f1Score = f1_score(y_test, y_pred)

  # Calculando el promedio de métricas
  metricsList = [accuracy, precision, recall, f1Score]
  metricsMean = sum(metricsList) / len(metricsList)

  # Calculando el tiempo de ejecución del modelo
  executionTime = (endDate - startDate).total_seconds()
  formatExecutionTime = "{:.2f}".format(executionTime) + "s"
  formatAverageTime = "{:.2f}".format(executionTime*1000/(trainRows + testRows)) + "ms"

  # Obteniendo la matriz de confusión
  confussionMatrix = str(confusion_matrix(y_test, y_pred).tolist())
  
  return {
    "algoritmo": algorithm,
    "particion": "Partición {}".format(partitionNumber),
    "registrosEntrenamiento": trainRows,
    "registrosPrueba": testRows,
    "proporcionSobremuestreo": samplingStrategy,
    "tiempoEjecucion": formatExecutionTime,
    "matrizConfusion": confussionMatrix,
    "exactitud": "{:.2%}".format(accuracy),
    "precision": "{:.2%}".format(precision),
    "sensibilidad": "{:.2%}".format(recall),
    "robustez": "{:.2%}".format(f1Score),
    "promedioMetricas": "{:.2%}".format(metricsMean),
    "tiempoPromedio": formatAverageTime
  }

In [196]:
def evaluation(df, algorithm):
  # Lectura de las variables de características y objetivo
  objectiveColumn = "contratado"
  X = df.drop([objectiveColumn], axis=1)
  y = df[objectiveColumn]

  # Obteniendo el clasificador
  clf = classifiers[algorithm]

  # Definiendo el numero de folds
  kFolds = KFold(partitionsNumber)

  # Definiendo las metricas a retornar
  # El test_score por defecto es el accuracy
  scoring = ["accuracy", "precision", "recall", "f1"]

  # Realizando el cross validation
  y_pred = cross_val_predict(clf, X, y, cv=kFolds)

  print(len(y_pred))
  print(y_pred)
  #crossValidateMetrics = cross_validate(clf, X, y, cv=kFolds, scoring=scoring)

  # Creando el arreglo de metricas de cada algoritmo
  partitionsMetricsList = []

  """for partitionNumber in range(0, partitionsNumber):
    trainRows = 100
    testRows = 10

    # Calculando la exactitud del modelo
    accuracy = crossValidateMetrics["test_accuracy"][partitionNumber]

    # Calculando la precisión del modelo
    precision = crossValidateMetrics["test_precision"][partitionNumber]

    # Calculando la sensibilidad del modelo
    recall = crossValidateMetrics["test_recall"][partitionNumber]

    # Calculando el valor F del modelo (robustez)
    f1Score = crossValidateMetrics["test_f1"][partitionNumber]

    # Calculando el promedio de métricas
    metricsList = [accuracy, precision, recall, f1Score]
    metricsMean = sum(metricsList) / len(metricsList)

    # Calculando el tiempo de ejecución del modelo
    executionTime = crossValidateMetrics["fit_time"][partitionNumber]
    formatExecutionTime = "{:.2f}".format(executionTime) + "s"
    formatAverageTime = "{:.2f}".format(executionTime*1000/(trainRows + testRows)) + "ms"

    # Obteniendo la matriz de confusión
    #confussionMatrix = str(confusion_matrix(y_test, y_pred).tolist())

    partitionMetrics =  {
      "algoritmo": algorithm,
      "particion": "Partición {}".format(partitionNumber+1),
      #"registrosEntrenamiento": trainRows,
      #"registrosPrueba": testRows,
      "proporcionSobremuestreo": samplingStrategy,
      "tiempoEjecucion": formatExecutionTime,
      #"matrizConfusion": confussionMatrix,
      "exactitud": "{:.2%}".format(accuracy),
      "precision": "{:.2%}".format(precision),
      "sensibilidad": "{:.2%}".format(recall),
      "robustez": "{:.2%}".format(f1Score),
      "promedioMetricas": "{:.2%}".format(metricsMean),
      "tiempoPromedio": formatAverageTime
    }

    partitionsMetricsList.append(partitionMetrics)"""

  writeJson(partitionsMetricsList, os.path.join(outputFolder, 'result.json'), 'utf-8')
  writeCsv(partitionsMetricsList, os.path.join(outputFolder, 'result.csv'), 'utf-8')

  return partitionsMetricsList

In [197]:
def main():
  # Definiendo el inicio del proceso
  startTime = datetime.datetime.now()
  print("Inicio: " + str(startTime))

  isPreprocessed = True
  isEvaluated = False

  # Preprocesando los datos
  print("Preprocesando datos")
  df = readCsvAsDf(os.path.join(processFolder, "result.csv"), ",", "utf-8", 0) if isPreprocessed else preprocessingData(os.path.join(inputMainFolder, "result.csv"))

  # Obteniendo la lista de métricas del modelado
  print("Obteniendo lista de métricas del modelado")
  algorithmsMetricsList = readJson(os.path.join(inputAlgorithmsFolder, 'result.json'))

  # Elegir el algoritmo con mayor promedio de métricas
  maxAverageAlgorithm = max(algorithmsMetricsList, key=lambda x:x["promedioMetricas"])

  print("Algoritmo con mayor promedio de métricas: {}".format(maxAverageAlgorithm["algoritmo"]))

  # Evaluando el modelo
  print("Evaluando el modelo")
  partitionsMetricsList = readJson(os.path.join(outputFolder, 'result.json')) if isEvaluated else evaluation(df, maxAverageAlgorithm["algoritmo"])

  """# Elegir la partición con mayor promedio de métricas
  maxAveragePartition = max(partitionsMetricsList, key=lambda x:x["promedioMetricas"])

  print("Partición con mayor promedio de métricas: {}".format(maxAveragePartition["particion"]))
  print("Promedio de métricas: {}".format(maxAveragePartition["promedioMetricas"]))
  print("Exactitud: {}".format(maxAveragePartition["exactitud"]))
  print("Precisión: {}".format(maxAveragePartition["precision"]))
  print("Sensibilidad: {}".format(maxAveragePartition["sensibilidad"]))
  print("Robustez: {}".format(maxAveragePartition["robustez"]))
  print("Tiempo promedio: {}".format(maxAveragePartition["tiempoPromedio"]))"""

  # Definiendo el fin del proceso
  endTime = datetime.datetime.now()
  print("Fin: " + str(endTime))
  print("Tiempo: " + str(endTime-startTime))

In [198]:
if __name__ == "__main__":
  main()

Inicio: 2023-06-07 12:34:27.114695
Preprocesando datos
Obteniendo lista de métricas del modelado
Algoritmo con mayor promedio de métricas: Bosque aleatorio
Evaluando el modelo
12069
Fin: 2023-06-07 12:34:47.674433
Tiempo: 0:00:20.559738
