In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, confusion_matrix, accuracy_score, recall_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from datetime import datetime
import os

In [None]:
# Fraction of rows held out by train_test_split in splitData; splitData also
# uses it as the relative size of each numbered partition of the training rows.
testSize = 0.2
# Seed passed as random_state to the row shuffle in preprocessingData and to
# train_test_split in splitData, so repeated runs produce the same split.
randomState = 0
# NOTE(review): not referenced in the visible cells — presumably the number of
# partitions to iterate over (0 meaning "plain hold-out split"); confirm
# against the cells that call splitData.
partitionsNumber = 0

In [None]:
def readData(filePath, delimiter, encoding, header):
  """Read a delimited text (CSV) file into a pandas DataFrame.

  Args:
    filePath: Path (or any source accepted by ``pd.read_csv``) of the file.
    delimiter: Field separator used in the file.
    encoding: Text encoding of the file.
    header: Row number to use as column names, or ``None`` for no header row.

  Returns:
    The DataFrame produced by ``pd.read_csv`` with the given options.
  """
  csvOptions = {"delimiter": delimiter, "encoding": encoding, "header": header}
  return pd.read_csv(filePath, **csvOptions)

In [None]:
def preprocessingData(data):
  """Shuffle, encode and scale the raw frame, then split it into features and target.

  No parsing or null filtering happens here: the frame is expected to arrive
  already parsed from its source. The steps are, in order:

  1. Shuffle the rows (seeded with the module-level ``randomState``).
  2. Drop the columns that are not model inputs.
  3. One-hot encode the categorical columns, passing every other column through.
  4. Min-max scale the numeric columns.
  5. Cast the ``hired`` target to int and separate it from the features.

  Args:
    data: DataFrame containing the dropped, categorical, numeric and ``hired``
      columns named below.

  Returns:
    A tuple ``(X, y)``: the feature DataFrame without ``hired`` and the
    ``hired`` Series.
  """
  droppedColumns = ["postulationDate", "candidateName"]
  oneHotColumns = [
    "primaryRole", "secondaryRole", "companyArea", "profileLevel",
    "residenceCountry", "channel", "lastWorkCenter", "lastWorkPosition",
    "studyCenterCountry", "studyCenterType", "studyCenterSector",
    "careerField", "careerStatus", "careerDegree",
  ]
  minMaxColumns = [
    "yearsOfExperience", "worksNumber", "studiesNumber", "technicalSkills",
    "languages", "anotherSkills", "references", "salary",
  ]
  targetColumn = "hired"

  # Shuffle all rows and renumber them; then discard the non-model columns.
  shuffled = data.sample(frac=1, random_state=randomState).reset_index(drop=True)
  modelFrame = shuffled.drop(columns=droppedColumns)

  # One-hot encode the categorical columns; the remaining columns pass through.
  # NOTE(review): `sparse=` and `get_feature_names()` belong to older
  # scikit-learn releases and appear to have been replaced by `sparse_output=`
  # and `get_feature_names_out()` — confirm against the pinned version before
  # upgrading.
  columnEncoder = make_column_transformer(
    (OneHotEncoder(sparse=False), oneHotColumns),
    remainder='passthrough',
  )
  encodedValues = columnEncoder.fit_transform(modelFrame)
  encodedFrame = pd.DataFrame(encodedValues, columns=columnEncoder.get_feature_names())

  # Rescale the numeric columns with a freshly fitted MinMaxScaler.
  encodedFrame[minMaxColumns] = MinMaxScaler().fit_transform(encodedFrame[minMaxColumns])

  # Normalise the target to an integer dtype before separating it out.
  encodedFrame[targetColumn] = encodedFrame[targetColumn].astype(int)

  features = encodedFrame.drop(columns=[targetColumn])
  target = encodedFrame[targetColumn]
  return features, target

In [None]:
def splitData(X, y, partitionNumber):
  """Split features and target into train and test sets.

  The data is first split with ``train_test_split`` using the module-level
  ``testSize`` and ``randomState``. With ``partitionNumber == 0`` that split is
  returned unchanged. Otherwise only the training portion is kept: its index is
  reset and a contiguous slice of it becomes the test set, with the rest kept
  as the training set. The slice for partition ``k`` (1-based) covers rows
  ``int(n*testSize*(k-1))`` up to, but excluding, ``int(n*testSize*k)``,
  where ``n`` is the number of training rows.

  Args:
    X: Feature DataFrame.
    y: Target Series aligned with ``X``.
    partitionNumber: 0 for the plain hold-out split, or the 1-based number of
      the training-set partition to use as test rows.

  Returns:
    A tuple ``(X_train, X_test, y_train, y_test)``.
  """
  holdOut = train_test_split(X, y, test_size=testSize, random_state=randomState)

  # Partition 0: hand back the plain hold-out split untouched.
  if partitionNumber == 0:
    X_train, X_test, y_train, y_test = holdOut
    return X_train, X_test, y_train, y_test

  # Only the training portion is re-partitioned; renumber it from 0 so that
  # row positions and index labels coincide.
  featurePool = holdOut[0].reset_index(drop=True)
  targetPool = holdOut[2].reset_index(drop=True)
  rowCount = len(featurePool)

  # Half-open range [foldStart, foldStop) of rows that form the test partition.
  foldStart = int(rowCount*testSize*(partitionNumber-1))
  foldStop = int(rowCount*testSize*partitionNumber)

  # Boolean mask marking the rows that belong to the selected partition.
  rowPositions = np.arange(rowCount)
  inFold = (rowPositions >= foldStart) & (rowPositions < foldStop)

  return featurePool[~inFold], featurePool[inFold], targetPool[~inFold], targetPool[inFold]

In [None]:
def createClassifier(modelName, parameters)