In [50]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score,precision_score,recall_score,log_loss,confusion_matrix


from skopt import BayesSearchCV, dummy_minimize, gp_minimize
from lightgbm import LGBMClassifier



In [2]:
def cls_feature(X,max_cat = 10):
        #Agrupando features numéricas
    numeric_features  = [cls for cls in X.columns if X[cls].dtype == 'float64' or X[cls].dtype =='int64']
    # print('>>numeric_features:', numeric_features ,
          # len(numeric_features))


    #Agrupando features categoricas
    categorical_features = [cls for cls in X.columns if X[cls].dtypes == 'object' and X[cls].nunique()<=max_cat]
    # print('\n>>cat_features:', cat_features ,
          # len(cat_features))
    return numeric_features , categorical_featuresgp_minimize

In [3]:
def print_metrics(y_true, y_pred, y_pred_proba):
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    #f1 = f1_score(y_true, y_pred)
    logloss = log_loss(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred)

    print("Accuracy: {:.2f}".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    #print("F1-Score: {:.2f}".format(f1))
    print("Log Loss: {:.2f}".format(logloss))
    print("ROC AUC Score: {:.2f}".format(roc_auc))

In [4]:
def custom_features(df):
    df_out = df.copy()
    df_out['cat_empresa'] = df_out['ano_de_estabelecimento'].apply(lambda x: 'antiga' if x <= 1945 else
                                                                    'intermediaria' if 1945 < x < 1996 else
                                                                    'nova' if x >= 1996 else 'NaN')
    percentil_25 = np.percentile(df['salario_prevalecente'], 25)
    percentil_75 = np.percentile(df['salario_prevalecente'], 75)
    median = np.median(df['salario_prevalecente'])
    df_out['cat_salario'] = df_out['salario_prevalecente'].apply(lambda x: 'baixo' if x < percentil_25 else
                                                                'medio_baixo' if percentil_25 <= x < median else
                                                                'medio_alto' if median <= x < percentil_75 else
                                                                'alto' if x >= percentil_75 else 'NaN')
    
    # df_out['cat_salario'] = df_out['salario_prevalecente'].apply(lambda x: 'baixo' if x < np.percentile(df['salario_prevalecente'], 25) else
    #                                                                'medio_baixo' if np.percentile(df['salario_prevalecente'], 25) <= x < np.median(df['salario_prevalecente']) else
    #                                                                'medio_alto' if np.median(df['salario_prevalecente']) <= x < np.percentile(df['salario_prevalecente'], 75) else
    #                                                                'alto' if x >= np.percentile(df['salario_prevalecente'], 75) else 'NaN')
    df_out['cat_experiencia'] = 'Null'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'S') & (df_out['requer_treinamento_de_trabalho'] == 'S'), 'cat_experiencia'] = '1'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'S') & (df_out['requer_treinamento_de_trabalho'] == 'N'), 'cat_experiencia'] = '2'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'N') & (df_out['requer_treinamento_de_trabalho'] == 'S'), 'cat_experiencia'] = '3'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'N') & (df_out['requer_treinamento_de_trabalho'] == 'N'), 'cat_experiencia'] = '4'
    
    return df_out


## Data

In [6]:
data_train = pd.read_csv('data/train.csv'); data_train

Unnamed: 0,id_do_caso,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,status_do_caso
0,EZYV10567,Europa,Ensino Médio,N,S,2087,1855,Sul,69711.24,Ano,S,Negado
1,EZYV5505,Ásia,Mestrado,S,N,5991,2003,Meio-Oeste,52931.38,Ano,S,Aprovado
2,EZYV5207,Ásia,Ensino Médio,N,N,1426,2000,Ilha,110830.21,Ano,S,Negado
3,EZYV7545,Ásia,Ensino Médio,N,N,3846,1992,Meio-Oeste,91884.68,Semana,S,Negado
4,EZYV16071,Ásia,Ensino Superior,S,N,3957,1949,Nordeste,138155.24,Ano,S,Aprovado
...,...,...,...,...,...,...,...,...,...,...,...,...
17831,EZYV17821,Ásia,Mestrado,S,N,95347,1995,Oeste,69692.24,Ano,S,Aprovado
17832,EZYV13200,Europa,Ensino Médio,N,N,1483,1886,Sul,136237.62,Ano,S,Negado
17833,EZYV8903,América do Norte,Mestrado,N,N,1504,2010,Nordeste,118187.30,Ano,S,Negado
17834,EZYV6163,América do Norte,Doutorado,S,N,251967,2010,Meio-Oeste,93133.40,Ano,S,Aprovado


In [7]:
df = data_train.copy()

In [8]:
df.drop(columns=['id_do_caso'],inplace = True)

In [9]:
df.head()

Unnamed: 0,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,status_do_caso
0,Europa,Ensino Médio,N,S,2087,1855,Sul,69711.24,Ano,S,Negado
1,Ásia,Mestrado,S,N,5991,2003,Meio-Oeste,52931.38,Ano,S,Aprovado
2,Ásia,Ensino Médio,N,N,1426,2000,Ilha,110830.21,Ano,S,Negado
3,Ásia,Ensino Médio,N,N,3846,1992,Meio-Oeste,91884.68,Semana,S,Negado
4,Ásia,Ensino Superior,S,N,3957,1949,Nordeste,138155.24,Ano,S,Aprovado


## Start

In [10]:
df['status_do_caso'].value_counts()

Aprovado    11937
Negado       5899
Name: status_do_caso, dtype: int64

In [11]:
df['status_do_caso'] = df['status_do_caso'].apply(lambda x: 1 if x == 'Aprovado' else 0)

In [12]:
df['status_do_caso'].value_counts()

1    11937
0     5899
Name: status_do_caso, dtype: int64

In [13]:
df.dtypes

continente                         object
educacao_do_empregado              object
tem_experiencia_de_trabalho        object
requer_treinamento_de_trabalho     object
num_de_empregados                   int64
ano_de_estabelecimento              int64
regiao_de_emprego                  object
salario_prevalecente              float64
unidade_de_salario                 object
posicao_em_tempo_integral          object
status_do_caso                      int64
dtype: object

In [14]:
X = df.drop(columns=['status_do_caso'])
y = df['status_do_caso']

In [15]:
df['ano_de_estabelecimento'] = np.log1p(df['ano_de_estabelecimento'])

## Prepro

In [16]:
feature_engineering_transformer = FunctionTransformer(custom_features)

In [17]:
numeric_features , categorical_features = cls_feature(X)

In [18]:
numeric_features

['num_de_empregados', 'ano_de_estabelecimento', 'salario_prevalecente']

In [19]:
categorical_features

['continente',
 'educacao_do_empregado',
 'tem_experiencia_de_trabalho',
 'requer_treinamento_de_trabalho',
 'regiao_de_emprego',
 'unidade_de_salario',
 'posicao_em_tempo_integral']

In [20]:
categorical_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('enconder',OneHotEncoder(handle_unknown='ignore',sparse=False)),
    
])
numerical_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer([
        ("num", numerical_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
]
)

pipeline_fe = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor', preprocessor)])

In [21]:
X_preprocessed_fe = pipeline_fe.fit_transform(X)

### Grid Search

In [122]:
# # Dividir os dados em conjuntos de treinamento e teste
# X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

# # Definir os modelos
# models = {
#     'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
#     'XGBoost': XGBClassifier(random_state=42)
# }

# # Definir as grades de hiperparâmetros para cada modelo
# param_grids = {
#     'RandomForest': {
#         'n_estimators': [100, 200, 500],
#         'max_depth': [None, 10, 30],
#         'min_samples_split': [2, 5, 10],
#         'criterion': ['gini', 'entropy']
#     },
#     'XGBoost': {
#         'n_estimators': [100, 200, 500],
#         'learning_rate': [0.01, 0.1, 0.3],
#         'max_depth': [3, 6, 10]
#     }
# }

# # Validação cruzada com 3 folds
# cv = KFold(n_splits=3, shuffle=True, random_state=42)

# # Treinar e ajustar os modelos
# grids = {} # armazenará os modelos
# for model_name, model in models.items():
#     grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='f1', n_jobs=-1, verbose=2)
#     grids[model_name].fit(X_train, y_train)
#     best_params = grids[model_name].best_params_
#     best_score = grids[model_name].best_score_
    
#     # Prever os rótulos do conjunto de teste
#     y_pred = grids[model_name].predict(X_test)
    
#     # Calcular o F1 score para o conjunto de teste
#     f1 = f1_score(y_test, y_pred)
    
#     # Imprimir os resultados
#     print(f'Best parameters for {model_name}: {best_params}')
#     print(f'Best F1 score for {model_name}: {best_score}')
#     print(f'F1 score on test set for {model_name}: {f1}\n')

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters for RandomForest: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}
Best F1 score for RandomForest: 0.803983160450065
F1 score on test set for RandomForest: 0.8011928429423459

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
Best F1 score for XGBoost: 0.8237540956961373
F1 score on test set for XGBoost: 0.8294094944037051



### Random Forest
- Fitting 3 folds for each of 54 candidates, totalling 162 fits
- Best parameters for RandomForest: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}
- Best F1 score for RandomForest: 0.803983160450065
- F1 score on test set for RandomForest: 0.8011928429423459

### Xgboost
- Fitting 3 folds for each of 27 candidates, totalling 81 fits
- Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
- Best F1 score for XGBoost: 0.8237540956961373
- F1 score on test set for XGBoost: 0.8294094944037051

### Random Search

In [49]:
def train_model(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print(params,'\n')
    
    mdl = LGBMClassifier(learning_rate=learning_rate,num_leaves=num_leaves,min_child_samples=min_child_samples,
                         subsample=subsample,colsample_bytree=colsample_bytree,random_state=42,subsample_freq=1,n_estimators = 1000)
    mdl.fit(X_train,y_train)
    
    p = mdl.predict_proba(X_test)[:,1]
    
    return -roc_auc_score(y_test,p)


space = [(1e-3,1e-1,'log-uniform'), #learning_rate
        (2,128),# num_leaves
        (1,100),#min_child_samples
        (0.05,1.0),#subsample
        (0.1,1.0)]# colsample bytree

resultado = dummy_minimize(train_model,space,random_state=42,verbose = 1, n_calls = 30)

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 16, 72, 0.6187255599871848, 0.2404167763981929] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.4853
Function value obtained: -0.7750
Current minimum: -0.7750
Iteration No: 2 started. Evaluating function at random point.
[0.002051110418843398, 76, 75, 0.8728673384861885, 0.6410035105688879] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.4220
Function value obtained: -0.7810
Current minimum: -0.7810
Iteration No: 3 started. Evaluating function at random point.
[0.026070247583707674, 23, 53, 0.9714143595538948, 0.8491983767203797] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.7174
Function value obtained: -0.7689
Current minimum: -0.7810
Iteration No: 4 started. Evaluating function at random point.
[0.0026587543983272693, 65, 60, 0.22423428436076215, 0.373818018663584] 

Iteration No: 4 ended. Evaluation done at random point.
Time 

### Aprendizado Baysiano

In [None]:
def train_model(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print(params,'\n')
    
    mdl = LGBMClassifier(learning_rate=learning_rate,num_leaves=num_leaves,min_child_samples=min_child_samples,
                         subsample=subsample,colsample_bytree=colsample_bytree,random_state=42,subsample_freq=1,n_estimators = 1000)
    mdl.fit(X_train,y_train)
    
    p = mdl.predict_proba(X_test)[:,1]
    
    return -roc_auc_score(y_test,p)


space = [(1e-3,1e-1,'log-uniform'), #learning_rate
        (2,128),# num_leaves
        (1,100),#min_child_samples
        (0.05,1.0),#subsample
        (0.1,1.0)]# colsample bytree

resultado_gp =gp_minimize(train_model,space,random_state=42,verbose = 1, n_calls = 30,n_random_starts=15)

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 25, 78, 0.6170076500491628, 0.5012494775682321] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.7182
Function value obtained: -0.7630
Current minimum: -0.7630
Iteration No: 2 started. Evaluating function at random point.
[0.0015847101210439095, 60, 34, 0.18572347702584374, 0.6857996256539677] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.9590
Function value obtained: -0.7760
Current minimum: -0.7760
Iteration No: 3 started. Evaluating function at random point.
[0.00129665117537604, 93, 94, 0.05073982754896361, 0.992990403362096] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2518
Function value obtained: -0.7548
Current minimum: -0.7760
Iteration No: 4 started. Evaluating function at random point.
[0.017177621112338382, 79, 2, 0.07190930378934497, 0.5722971942325503] 

Iteration No: 4 ended. Evaluation done at random point.
Time 

In [44]:
from sklearn.metrics import f1_score
from skopt import dummy_minimize

def train_model(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print(params,'\n')
    
    mdl = LGBMClassifier(learning_rate=learning_rate,num_leaves=num_leaves,min_child_samples=min_child_samples,
                         subsample=subsample,colsample_bytree=colsample_bytree,random_state=42,subsample_freq=1,n_estimators = 1000)
    mdl.fit(X_train,y_train)
    
    y_pred = mdl.predict(X_test)
    
    return -f1_score(y_test, y_pred)


space = [(1e-3,1e-1,'log-uniform'), #learning_rate
        (2,128),# num_leaves
        (1,100),#min_child_samples
        (0.05,1.0),#subsample
        (0.1,1.0)]# colsample bytree

resultado = dummy_minimize(train_model, space, random_state=42, verbose=1, n_calls=30)


Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 16, 72, 0.6187255599871848, 0.2404167763981929] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5199
Function value obtained: -0.8187
Current minimum: -0.8187
Iteration No: 2 started. Evaluating function at random point.
[0.002051110418843398, 76, 75, 0.8728673384861885, 0.6410035105688879] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.4063
Function value obtained: -0.8263
Current minimum: -0.8263
Iteration No: 3 started. Evaluating function at random point.
[0.026070247583707674, 23, 53, 0.9714143595538948, 0.8491983767203797] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.5421
Function value obtained: -0.8180
Current minimum: -0.8263
Iteration No: 4 started. Evaluating function at random point.
[0.0026587543983272693, 65, 60, 0.22423428436076215, 0.373818018663584] 

Iteration No: 4 ended. Evaluation done at random point.
Time 

In [38]:
# Iteration No: 30 ended. Evaluation done at random point.
# Time taken: 0.1145
# Function value obtained: -0.7556
# Current minimum: -0.7816

In [41]:
resultado.x

[0.002051110418843398, 76, 75, 0.8728673384861885, 0.6410035105688879]

In [52]:
resultado_gp.x

[0.0014664313962953794, 32, 19, 0.9243083857787031, 0.9027599641139328]

In [53]:
mdl = LGBMClassifier(learning_rate=0.0014664313962953794,num_leaves=32,min_child_samples=19,
                     subsample=0.9243083857787031,colsample_bytree=0.9027599641139328,random_state=42,subsample_freq=1,n_estimators = 1000)
mdl.fit(X_train,y_train)

y_pred_bay = mdl.predict(X_test)

## Submitions

In [55]:
df_test.head()

Unnamed: 0,id_do_caso,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral
0,EZYV22339,Ásia,Ensino Superior,S,N,2414,1973,Nordeste,103320.8,Ano,N
1,EZYV9333,Ásia,Mestrado,N,S,2928,1937,Oeste,49786.35,Ano,S
2,EZYV9740,Ásia,Ensino Superior,S,N,1836,1999,Oeste,60855.83,Ano,S
3,EZYV3328,Ásia,Ensino Superior,S,N,122,2004,Sul,82379.06,Semana,S
4,EZYV12321,América do Sul,Ensino Superior,S,S,1720,2003,Nordeste,6719.81,Ano,S


In [91]:
def sub(nome_do_arquivo,modelo=mdl):
    mdl = modelo
    df_test = pd.read_csv('data/test.csv')
    df_test_processed = pipeline_fe.transform(df_test)
    df_test['status_do_caso'] = mdl.predict(df_test_processed)
    submit_respx = df_test[['id_do_caso', 'status_do_caso']]
    nome_arquivo = (f'submitions-'+str(nome_do_arquivo))
    submit_respx.to_csv(nome_arquivo+'.csv', index=False)
    return submit_respx
    

In [93]:
sub('litghgbm_model-10-48',mdl)

Unnamed: 0,id_do_caso,status_do_caso
0,EZYV22339,1
1,EZYV9333,1
2,EZYV9740,1
3,EZYV3328,1
4,EZYV12321,1
...,...,...
7639,EZYV19907,0
7640,EZYV576,0
7641,EZYV1804,1
7642,EZYV13543,1


In [97]:
sub('xboost_grid-2jun-10;50',grids['XGBoost'])

Unnamed: 0,id_do_caso,status_do_caso
0,EZYV22339,1
1,EZYV9333,1
2,EZYV9740,1
3,EZYV3328,1
4,EZYV12321,1
...,...,...
7639,EZYV19907,0
7640,EZYV576,0
7641,EZYV1804,1
7642,EZYV13543,1
