In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score, jaccard_score, log_loss, confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeRegressor
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [32]:
def cls_feature(X,max_cat = 10):
        #Agrupando features numéricas
    numeric_features  = [cls for cls in X.columns if X[cls].dtype == 'float64' or X[cls].dtype =='int64']
    # print('>>numeric_features:', numeric_features ,
          # len(numeric_features))


    #Agrupando features categoricas
    categorical_features = [cls for cls in X.columns if X[cls].dtypes == 'object' and X[cls].nunique()<=max_cat]
    # print('\n>>cat_features:', cat_features ,
          # len(cat_features))
    return numeric_features , categorical_features

In [33]:
def print_metrics(y_true, y_pred, y_pred_proba):
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    #f1 = f1_score(y_true, y_pred)
    logloss = log_loss(y_true, y_pred_proba)
    roc_auc = roc_auc_score(y_true, y_pred)

    print("Accuracy: {:.2f}".format(accuracy))
    print("Precision: {:.2f}".format(precision))
    print("Recall: {:.2f}".format(recall))
    #print("F1-Score: {:.2f}".format(f1))
    print("Log Loss: {:.2f}".format(logloss))
    print("ROC AUC Score: {:.2f}".format(roc_auc))

In [34]:
def custom_features(df):
    df_out = df.copy()
    df_out['cat_empresa'] = df_out['ano_de_estabelecimento'].apply(lambda x: 'antiga' if x <= 1945 else
                                                                    'intermediaria' if 1945 < x < 1996 else
                                                                    'nova' if x >= 1996 else 'NaN')
    percentil_25 = np.percentile(df['salario_prevalecente'], 25)
    percentil_75 = np.percentile(df['salario_prevalecente'], 75)
    median = np.median(df['salario_prevalecente'])
    df_out['cat_salario'] = df_out['salario_prevalecente'].apply(lambda x: 'baixo' if x < percentil_25 else
                                                                'medio_baixo' if percentil_25 <= x < median else
                                                                'medio_alto' if median <= x < percentil_75 else
                                                                'alto' if x >= percentil_75 else 'NaN')
    
    # df_out['cat_salario'] = df_out['salario_prevalecente'].apply(lambda x: 'baixo' if x < np.percentile(df['salario_prevalecente'], 25) else
    #                                                                'medio_baixo' if np.percentile(df['salario_prevalecente'], 25) <= x < np.median(df['salario_prevalecente']) else
    #                                                                'medio_alto' if np.median(df['salario_prevalecente']) <= x < np.percentile(df['salario_prevalecente'], 75) else
    #                                                                'alto' if x >= np.percentile(df['salario_prevalecente'], 75) else 'NaN')
    df_out['cat_experiencia'] = 'Null'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'S') & (df_out['requer_treinamento_de_trabalho'] == 'S'), 'cat_experiencia'] = '1'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'S') & (df_out['requer_treinamento_de_trabalho'] == 'N'), 'cat_experiencia'] = '2'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'N') & (df_out['requer_treinamento_de_trabalho'] == 'S'), 'cat_experiencia'] = '3'
    df_out.loc[(df_out['tem_experiencia_de_trabalho'] == 'N') & (df_out['requer_treinamento_de_trabalho'] == 'N'), 'cat_experiencia'] = '4'
    
    return df_out


## Data

In [35]:
data_train = pd.read_csv('data/train.csv'); data_train.head()

Unnamed: 0,id_do_caso,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,status_do_caso
0,EZYV10567,Europa,Ensino Médio,N,S,2087,1855,Sul,69711.24,Ano,S,Negado
1,EZYV5505,Ásia,Mestrado,S,N,5991,2003,Meio-Oeste,52931.38,Ano,S,Aprovado
2,EZYV5207,Ásia,Ensino Médio,N,N,1426,2000,Ilha,110830.21,Ano,S,Negado
3,EZYV7545,Ásia,Ensino Médio,N,N,3846,1992,Meio-Oeste,91884.68,Semana,S,Negado
4,EZYV16071,Ásia,Ensino Superior,S,N,3957,1949,Nordeste,138155.24,Ano,S,Aprovado


In [36]:
df = data_train.copy()

In [37]:
df.drop(columns=['id_do_caso'],inplace = True)

In [38]:
df.head()

Unnamed: 0,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,status_do_caso
0,Europa,Ensino Médio,N,S,2087,1855,Sul,69711.24,Ano,S,Negado
1,Ásia,Mestrado,S,N,5991,2003,Meio-Oeste,52931.38,Ano,S,Aprovado
2,Ásia,Ensino Médio,N,N,1426,2000,Ilha,110830.21,Ano,S,Negado
3,Ásia,Ensino Médio,N,N,3846,1992,Meio-Oeste,91884.68,Semana,S,Negado
4,Ásia,Ensino Superior,S,N,3957,1949,Nordeste,138155.24,Ano,S,Aprovado


## Start

In [39]:
df['status_do_caso'].value_counts()

Aprovado    11937
Negado       5899
Name: status_do_caso, dtype: int64

In [40]:
df['status_do_caso'] = df['status_do_caso'].apply(lambda x: 1 if x == 'Aprovado' else 0)

In [41]:
df['status_do_caso'].value_counts()

1    11937
0     5899
Name: status_do_caso, dtype: int64

In [42]:
df.dtypes

continente                         object
educacao_do_empregado              object
tem_experiencia_de_trabalho        object
requer_treinamento_de_trabalho     object
num_de_empregados                   int64
ano_de_estabelecimento              int64
regiao_de_emprego                  object
salario_prevalecente              float64
unidade_de_salario                 object
posicao_em_tempo_integral          object
status_do_caso                      int64
dtype: object

In [43]:
X = df.drop(columns=['status_do_caso'])
y = df['status_do_caso']

In [44]:
df['ano_de_estabelecimento'] = np.log1p(df['ano_de_estabelecimento'])

## Prepro

In [45]:
feature_engineering_transformer = FunctionTransformer(custom_features)

In [46]:
numeric_features , categorical_features = cls_feature(X)

In [47]:
numeric_features

['num_de_empregados', 'ano_de_estabelecimento', 'salario_prevalecente']

In [48]:
categorical_features

['continente',
 'educacao_do_empregado',
 'tem_experiencia_de_trabalho',
 'requer_treinamento_de_trabalho',
 'regiao_de_emprego',
 'unidade_de_salario',
 'posicao_em_tempo_integral']

In [49]:
from sklearn.decomposition import PCA

# Obtenha o número mínimo de componentes principais com base nas dimensões do conjunto de dados
n_components = 2

categorical_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse=False)),
])

numerical_transformer = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Adicione o estágio de PCA ao pipeline
pca_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=n_components))
])

preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Adicione o estágio de PCA ao preprocessor
preprocessor_with_pca = ColumnTransformer([
    ("num", pca_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

pipeline_fe = Pipeline(steps=[
    ('fe', feature_engineering_transformer),
    ('preprocessor', preprocessor_with_pca)
])


In [50]:
X_preprocessed_fe = pipeline_fe.fit_transform(X)

In [60]:
# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

# Definir os modelos
models = {
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(random_state=42,class_weight='balanced'),
    'LGBM': LGBMClassifier(random_state=42,class_weight = 'balanced')
}

# Definir as grades de hiperparâmetros para cada modelo
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200, 500,1000],
        'max_depth': [None, 10, 30],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    },
    'XGBoost': {
        'n_estimators': [100, 200, 500,1000],
        'learning_rate': [0.001, 0.1, 0.3],
        'max_depth': [3, 6, 10],
        'num_leaves':[2,16,64,128],
        'min_child_samples':[1,40,70,100],
        
    },
    'LGBM':{
        'learning_rate': [0.001, 0.1, 0.3],
        'num_leaves':[2,16,64,128],
        'min_child_samples':[1,40,70,100],
        
        
    }
}

# Validação cruzada com 3 folds
cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Treinar e ajustar os modelos
grids = {} # armazenará os modelos
for model_name, model in models.items():
    grids[model_name] = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=cv, scoring='f1', n_jobs=-1, verbose=1)
    grids[model_name].fit(X_train, y_train)
    best_params = grids[model_name].best_params_
    best_score = grids[model_name].best_score_
    
    # Prever os rótulos do conjunto de teste
    y_pred = grids[model_name].predict(X_test)
    
    # Calcular o F1 score para o conjunto de teste
    f1 = f1_score(y_test, y_pred)
    
    # Imprimir os resultados
    print(f'Best parameters for {model_name}: {best_params}')
    print(f'Best F1 score for {model_name}: {best_score}')
    print(f'F1 score on test set for {model_name}: {f1}\n')

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters for RandomForest: {'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 1000}
Best F1 score for RandomForest: 0.7956325006081763
F1 score on test set for RandomForest: 0.7970983571580968

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used.

Parameters: { "class_weight", "min_child_samples", "num_leaves" } are not used

KeyboardInterrupt: 

In [59]:
from skopt import gp_minimize

def train_model_lgbm(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print(params,'\n')
    
    mdl = LGBMClassifier(learning_rate=learning_rate, num_leaves=num_leaves, min_child_samples=min_child_samples,
                         subsample=subsample, colsample_bytree=colsample_bytree, random_state=42, subsample_freq=1, n_estimators=1000)
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_test)[:,1]
    
    return -roc_auc_score(y_test, p)

def train_model_xgboost(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print(params,'\n')
    
    mdl = XGBClassifier(learning_rate=learning_rate, num_leaves=num_leaves, min_child_samples=min_child_samples,
                        subsample=subsample, colsample_bytree=colsample_bytree, random_state=42, n_estimators=1000)
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_test)[:,1]
    
    return -roc_auc_score(y_test, p)

def train_model_randomforest(params):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed_fe, y, test_size=0.2, random_state=42)

    n_estimators = params[0]
    max_depth = params[1]
    min_samples_split = params[2]
    criterion = params[3]
    
    print(params,'\n')
    
    mdl = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
                                 criterion=criterion, random_state=42, class_weight='balanced')
    mdl.fit(X_train, y_train)
    
    p = mdl.predict_proba(X_test)[:,1]
    
    return -roc_auc_score(y_test, p)

space_lgbm = [(1e-3, 1e-1, 'log-uniform'),  # learning_rate
              (2, 128),  # num_leaves
              (1, 100),  # min_child_samples
              (0.05, 1.0),  # subsample
              (0.1, 1.0)]  # colsample_bytree

space_xgboost = [(1e-3, 1e-1, 'log-uniform'),  # learning_rate
                 (2, 128),  # num_leaves
                 (1, 100),  # min_child_samples
                 (0.05, 1.0),  # subsample
                 (0.1, 1.0)]  # colsample_bytree

space_randomforest = [(100, 1000),  # n_estimators
                      (None, 10, 30),  # max_depth
                      (2, 5, 10),  # min_samples_split
                      ['gini', 'entropy']]  # criterion

resultado_gp_lgbm = gp_minimize(train_model_lgbm, space_lgbm, random_state=42, verbose=1, n_calls=30, n_random_starts=15)
resultado_gp_xgboost = gp_minimize(train_model_xgboost, space_xgboost, random_state=42, verbose=1, n_calls=30, n_random_starts=15)
resultado_gp_randomforest = gp_minimize(train_model_randomforest, space_randomforest, random_state=42, verbose=1, n_calls=30, n_random_starts=15)

best_params_lgbm = dict(zip(['learning_rate', 'num_leaves', 'min_child_samples', 'subsample', 'colsample_bytree'], resultado_gp_lgbm.x))
best_score_lgbm = -resultado_gp_lgbm.fun

best_params_xgboost = dict(zip(['learning_rate', 'num_leaves', 'min_child_samples', 'subsample', 'colsample_bytree'], resultado_gp_xgboost.x))
best_score_xgboost = -resultado_gp_xgboost.fun

best_params_randomforest = dict(zip(['n_estimators', 'max_depth', 'min_samples_split', 'criterion'], resultado_gp_randomforest.x))
best_score_randomforest = -resultado_gp_randomforest.fun

print('LGBM:')
print('Best parameters:', best_params_lgbm)
print('Best score:', best_score_lgbm)

print('\nXGBoost:')
print('Best parameters:', best_params_xgboost)
print('Best score:', best_score_xgboost)

print('\nRandomForest:')
print('Best parameters:', best_params_randomforest)
print('Best score:', best_score_randomforest)


Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 25, 78, 0.6170076500491628, 0.5012494775682321] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.6494
Function value obtained: -0.7626
Current minimum: -0.7626
Iteration No: 2 started. Evaluating function at random point.
[0.0015847101210439095, 60, 34, 0.18572347702584374, 0.6857996256539677] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.8975
Function value obtained: -0.7743
Current minimum: -0.7743
Iteration No: 3 started. Evaluating function at random point.
[0.00129665117537604, 93, 94, 0.05073982754896361, 0.992990403362096] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2538
Function value obtained: -0.7537
Current minimum: -0.7743
Iteration No: 4 started. Evaluating function at random point.
[0.017177621112338382, 79, 2, 0.07190930378934497, 0.5722971942325503] 

Iteration No: 4 ended. Evaluation done at random point.
Time 



Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 5.8349
Function value obtained: -0.7802
Current minimum: -0.7805
Iteration No: 24 started. Searching for the next optimal point.
[100, 10, 2, 'entropy'] 

Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 0.9046
Function value obtained: -0.7797
Current minimum: -0.7805
Iteration No: 25 started. Searching for the next optimal point.
[1000, 10, 5, 'entropy'] 

Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 6.7781
Function value obtained: -0.7801
Current minimum: -0.7805
Iteration No: 26 started. Searching for the next optimal point.
[101, 10, 10, 'gini'] 

Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.8124
Function value obtained: -0.7796
Current minimum: -0.7805
Iteration No: 27 started. Searching for the next optimal point.
[1000, 10, 10, 'gini'] 





Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 5.8640
Function value obtained: -0.7802
Current minimum: -0.7805
Iteration No: 28 started. Searching for the next optimal point.
[1000, 10, 5, 'gini'] 





Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 5.8365
Function value obtained: -0.7803
Current minimum: -0.7805
Iteration No: 29 started. Searching for the next optimal point.
[105, 30, 10, 'entropy'] 

Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 1.3547
Function value obtained: -0.7429
Current minimum: -0.7805
Iteration No: 30 started. Searching for the next optimal point.
[100, 10, 5, 'entropy'] 

Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.9118
Function value obtained: -0.7802
Current minimum: -0.7805
LGBM:
Best parameters: {'learning_rate': 0.0028042661889240163, 'num_leaves': 41, 'min_child_samples': 34, 'subsample': 1.0, 'colsample_bytree': 0.7815938299262029}
Best score: 0.7821784000820071

XGBoost:
Best parameters: {'learning_rate': 0.004498643077694106, 'num_leaves': 74, 'min_child_samples': 53, 'subsample': 0.9631134231318819, 'colsample_bytree': 0.8600804638103364}
Best 

### Random Forest
- Fitting 3 folds for each of 54 candidates, totalling 162 fits
- Best parameters for RandomForest: {'criterion': 'gini', 'max_depth': 30, 'min_samples_split': 2, 'n_estimators': 500}
- Best F1 score for RandomForest: 0.803983160450065
- F1 score on test set for RandomForest: 0.8011928429423459

### Xgboost
- Fitting 3 folds for each of 27 candidates, totalling 81 fits
- Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
- Best F1 score for XGBoost: 0.8237540956961373
- F1 score on test set for XGBoost: 0.8294094944037051

## Submitions

In [127]:
df_test = pd.read_csv('data/test.csv')

In [128]:
df_test

Unnamed: 0,id_do_caso,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral
0,EZYV22339,Ásia,Ensino Superior,S,N,2414,1973,Nordeste,103320.800,Ano,N
1,EZYV9333,Ásia,Mestrado,N,S,2928,1937,Oeste,49786.350,Ano,S
2,EZYV9740,Ásia,Ensino Superior,S,N,1836,1999,Oeste,60855.830,Ano,S
3,EZYV3328,Ásia,Ensino Superior,S,N,122,2004,Sul,82379.060,Semana,S
4,EZYV12321,América do Sul,Ensino Superior,S,S,1720,2003,Nordeste,6719.810,Ano,S
...,...,...,...,...,...,...,...,...,...,...,...
7639,EZYV19907,América do Norte,Ensino Superior,N,N,66,2008,Oeste,70.599,Hora,S
7640,EZYV576,Ásia,Ensino Médio,S,N,4040,1994,Meio-Oeste,60158.590,Ano,S
7641,EZYV1804,Ásia,Mestrado,S,N,32503,1925,Sul,44555.120,Ano,S
7642,EZYV13543,América do Norte,Mestrado,S,N,1894,2000,Sul,97830.730,Ano,S


In [129]:
df_test_processed = pipeline_fe.transform(df_test)

In [141]:
df_test['status_do_caso'] = grids['XGBoost'].predict(df_test_processed)

In [143]:
submit_resp = df_test[['id_do_caso', 'status_do_caso']]

In [144]:
submit_resp

Unnamed: 0,id_do_caso,status_do_caso
0,EZYV22339,1
1,EZYV9333,1
2,EZYV9740,1
3,EZYV3328,1
4,EZYV12321,1
...,...,...
7639,EZYV19907,0
7640,EZYV576,0
7641,EZYV1804,1
7642,EZYV13543,1


In [146]:
submit_resp.to_csv('submission_2.csv', index=False)