## Imports e funcs

In [49]:
import pandas as pd
import numpy as np


from sklearn.model_selection import (
    train_test_split, KFold)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, LabelEncoder)
from sklearn.impute import SimpleImputer



from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns

In [50]:

def preprocess_data(df, target_column):
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    
    # Realizar o encoding das features categóricas, se necessário
    # label_encoder = LabelEncoder()
    # X['feature_categorica'] = label_encoder.fit_transform(X['feature_categorica'])
    
    return X, y

def split_data(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_models(X_train, y_train):
    # Inicializar os modelos de classificação
    models = [
        ('Logistic Regression', LogisticRegression()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier())
    ]

    trained_models = {}
    
    # Treinar os modelos
    for model_name, model in models:
        model.fit(X_train, y_train)
        trained_models[model_name] = model
    
    return trained_models


def evaluate_models(trained_models, X_test, y_test):
    # Avaliar os modelos
    for model_name, model in trained_models.items():
        y_pred = model.predict(X_test)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        print(f'Model: {model_name}')
        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-Score: {f1}')
        print('---')


In [51]:


def plotbar(df, x_column, horizontal=False):
    counts = df[x_column].value_counts()
    
    plt.figure(figsize=(10, 6))  # Ajuste o tamanho conforme necessário
    
    if horizontal:
        bars = plt.barh(counts.index, counts.values)
        plt.xlabel('Quantidade')
        plt.ylabel(x_column)
    else:
        bars = plt.bar(counts.index, counts.values)
        plt.xlabel(x_column)
        plt.ylabel('Quantidade')
    
    plt.title('Gráfico de Barras')
    
    # Adicionar os valores do eixo y nas barras
    for bar in bars:
        if horizontal:
            width = bar.get_width()
            plt.text(width, bar.get_y() + bar.get_height() / 2, str(int(width)), ha='left', va='center')
        else:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')
    
    plt.show()


In [52]:
def cls_feature(X,max_cat = 10):
        #Agrupando features numéricas
    numeric_features  = [cls for cls in X.columns if X[cls].dtype == 'float64' or X[cls].dtype =='int64']
    # print('>>numeric_features:', numeric_features ,
          # len(numeric_features))


    #Agrupando features categoricas
    categorical_features = [cls for cls in X.columns if X[cls].dtypes == 'object' and X[cls].nunique()<=max_cat]
    # print('\n>>cat_features:', cat_features ,
          # len(cat_features))
    return numeric_features , categorical_features

In [64]:
data_train = pd.read_csv('df2_train.csv')
data_test =  pd.read_csv('data/test.csv')

In [65]:
df = data_train

In [66]:
df['tem_experiencia_de_trabalho'] = df['tem_experiencia_de_trabalho'].astype('category').cat.codes
df['requer_treinamento_de_trabalho'] = df['requer_treinamento_de_trabalho'].astype('category').cat.codes

In [67]:
# Separar os dados em features (X) e target (y)
X = df.drop(columns='status_do_caso')  # Substitua 'target_column' pelo nome da coluna alvo
y = df['status_do_caso']

In [68]:
X

Unnamed: 0,continente,educacao_do_empregado,tem_experiencia_de_trabalho,requer_treinamento_de_trabalho,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,cat_empresa,cat_salario,cat_experiencia
0,2,1,0,1,2087,1855,4,69711.24,0,1,1,2,3
1,5,3,1,0,5991,2003,1,52931.38,0,1,3,2,2
2,5,1,0,0,1426,2000,0,110830.21,0,1,3,4,4
3,5,1,0,0,3846,1992,1,91884.68,3,1,2,3,4
4,5,2,1,0,3957,1949,2,138155.24,0,1,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17831,5,3,1,0,95347,1995,3,69692.24,0,1,2,2,2
17832,2,1,0,0,1483,1886,4,136237.62,0,1,1,4,4
17833,0,3,0,0,1504,2010,2,118187.30,0,1,3,4,4
17834,0,4,1,0,251967,2010,1,93133.40,0,1,3,3,2


In [69]:
y

0        0
1        1
2        0
3        0
4        1
        ..
17831    1
17832    0
17833    0
17834    1
17835    1
Name: status_do_caso, Length: 17836, dtype: int64

### Model

In [70]:
X_train, X_test, y_train, y_test = split_data(X,y,test_size=0.2,random_state=42)

In [71]:

# Treinar os modelos
trained_models = train_models(X_train, y_train)



In [72]:
# Avaliar os modelos
evaluate_models(trained_models, X_test, y_test)

Model: Logistic Regression
Accuracy: 0.679932735426009
Precision: 0.46230852470389505
Recall: 0.679932735426009
F1-Score: 0.5503893280425418
---
Model: Decision Tree
Accuracy: 0.6446188340807175
Precision: 0.6514786176259022
Recall: 0.6446188340807175
F1-Score: 0.6477225331683928
---
Model: Random Forest
Accuracy: 0.7261771300448431
Precision: 0.716969214303177
Recall: 0.7261771300448431
F1-Score: 0.7198454137989314
---


  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
model = RandomForestClassifier(n_estimators=300)

In [74]:
model.fit(X_train,y_train)

RandomForestClassifier(n_estimators=300)

In [75]:
y_pred = model.predict(X_test)

In [76]:
from sklearn.metrics import roc_auc_score

micro_roc_auc_ovr = roc_auc_score(
    y_test,
    y_pred,
)
micro_roc_auc_ovr

0.6598593318443078

### model 2

In [78]:
numeric_features, categorical_features = cls_feature(X)

In [90]:
numeric_features

['continente',
 'educacao_do_empregado',
 'num_de_empregados',
 'ano_de_estabelecimento',
 'regiao_de_emprego',
 'salario_prevalecente',
 'unidade_de_salario',
 'posicao_em_tempo_integral',
 'cat_empresa',
 'cat_salario',
 'cat_experiencia']

In [91]:
categorical_features

[]

In [83]:
pipe_categorical = Pipeline([
    ('imputer',SimpleImputer(strategy='constant',fill_value='missing')),
    ('enconder',OneHotEncoder(handle_unknown='ignore',sparse=False))
    
])
pipe_numerical = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer([
        ("num", pipe_numerical, numeric_features),
        ("cat", pipe_categorical, categorical_features),
]
)
preprocessor

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['continente', 'educacao_do_empregado',
                                  'num_de_empregados', 'ano_de_estabelecimento',
                                  'regiao_de_emprego', 'salario_prevalecente',
                                  'unidade_de_salario',
                                  'posicao_em_tempo_integral', 'cat_empresa',
                                  'cat_salario', 'cat_experiencia']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                      

In [84]:
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression())
])
clf

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['continente',
                                                   'educacao_do_empregado',
                                                   'num_de_empregados',
                                                   'ano_de_estabelecimento',
                                                   'regiao_de_emprego',
                                                   'salario_prevalecente',
                                                   'unidade_de_salario',
                                                   'pos

In [85]:
# Aplicar o pipeline aos dados de treinamento
X_train_preprocessed = clf["preprocessor"].fit_transform(X_train)

# Treinar o modelo
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['continente',
                                                   'educacao_do_empregado',
                                                   'num_de_empregados',
                                                   'ano_de_estabelecimento',
                                                   'regiao_de_emprego',
                                                   'salario_prevalecente',
                                                   'unidade_de_salario',
                                                   'pos

In [88]:
# Fazer previsões nos dados de teste
y_pred = clf.predict(X_test)

# Obter relatório de classificação
classification_report = classification_report(y_test, y_pred)

print("Relatório de Classificação:")
print(classification_report)

Relatório de Classificação:
              precision    recall  f1-score   support

           0       0.59      0.45      0.51      1142
           1       0.77      0.85      0.81      2426

    accuracy                           0.72      3568
   macro avg       0.68      0.65      0.66      3568
weighted avg       0.71      0.72      0.71      3568

