## Imports e funcs

In [12]:
import pandas as pd
import numpy as np


from sklearn.model_selection import (
    train_test_split, KFold)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler, LabelEncoder)
from sklearn.impute import SimpleImputer



from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns

In [13]:

def preprocess_data(df, target_column):
    """Separate a DataFrame into its feature columns and its target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the target.
    target_column : str
        Label of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``target_column`` and ``y``
        is ``df[target_column]``. ``df`` itself is not modified.
    """
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    features = df.drop(columns=target_column)
    target = df[target_column]

    # Placeholder: encode categorical features here if it becomes necessary,
    # e.g. with LabelEncoder().fit_transform on the relevant column.

    return features, target

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` with the given ``test_size`` and
    ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    train_X, test_X, train_y, test_y = train_test_split(X, y, **split_options)
    return train_X, test_X, train_y, test_y

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data.

    Calls ``model.fit(X_train, y_train)`` and discards whatever ``fit``
    returns, so this function itself returns ``None``; keep using the
    ``model`` object that was passed in.
    """
    model.fit(X_train, y_train)
    return None

def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall and F1 for ``model`` on a test set.

    Predictions come from ``model.predict(X_test)``. Precision, recall and F1
    are computed with ``average='weighted'``. The metrics are only printed;
    nothing is returned.
    """
    predictions = model.predict(X_test)

    # Dict order is the print order: accuracy, precision, recall, F1.
    scores = {
        'Accuracy': accuracy_score(y_test, predictions),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'Recall': recall_score(y_test, predictions, average='weighted'),
        'F1-Score': f1_score(y_test, predictions, average='weighted'),
    }

    print('Evaluation Metrics:')
    for label, value in scores.items():
        print(f'{label}: {value}')

In [2]:


def plotbar(df, x_column, horizontal=False, title='Gráfico de Barras', figsize=(10, 6)):
    """Draw a bar chart of how often each value of ``df[x_column]`` occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    x_column : str
        Column whose value counts are plotted.
    horizontal : bool, default False
        If True, draw horizontal bars (counts on the x axis); otherwise
        vertical bars (counts on the y axis).
    title : str, default 'Gráfico de Barras'
        Title placed on the figure.
    figsize : tuple, default (10, 6)
        Figure size forwarded to ``plt.figure``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    counts = df[x_column].value_counts()

    plt.figure(figsize=figsize)

    if horizontal:
        bars = plt.barh(counts.index, counts.values)
        plt.xlabel('Quantidade')
        plt.ylabel(x_column)
    else:
        bars = plt.bar(counts.index, counts.values)
        plt.xlabel(x_column)
        plt.ylabel('Quantidade')

    plt.title(title)

    # Annotate every bar with its count, placed at the end of the bar.
    for bar in bars:
        if horizontal:
            width = bar.get_width()
            plt.text(width, bar.get_y() + bar.get_height() / 2, str(int(width)), ha='left', va='center')
        else:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

    plt.show()


In [4]:
def cls_feature(X, max_cat=10):
    """Classify the columns of ``X`` into numeric and categorical features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to inspect.
    max_cat : int, default 10
        A text/categorical column is reported as categorical only when it has
        at most this many distinct values.

    Returns
    -------
    tuple of (list, list)
        ``(numeric_features, categorical_features)``, each holding column
        labels in the order they appear in ``X``. Columns matching neither
        rule (booleans, datetimes, high-cardinality text, ...) are in neither
        list.
    """
    dtype_checks = pd.api.types

    numeric_features = []
    categorical_features = []

    for column, dtype in X.dtypes.items():
        # Accept every numeric dtype (int32, float32, unsigned, nullable, ...)
        # rather than only int64/float64. Booleans are kept out of the
        # numeric list, as they were before.
        if dtype_checks.is_numeric_dtype(dtype) and not dtype_checks.is_bool_dtype(dtype):
            numeric_features.append(column)
            continue

        # Text-like columns: plain object dtype, pandas 'category' and the
        # dedicated string dtype.
        is_text_like = dtype_checks.is_object_dtype(dtype) or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        )
        if is_text_like and X[column].nunique() <= max_cat:
            categorical_features.append(column)

    return numeric_features, categorical_features

In [6]:
# Load the training and test CSV files; both paths are relative to the
# notebook's working directory.
data_train = pd.read_csv('data/df2_train.csv')
data_test =  pd.read_csv('data/test.csv')

In [8]:
# `df` is a second name for the same training DataFrame (no copy is made).
df = data_train

In [9]:
# Split the data into features (X) and target (y)
X = df.drop(columns='status_do_caso')  # 'status_do_caso' is the target column
y = df['status_do_caso']

In [10]:
# Show the feature matrix as this cell's output.
X

Unnamed: 0,continente,educacao_do_empregado,num_de_empregados,ano_de_estabelecimento,regiao_de_emprego,salario_prevalecente,unidade_de_salario,posicao_em_tempo_integral,cat_empresa,cat_salario,cat_experiencia
0,2,1,2087,1855,4,69711.24,0,1,1,2,3
1,5,3,5991,2003,1,52931.38,0,1,3,2,2
2,5,1,1426,2000,0,110830.21,0,1,3,4,4
3,5,1,3846,1992,1,91884.68,3,1,2,3,4
4,5,2,3957,1949,2,138155.24,0,1,2,4,2
...,...,...,...,...,...,...,...,...,...,...,...
17831,5,3,95347,1995,3,69692.24,0,1,2,2,2
17832,2,1,1483,1886,4,136237.62,0,1,1,4,4
17833,0,3,1504,2010,2,118187.30,0,1,3,4,4
17834,0,4,251967,2010,1,93133.40,0,1,3,3,2


In [11]:
# Show the target vector as this cell's output.
y

0        0
1        1
2        0
3        0
4        1
        ..
17831    1
17832    0
17833    0
17834    1
17835    1
Name: status_do_caso, Length: 17836, dtype: int64