# MARATONA BEHIND THE CODE 2020

## DESAFIO 6 - LIT

# Parte 1. Importando os pacotes

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import datetime as dt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import warnings
warnings.filterwarnings("ignore")

# Parte 2. Carregando os Dados

In [2]:
# Carregando o dataset da competição
treino = pd.read_csv('../data/training_dataset.csv')
teste  = pd.read_csv('../data/to_be_scored.csv')

# Parte 3. Feature Engineering

In [3]:
# Funcao para verificar valores missing no dataset
def check_missing(df, display = 10):
    temp_df = df.copy()
    df_nan = (temp_df.isnull().sum() / len(temp_df)) * 100
    missing_data = pd.DataFrame({'Missing n': temp_df.isnull().sum(),'% Missing' :df_nan})
    if missing_data['Missing n'].sum() == 0:
        return print('Ótimo! Não há mais valores faltantes neste dataset.')
    else:
        return missing_data.sort_values('% Missing', ascending = False).head(display)

# Funcao para preencher os dados missing de algumas colunas
def preenche_missing(df):
    
    # Remove todos os registros que tem algum dado vazio de algumas colunas
    df.dropna(axis='index', how='any', subset=['certificados', 'total_modulos'], inplace=True)

    # coloca constante 0 em outras colunas com dados missing
    impute_zeros = SimpleImputer(
                                    missing_values=np.nan,
                                    strategy='constant',
                                    fill_value=0,
                                    verbose=0,
                                    copy=True
                                )
    
    impute_zeros.fit(X=df)

    # Reconstruindo um Pandas DataFrame com os resultados
    df = pd.DataFrame.from_records(
            data = impute_zeros.transform(
            X    = df
        ),
        columns=df.columns
    )    

    return df

# Funcao para trabalhar as features
def fe(df):

    # Removendo colunas unicas
    df = df.drop(['id', 'graduacao', 'universidade', 'organizacao', 'como_conheceu_lit'], axis = 1)

    return df

# Funcao para transformar features categoricas em numericas
def fe_cat_num(df):

    # Tratando variáveis categóricas com o método Pandas ``get_dummies()''
    df = pd.get_dummies(df, columns=['profissao'])

    return df

In [4]:
# Feature Engineering
treino = preenche_missing(treino)    
treino = fe(treino)   
treino = fe_cat_num(treino)    

# Removendo outliers
treino = treino[treino['modulos_finalizados'] <= 500]

# Verificando as colunas com dados missing do dataset
check_missing(treino, display = 15)

print(treino.shape)
treino.head(10)

Ótimo! Não há mais valores faltantes neste dataset.
(12315, 22)


Unnamed: 0,pretende_fazer_cursos_lit,interesse_mba_lit,importante_ter_certificado,horas_semanais_estudo,total_modulos,modulos_iniciados,modulos_finalizados,certificados,categoria,profissao_0,...,profissao_Analista Senior,profissao_Assessor,profissao_Coordenador,profissao_Diretor,profissao_Engenheiro,profissao_Gerente,profissao_Outros,profissao_SEM EXPERIÊNCIA,profissao_Supervisor,profissao_Sócio/Dono/Proprietário
0,1.0,1.0,1.0,6.0,28.0,5.0,4.0,0.0,perfil6,0,...,1,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,6.0,66.0,36.0,34.0,0.0,perfil2,0,...,0,0,0,0,0,0,0,0,1,0
2,1.0,1.0,1.0,10.0,27.0,27.0,27.0,0.0,perfil2,0,...,0,0,0,0,0,0,1,0,0,0
3,0.0,1.0,1.0,5.0,29.0,10.0,6.0,0.0,perfil5,0,...,0,0,0,0,0,0,1,0,0,0
4,0.0,0.0,1.0,7.0,67.0,49.0,44.0,0.0,perfil2,0,...,0,0,0,0,0,0,0,1,0,0
5,1.0,1.0,1.0,0.0,66.0,52.0,50.0,0.0,perfil2,0,...,0,0,0,0,0,0,0,0,0,0
6,0.0,0.0,0.0,2.0,31.0,0.0,13.0,0.0,perfil4,0,...,0,1,0,0,0,0,0,0,0,0
7,0.0,0.0,0.0,6.0,27.0,13.0,12.0,0.0,perfil2,0,...,0,0,0,0,0,0,0,0,0,0
8,0.0,0.0,1.0,3.0,131.0,38.0,20.0,0.0,perfil3,0,...,0,0,0,0,1,0,0,0,0,0
9,0.0,0.0,1.0,9.0,51.0,49.0,49.0,0.0,perfil2,0,...,0,0,0,0,0,1,0,0,0,0


In [5]:
# Feature Engineering
teste = preenche_missing(teste)    
teste = fe(teste)   
teste = fe_cat_num(teste)    
teste['profissao_0'] = 0

# Verificando as colunas com dados missing do dataset
check_missing(teste, display = 15)

print(teste.shape)
teste.head(10)

Ótimo! Não há mais valores faltantes neste dataset.
(1000, 21)


Unnamed: 0,pretende_fazer_cursos_lit,interesse_mba_lit,importante_ter_certificado,horas_semanais_estudo,total_modulos,modulos_iniciados,modulos_finalizados,certificados,profissao_Advogado,profissao_Analista,...,profissao_Assessor,profissao_Coordenador,profissao_Diretor,profissao_Engenheiro,profissao_Gerente,profissao_Outros,profissao_SEM EXPERIÊNCIA,profissao_Supervisor,profissao_Sócio/Dono/Proprietário,profissao_0
0,0,0,1,3,58,3,2,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,1,7,62,19,17,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,9,36,3,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,1,1,9,145,61,54,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,1,8,33,33,29,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,1,3,29,13,11,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,1,8,521,190,160,1,0,0,...,0,0,0,0,0,0,1,0,0,0
7,0,0,1,4,10,10,10,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,1,10,27,22,17,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,1,1,1,7,10,8,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Parte 4. Machine Learning

In [6]:
features = ['pretende_fazer_cursos_lit', 'interesse_mba_lit',
            'importante_ter_certificado', 'horas_semanais_estudo', 'total_modulos',
            'modulos_iniciados', 'modulos_finalizados', 'certificados',
            'profissao_0', 'profissao_Advogado', 'profissao_Analista',
            'profissao_Analista Senior', 'profissao_Assessor',
            'profissao_Coordenador', 'profissao_Diretor', 'profissao_Engenheiro',
            'profissao_Gerente', 'profissao_Outros', 'profissao_SEM EXPERIÊNCIA',
            'profissao_Supervisor', 'profissao_Sócio/Dono/Proprietário']

target = 'categoria'

In [7]:
# Separando features e target
X = treino[features]
y = treino[target]

# Split dos dados em treino e validacao
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=133)

# Treinamento com GradientBoosting
param = {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}

model = GradientBoostingClassifier(**param).fit(X_train, y_train)

# Validacao do modelo
y_pred_val = model.predict(X_valid)

# Matriz de Classificacao
print(classification_report(y_valid, y_pred_val))

# Avaliacao da metrica F1
print('F1_macro   : ', f1_score(y_valid, y_pred_val, average='macro'))
print('F1_micro   : ', f1_score(y_valid, y_pred_val, average='micro'))
print('F1_weighted: ', f1_score(y_valid, y_pred_val, average='weighted'))
print('F1_none    : ', f1_score(y_valid, y_pred_val, average=None))

              precision    recall  f1-score   support

     perfil1       0.85      0.81      0.83       406
     perfil2       0.83      0.85      0.84       650
     perfil3       0.81      0.71      0.76       156
     perfil4       0.80      0.78      0.79       265
     perfil5       0.83      0.85      0.84       288
     perfil6       0.84      0.87      0.85       698

    accuracy                           0.83      2463
   macro avg       0.83      0.81      0.82      2463
weighted avg       0.83      0.83      0.83      2463

F1_macro   :  0.818293470859962
F1_micro   :  0.8319123020706456
F1_weighted:  0.831440463096592
F1_none    :  [0.82619647 0.841306   0.75767918 0.79238095 0.83848797 0.85371025]


# Parte 5. Realizando novas previsões

In [8]:
# Copia do teste
sub = teste.copy()

# Realizando previsoes com dados de teste
y_pred_test = model.predict(sub)

# Salvando na coluna target
sub['target'] = y_pred_test

# Visualizando os ultimos registros
sub.tail()

Unnamed: 0,pretende_fazer_cursos_lit,interesse_mba_lit,importante_ter_certificado,horas_semanais_estudo,total_modulos,modulos_iniciados,modulos_finalizados,certificados,profissao_Advogado,profissao_Analista,...,profissao_Coordenador,profissao_Diretor,profissao_Engenheiro,profissao_Gerente,profissao_Outros,profissao_SEM EXPERIÊNCIA,profissao_Supervisor,profissao_Sócio/Dono/Proprietário,profissao_0,target
995,0,0,1,2,245,63,45,1,0,0,...,1,0,0,0,0,0,0,0,0,perfil1
996,1,1,1,8,163,111,96,0,0,0,...,0,0,0,0,0,0,0,0,0,perfil2
997,1,1,1,9,56,10,4,0,0,0,...,0,0,0,1,0,0,0,0,0,perfil6
998,1,1,1,9,138,60,30,0,0,0,...,0,0,0,0,1,0,0,0,0,perfil2
999,0,0,1,9,22,20,19,0,1,0,...,0,0,0,0,0,0,0,0,0,perfil3


In [9]:
# Visualizando a distribuicao dos targets
sub['target'].value_counts()

perfil6    297
perfil2    249
perfil1    148
perfil4    123
perfil5    122
perfil3     61
Name: target, dtype: int64

In [10]:
# Gravando resultado em csv
sub.to_csv('../submission/results_otimizado.csv',index=False)