In [1]:
import os
import pandas as pd
import numpy as np
from scipy.stats import mode
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder


from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier

SEED = 42 # Muito importante manter a SEED igual em todos os modelos para garantir a consistência dos dados no ensemble
FOLDS = 5 # Muito importante manter o mesmo número de FOLDS em todos os modelos para garantir a consistência dos dados no ensemble

In [2]:
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

train = pd.concat([sintetico, original], ignore_index=True)

initial_features = list(test.columns)

In [3]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score):

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []
    
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        model.fit(X_train, y_train)

        probabilidades = model.predict_proba(X_val)

        # Recuperar a predição final a partir das probabilidades
        indices_predicoes = np.argmax(probabilidades, axis=1)
        classes_preditas = model.classes_[indices_predicoes]

        score = scoring(y_val, classes_preditas)

        scores.append(score)

        true_label = pd.Series(y_val.values, name='true')

        pred_label_df = pd.DataFrame(probabilidades)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        oof_pred.columns = [f'pred_{encoder.classes_[model.classes_[0]]}', f'pred_{encoder.classes_[model.classes_[1]]}', f'pred_{encoder.classes_[model.classes_[2]]}', 'true']

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")
    
    return scores, out_of_fold

In [4]:
def save_oof(oof):

    os.makedirs('oof', exist_ok=True)

    for i, fold in enumerate(oof):
        fold.to_csv(f'oof/fold_{i+1}.csv', index=False)

In [5]:
def predict_test(model, X_train, y_train, X_test, encoder):

    model.fit(X_train, y_train)

    probabilidades = model.predict_proba(X_test)
    pred_label_df = pd.DataFrame(probabilidades)

    pred_label_df.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}']

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv(f'test/test_pred.csv', index=False)

    return pred_label_df

In [6]:
def PreprocessDatapreprocess_data(df):
    df = df.copy()
    df['total'] = df.sum(axis = 1)
    df['mean'] = df.mean(axis = 1)
    df['median'] = df.median(axis = 1)
    df['std'] = df.std(axis = 1)
    df['min'] = df.min(axis = 1)
    df['max'] = df.max(axis = 1)
    df['ptp'] = df.values.ptp(axis = 1)
    df['q25'] = df.quantile(0.25, axis = 1)
    df['q75'] = df.quantile(0.75, axis = 1)
    df['credited_avg'] = (df['Curricular units 1st sem (credited)'] + df['Curricular units 2nd sem (credited)']) / 2
    df['enrolled_avg'] = (df['Curricular units 1st sem (enrolled)'] + df['Curricular units 2nd sem (enrolled)']) / 2
    df['evaluations_avg'] = (df['Curricular units 1st sem (evaluations)'] + df['Curricular units 2nd sem (evaluations)']) / 2
    df['approved_avg'] = (df['Curricular units 1st sem (approved)'] + df['Curricular units 2nd sem (approved)']) / 2
    df['grade_avg'] = (df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)']) / 2
    df['wo_evaluations_avg'] = (df['Curricular units 1st sem (without evaluations)'] + 
                                df['Curricular units 2nd sem (without evaluations)']) / 2
    df['Age_at_enrollment_cat'] = pd.cut(df['Age at enrollment'], 
                                            bins = [18, 24, 30, 35, 40], 
                                            labels = [1, 2, 3, 4])
    df['Parents_qualification_avg'] = (df["Mother's qualification"] + df["Father's qualification"]) / 2
    df['Parents_occupation_avg'] = (df["Mother's occupation"] + df["Father's occupation"]) / 2
    df['Unemployment_Inflation_ratio'] = (df['Unemployment rate'] + 1) / (df['Inflation rate'] + 1)
    
    return df

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessData(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df = X.copy()
        df['total'] = df.sum(axis=1)
        df['mean'] = df.mean(axis=1)
        df['median'] = df.median(axis=1)
        df['std'] = df.std(axis=1)
        df['min'] = df.min(axis=1)
        df['max'] = df.max(axis=1)
        df['ptp'] = df.values.ptp(axis=1)
        df['q25'] = df.quantile(0.25, axis=1)
        df['q75'] = df.quantile(0.75, axis=1)
        df['credited_avg'] = (df['Curricular units 1st sem (credited)'] + df['Curricular units 2nd sem (credited)']) / 2
        df['enrolled_avg'] = (df['Curricular units 1st sem (enrolled)'] + df['Curricular units 2nd sem (enrolled)']) / 2
        df['evaluations_avg'] = (df['Curricular units 1st sem (evaluations)'] + df['Curricular units 2nd sem (evaluations)']) / 2
        df['approved_avg'] = (df['Curricular units 1st sem (approved)'] + df['Curricular units 2nd sem (approved)']) / 2
        df['grade_avg'] = (df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)']) / 2
        df['wo_evaluations_avg'] = (df['Curricular units 1st sem (without evaluations)'] + 
                                    df['Curricular units 2nd sem (without evaluations)']) / 2
        df['Age_at_enrollment_cat'] = pd.cut(df['Age at enrollment'], 
                                                bins=[18, 24, 30, 35, 40], 
                                                labels=[1, 2, 3, 4])
        df['Parents_qualification_avg'] = (df["Mother's qualification"] + df["Father's qualification"]) / 2
        df['Parents_occupation_avg'] = (df["Mother's occupation"] + df["Father's occupation"]) / 2
        df['Unemployment_Inflation_ratio'] = (df['Unemployment rate'] + 1) / (df['Inflation rate'] + 1)
        
        return df


In [8]:
le = LabelEncoder()

y = pd.DataFrame(le.fit_transform(train['Target']), columns=['Target'])
y = y['Target']
X = train[initial_features]

In [9]:
cat_best_params = {'iterations': 1060, 
                    'learning_rate': 0.12870938526369563, 
                    'depth': 5, 
                    'min_data_in_leaf': 2, 
                    'colsample_bylevel': 0.5571138654353053, 
                    'l2_leaf_reg': 3.917650301267089, 
                    'random_strength': 4.0271592863936245, 
                    'bagging_temperature': 0.20975982889738484}

In [10]:
# Pipeline de pré-processamento + escalonamento
preprocess_and_scale_pipeline = Pipeline([
    ('fet_eng', PreprocessData()),
    ('scaler', MinMaxScaler())
])

# ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('fet_eng_and_scaling', preprocess_and_scale_pipeline, X.columns),
    ]
)

In [11]:
xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(**cat_best_params, verbose=0)),
])

In [12]:
scores,oof = cross_validation(xgb, X, y, le)

Fold 1


Fold 2
Fold 3
Fold 4
Fold 5
Score: 0.8300017226094226


In [13]:
for fold in oof:
    fold['true'] = fold['true'].replace(le.classes_)

save_oof(oof)

  fold['true'] = fold['true'].replace(le.classes_)


In [14]:
X_test = test[initial_features]

predict_test(xgb, X, y, X_test, le.classes_)

Unnamed: 0,pred_Dropout,pred_Enrolled,pred_Graduate
0,0.996518,0.001092,0.002389
1,0.004375,0.010570,0.985055
2,0.038680,0.206687,0.754632
3,0.180649,0.246410,0.572942
4,0.210739,0.737709,0.051552
...,...,...,...
51007,0.822866,0.052212,0.124922
51008,0.988725,0.010898,0.000378
51009,0.981241,0.009394,0.009365
51010,0.890343,0.092572,0.017085
