In [12]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from lightgbm import LGBMClassifier

# Keep SEED identical across all models so the data stays consistent in the ensemble.
SEED = 42
# Keep the same number of FOLDS across all models so the data stays consistent in the ensemble.
FOLDS = 5

In [13]:
# train.csv (held in `sintetico`) and test.csv are indexed by their `id` column;
# original.csv is read with the default positional index.
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
test = pd.read_csv('../../src/test/test.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')

# Stack both training sources row-wise and rebuild a fresh 0..n-1 index.
train = pd.concat(objs=[sintetico, original], axis=0, ignore_index=True)

# The feature set is exactly the set of columns present in the test frame.
initial_features = test.columns.tolist()

In [14]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score):
    """Run stratified K-fold cross-validation and collect out-of-fold probabilities.

    The split uses the module-level ``FOLDS`` and ``SEED`` constants, so every
    model evaluated through this function sees the same folds.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict_proba`` and ``classes_``. It is
        refitted in place on every fold and is left fitted on the last fold.
    X : pandas.DataFrame
        Feature matrix, indexed positionally via ``.iloc``.
    y : pandas.Series
        Target labels aligned with ``X``.
    encoder : mapping
        Maps each label found in ``model.classes_`` to the name used in the
        ``pred_<name>`` output columns.
    scoring : callable, default ``accuracy_score``
        Called as ``scoring(y_true, y_pred)`` on each validation fold.

    Returns
    -------
    scores : list
        One score per fold.
    out_of_fold : list of pandas.DataFrame
        One frame per fold with a ``pred_<name>`` probability column per class
        (in ``model.classes_`` order) followed by a ``true`` column.
    """
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")

        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[val_index], y.iloc[val_index]

        model.fit(X_train, y_train)

        probabilidades = model.predict_proba(X_val)

        # Recover the final prediction from the probabilities: the argmax column
        # index is translated back to a label through model.classes_.
        classes_preditas = model.classes_[np.argmax(probabilidades, axis=1)]

        scores.append(scoring(y_val, classes_preditas))

        # One probability column per class, whatever the number of classes is
        # (the column order of predict_proba follows model.classes_).
        pred_columns = [f'pred_{encoder[label]}' for label in model.classes_]
        oof_pred = pd.DataFrame(probabilidades, columns=pred_columns)
        # .values drops the original index so labels align with the fresh
        # 0..n-1 index of the probability frame.
        oof_pred['true'] = y_val.values

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")

    return scores, out_of_fold

In [15]:
def save_oof(oof, directory='oof'):
    """Write each out-of-fold frame to ``<directory>/fold_<k>.csv``.

    Parameters
    ----------
    oof : iterable of pandas.DataFrame
        Frames to persist; the k-th frame (1-based) goes to ``fold_<k>.csv``.
    directory : str, default ``'oof'``
        Target directory, created if it does not exist. Existing files with
        the same names are overwritten.
    """
    os.makedirs(directory, exist_ok=True)

    for i, fold in enumerate(oof, start=1):
        # index=False: only the prediction/label columns are written.
        fold.to_csv(os.path.join(directory, f'fold_{i}.csv'), index=False)

In [16]:
# Columns treated as categorical (cast to pandas 'category' dtype below).
# NOTE(review): names such as 'Nacionality' presumably mirror the CSV headers
# verbatim — confirm before "correcting" any spelling.
cat_features = ['Marital status', 'Application mode', 'Application order', 'Course',
        'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
        "Mother's qualification", "Father's qualification",
        "Mother's occupation", "Father's occupation", "Displaced",
        'Educational special needs', 'Debtor', 'Tuition fees up to date',
        'Gender', 'Scholarship holder', 'International', 'Curricular units 1st sem (credited)',
        'Curricular units 1st sem (enrolled)',
        'Curricular units 1st sem (evaluations)',
        'Curricular units 1st sem (approved)',
        'Curricular units 1st sem (without evaluations)',
        'Curricular units 2nd sem (credited)',
        'Curricular units 2nd sem (enrolled)',
        'Curricular units 2nd sem (evaluations)',
        'Curricular units 2nd sem (approved)',
        'Curricular units 2nd sem (without evaluations)']

# Cast every categorical column in place, in both frames.
for feature in cat_features:
    for df in [train, test]:
        df[feature] = df[feature].astype('category')

# Numeric features are whatever is still int64/float64 after the cast above.
# They are read from `test` explicitly (it has no target column) instead of
# from the loop variable `df`, which only pointed at `test` because of the
# iteration order of the loop.
num_features = test.select_dtypes(include=['int64', 'float64']).columns

In [17]:
# Encode the target as integers. `.map` + an explicit cast is used instead of
# `.replace`, which only produced an integer Series through implicit
# object->int downcasting (deprecated in pandas 2.2). A label missing from
# the mapping becomes NaN and makes the cast fail loudly.
y = train['Target'].map({"Enrolled": 0, "Graduate": 1, "Dropout": 2}).astype('int64')
# Keep only the columns that also exist in the test set.
X = train[initial_features]

In [18]:
# Integer class code -> label name; used to name the pred_<label> columns and
# to turn the integer `true` column of the out-of-fold frames back into names.
encoder = {0: "Enrolled", 1: "Graduate", 2: "Dropout"}

In [19]:
# Scale the numeric columns and one-hot encode the categorical ones;
# categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ])

# random_state uses the shared SEED constant (instead of a literal 42) so this
# model cannot drift from the seed used by the other models of the ensemble.
# NOTE(review): the hyperparameter values look like the output of a tuning
# run — confirm their origin.
lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        random_state=SEED,
        verbose=-1,
        max_depth=8,
        learning_rate=0.00694099927384355,
        n_estimators=1689,
        min_child_weight=0.22269847051511316,
    )),
])

In [21]:
# Evaluate the LightGBM pipeline; `oof` holds one probability frame per fold.
scores,oof = cross_validation(lgbm, X, y, encoder)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Score: 0.82914923776212


In [23]:
# Turn the integer codes in `true` back into label names (mutates each fold
# frame in place), then write one CSV per fold.
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)
    

save_oof(oof)

In [25]:
# Preview the out-of-fold frame of the first fold.
oof[0]

Unnamed: 0,pred_Enrolled,pred_Graduate,pred_Dropout,true
0,0.323773,0.647589,0.028638,Graduate
1,0.031517,0.046213,0.922269,Dropout
2,0.019986,0.008483,0.971531,Dropout
3,0.067120,0.008723,0.924157,Dropout
4,0.028315,0.000898,0.970787,Dropout
...,...,...,...,...
16184,0.013116,0.973776,0.013108,Graduate
16185,0.057799,0.886558,0.055643,Graduate
16186,0.219285,0.151198,0.629517,Dropout
16187,0.027612,0.951510,0.020878,Graduate
