In [1]:
!pip install flaml



In [2]:
import pandas as pd
import warnings
import numpy as np
import re
import os
from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.metrics import accuracy_score

from flaml import AutoML

SEED = 42 # Muito importante manter a SEED igual em todos os modelos para garantir a consistência dos dados no ensemble
FOLDS = 5 # Muito importante manter o mesmo número de FOLDS em todos os modelos para garantir a consistência dos dados no ensemble

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
sintetico = pd.read_csv('../../src/train/train.csv', index_col='id')
original = pd.read_csv('../../src/train/original.csv')
test = pd.read_csv('../../src/test/test.csv', index_col='id')

train = pd.concat([sintetico, original], ignore_index=True)

initial_features = list(test.columns)

In [4]:
def cross_validation(model, X, y, encoder, scoring=accuracy_score):

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

    scores = []
    out_of_fold = []
    
    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        print(f"Fold {i + 1}")
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]

        X_val = X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        
        model.fit(X_train, y_train, task="classification",metric='roc_auc_ovo',time_budget=12000*3)

        probabilidades = model.predict_proba(X_val)

        # Recuperar a predição final a partir das probabilidades
        indices_predicoes = np.argmax(probabilidades, axis=1)
        classes_preditas = model.classes_[indices_predicoes]

        score = scoring(y_val, classes_preditas)

        scores.append(score)

        true_label = pd.Series(y_val.values, name='true')

        pred_label_df = pd.DataFrame(probabilidades)

        oof_pred = pd.concat([pred_label_df, true_label], axis=1, ignore_index=True)
        oof_pred.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}', 'true']

        out_of_fold.append(oof_pred)

    print(f"Score: {np.mean(scores)}")
    
    return scores, out_of_fold

In [5]:
def predict_test(model, X_train, y_train, X_test, encoder):

    model.fit(X_train, y_train, task='classification', estimator_list=['xgb_limitdepth'], metric='roc_auc_ovo',time_budget=12000*3)

    print('Best ML leaner:', model.best_estimator)
    print('Best hyperparmeter config:', model.best_config)
    print('Best roc_auc_ovo  on validation data: {0:.4g}'.format(1-model.best_loss))

    probabilidades = model.predict_proba(X_test)
    pred_label_df = pd.DataFrame(probabilidades)

    pred_label_df.columns = [f'pred_{encoder[model.classes_[0]]}', f'pred_{encoder[model.classes_[1]]}', f'pred_{encoder[model.classes_[2]]}']

    os.makedirs('test', exist_ok=True)

    pred_label_df.to_csv(f'test/test_pred.csv', index=False)

    return pred_label_df

In [6]:
def save_oof(oof):

    os.makedirs('oof', exist_ok=True)

    for i, fold in enumerate(oof):
        fold.to_csv(f'oof/fold_{i+1}.csv', index=False)

In [7]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('object')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train.head()

Unnamed: 0,Maritalstatus,Applicationmode,Applicationorder,Course,Daytimeeveningattendance,Previousqualification,Previousqualificationgrade,Nacionality,Mothersqualification,Fathersqualification,...,Curricularunits2ndsemcredited,Curricularunits2ndsemenrolled,Curricularunits2ndsemevaluations,Curricularunits2ndsemapproved,Curricularunits2ndsemgrade,Curricularunits2ndsemwithoutevaluations,Unemploymentrate,Inflationrate,GDP,Target
0,1,1,1,9238,1,1,126.0,1,1,19,...,0,6,7,6,12.428571,0,11.1,0.6,2.02,Graduate
1,1,17,1,9238,1,1,125.0,1,19,19,...,0,6,9,0,0.0,0,11.1,0.6,2.02,Dropout
2,1,17,2,9254,1,1,137.0,1,3,19,...,0,6,0,0,0.0,0,16.2,0.3,-0.92,Dropout
3,1,1,3,9500,1,1,131.0,1,19,3,...,0,8,11,7,12.82,0,11.1,0.6,2.02,Enrolled
4,1,1,2,9500,1,1,132.0,1,19,37,...,0,7,12,6,12.933333,0,7.6,2.6,0.32,Graduate


In [9]:
train = reduce_mem_usage(train)

train.info()

Memory usage of dataframe is 22.85 MB
Memory usage after optimization is: 4.17 MB
Decreased by 81.8%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80942 entries, 0 to 80941
Data columns (total 37 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Maritalstatus                            80942 non-null  int8   
 1   Applicationmode                          80942 non-null  int8   
 2   Applicationorder                         80942 non-null  int8   
 3   Course                                   80942 non-null  int16  
 4   Daytimeeveningattendance                 80942 non-null  int8   
 5   Previousqualification                    80942 non-null  int8   
 6   Previousqualificationgrade               80942 non-null  float16
 7   Nacionality                              80942 non-null  int8   
 8   Mothersqualification                     80942 non-null  int8   
 9   Fathersqualific

In [10]:
y = train.pop('Target')
X = train

initial_features = list(X.columns)

encoder = {
        'Graduate':'Graduate',
        'Enrolled':'Enrolled',
        'Dropout':'Dropout'
           }

In [11]:
automl = AutoML()

In [12]:
warnings.filterwarnings("ignore")

scores,oof = cross_validation(automl, X, y, encoder)

Fold 1
[flaml.automl.logger: 06-18 00:11:46] {1680} INFO - task = classification
[flaml.automl.logger: 06-18 00:11:46] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-18 00:11:46] {1789} INFO - Minimizing error metric: 1-roc_auc_ovo
[flaml.automl.logger: 06-18 00:11:46] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 06-18 00:11:46] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-18 00:11:47] {2345} INFO - Estimated sufficient time budget=6451s. Estimated necessary time budget=158s.
[flaml.automl.logger: 06-18 00:11:47] {2392} INFO -  at 0.9s,	estimator lgbm's best error=0.1166,	best estimator lgbm's best error=0.1166
[flaml.automl.logger: 06-18 00:11:47] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-18 00:11:48] {2392} INFO -  at 1.5s,	estimator lgbm's best error=0.1166,	best estimator lgbm's best error=0.1166
[flaml.auto

In [None]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best roc_auc_ovo  on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

AttributeError: 'AutoML' object has no attribute '_best_estimator'

In [None]:
for fold in oof:
    fold['true'] = fold['true'].replace(encoder)

save_oof(oof)

In [None]:
scores

[0.8338995614306011,
 0.832417073321391,
 0.8320978502594515,
 0.8322831727205338,
 0.8340128490239683]

In [None]:
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = reduce_mem_usage(test)
X_test = test[initial_features]

automl = AutoML()

predict_test(automl, X, y, X_test, encoder)

Memory usage of dataframe is 2.63 MB
Memory usage after optimization is: 2.63 MB
Decreased by 0.0%
[flaml.automl.logger: 06-16 17:10:20] {1680} INFO - task = classification
[flaml.automl.logger: 06-16 17:10:20] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 06-16 17:10:20] {1789} INFO - Minimizing error metric: 1-roc_auc_ovo
[flaml.automl.logger: 06-16 17:10:20] {1901} INFO - List of ML learners in AutoML Run: ['xgb_limitdepth']
[flaml.automl.logger: 06-16 17:10:20] {2219} INFO - iteration 0, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 17:10:21] {2345} INFO - Estimated sufficient time budget=13401s. Estimated necessary time budget=13s.
[flaml.automl.logger: 06-16 17:10:21] {2392} INFO -  at 1.7s,	estimator xgb_limitdepth's best error=0.0807,	best estimator xgb_limitdepth's best error=0.0807
[flaml.automl.logger: 06-16 17:10:21] {2219} INFO - iteration 1, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 17:10:22] {2392} INFO -  at 3.0s,	estimator xgb_

Unnamed: 0,pred_Dropout,pred_Enrolled,pred_Graduate
0,0.995140,0.002671,0.002188
1,0.003964,0.011303,0.984732
2,0.039133,0.241275,0.719592
3,0.190587,0.374050,0.435363
4,0.253132,0.700345,0.046523
...,...,...,...
51007,0.872543,0.050366,0.077091
51008,0.986633,0.013246,0.000121
51009,0.976407,0.017002,0.006591
51010,0.866007,0.118625,0.015369
