In [1]:
%%capture
!pip install -U lightautoml

In [2]:
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector,ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [3]:
# Notebook-wide settings.
N_THREADS = 4           # forwarded to every LightGBM model as `num_threads`
N_FOLDS = 10            # `cv` argument of the reader
RANDOM_STATE = 42       # seed for the hold-out split and the reader
TEST_SIZE = 0.2         # `test_size` of the hold-out split
TIMEOUT = 10 * 60 * 60  # 36000; NOTE(review): not referenced by any cell visible here
TARGET_NAME = 'Target'  # name of the label column

In [4]:
# Load the competition's training and test tables, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e6/train.csv',
        '/kaggle/input/playground-series-s4e6/test.csv',
    )
)

In [6]:
%%time
# Stratified hold-out split of the labelled data: a TEST_SIZE share of the
# rows goes to `test_data`, the remainder to `train_data`. Stratifying on the
# target column keeps the class balance comparable in both parts.
train_data, test_data = train_test_split(
    train,
    random_state=RANDOM_STATE,
    stratify=train[TARGET_NAME],
    test_size=TEST_SIZE,
)

CPU times: user 102 ms, sys: 6.08 ms, total: 108 ms
Wall time: 112 ms


In [7]:
# Multiclass task definition, plus the reader that is later handed to AutoML;
# the reader is configured with N_FOLDS as `cv` and a fixed random seed.
task = Task('multiclass')
reader = PandasToPandasReader(
    task,
    random_state=RANDOM_STATE,
    cv=N_FOLDS,
)

In [9]:
 %%time
# Pre-selection stage: a LightGBM model (model0) fitted on simple LGB features
# supplies the importances consumed by the cutoff selector.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params=dict(
        learning_rate=0.05,
        num_leaves=64,
        seed=42,
        num_threads=N_THREADS,
    )
)
# NOTE(review): cutoff=0 presumably keeps only features whose importance is
# above zero — confirm against the ImportanceCutoffSelector documentation.
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

CPU times: user 283 µs, sys: 18 µs, total: 301 µs
Wall time: 306 µs


In [11]:
 %%time
# First-level pipeline: two LightGBM models that share one feature pipeline
# and use `selector` for pre-selection. model1 is paired with an Optuna tuner;
# model2 is passed without a tuner.
pipe = LGBSimpleFeatures()

# Tuner budget exactly as configured: n_trials=1000 and timeout=39000
# (presumably seconds — confirm against the OptunaTuner documentation).
params_tuner1 = OptunaTuner(timeout=39000, n_trials=1000)

model1 = BoostLGBM(
    default_params=dict(
        learning_rate=0.05,
        num_leaves=128,
        seed=1,
        num_threads=N_THREADS,
    )
)
model2 = BoostLGBM(
    default_params=dict(
        learning_rate=0.025,
        num_leaves=64,
        seed=2,
        num_threads=N_THREADS,
    )
)

pipeline_lvl1 = MLPipeline(
    [(model1, params_tuner1), model2],
    features_pipeline=pipe,
    pre_selection=selector,
    post_selection=None,
)


CPU times: user 106 µs, sys: 0 ns, total: 106 µs
Wall time: 110 µs


In [12]:
%%time
# Second-level pipeline: a single LightGBM model with its own feature
# pipeline and no feature selection before or after it.
pipe1 = LGBSimpleFeatures()
model = BoostLGBM(
    # NOTE(review): freeze_defaults=True presumably stops the library from
    # adjusting the parameters below — confirm against the BoostLGBM docs.
    freeze_defaults=True,
    default_params=dict(
        learning_rate=0.05,
        num_leaves=64,
        max_bin=1024,
        seed=3,
        num_threads=N_THREADS,
    ),
)
pipeline_lvl2 = MLPipeline(
    [model],
    features_pipeline=pipe1,
    pre_selection=None,
    post_selection=None,
)

CPU times: user 90 µs, sys: 0 ns, total: 90 µs
Wall time: 93.9 µs


In [13]:
 %%time
# Assemble the two-level AutoML: level 1 holds pipeline_lvl1, level 2 holds
# pipeline_lvl2, and skip connections between levels are disabled.
automl = AutoML(
    reader,
    [[pipeline_lvl1], [pipeline_lvl2]],
    skip_conn=False,
)

CPU times: user 43 µs, sys: 3 µs, total: 46 µs
Wall time: 49.8 µs


In [14]:
 %%time
# Fit on the training part of the split; the 'id' column is dropped and
# TARGET_NAME is declared as the target role. The result is printed together
# with its shape.
oof_pred = automl.fit_predict(
    train_data,
    roles=dict(target=TARGET_NAME, drop=['id']),
)
print(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)


oof_pred:
array([[0.9332447 , 0.02432532, 0.04242998],
       [0.00372   , 0.98913664, 0.00714337],
       [0.8081205 , 0.10492322, 0.08695631],
       ...,
       [0.91556394, 0.01914421, 0.06529185],
       [0.63821983, 0.11725423, 0.24452594],
       [0.01176137, 0.89813316, 0.09010547]], dtype=float32)
Shape = (61214, 3)
CPU times: user 21min 58s, sys: 16.3 s, total: 22min 14s
Wall time: 5min 54s


In [15]:
%%time
# Report feature scores: first the pre-selection selector, then three fitted
# algorithms addressed as automl.levels[level][0].ml_algos[algo]. Every report
# is followed by a 70-character separator line.
print('Feature importances of selector:\n{}'.format(selector.get_features_score()))
print('=' * 70)

for report_template, level_idx, algo_idx in (
    ('Feature importances of top level algorithm:\n{}', -1, 0),
    ('Feature importances of lowest level algorithm - model 0:\n{}', 0, 0),
    ('Feature importances of lowest level algorithm - model 1:\n{}', 0, 1),
):
    # Each algorithm is looked up only when its report is printed, so the
    # evaluation order matches four consecutive print statements.
    fitted_algo = automl.levels[level_idx][0].ml_algos[algo_idx]
    print(report_template.format(fitted_algo.get_features_score()))
    print('=' * 70)


Feature importances of selector:
Curricular units 2nd sem (approved)               248343.779711
Curricular units 2nd sem (grade)                   68605.854563
Curricular units 1st sem (approved)                59967.417771
Curricular units 2nd sem (evaluations)             25482.511462
Curricular units 1st sem (grade)                   23272.887287
Tuition fees up to date                            22010.078986
Admission grade                                    16272.744815
Curricular units 1st sem (evaluations)             15841.870934
Course                                             11734.622446
Previous qualification (grade)                     10372.149995
Curricular units 2nd sem (enrolled)                 9915.838797
Age at enrollment                                   9869.758956
Scholarship holder                                  9654.622764
Father's occupation                                 6908.007308
Curricular units 1st sem (enrolled)                 6250.524729
GDP    

In [10]:
# NOTE(review): this cell is disabled and its execution count (In [10]) is out
# of sequence with its neighbours. The saved output mentions a CatBoost model,
# which none of the pipelines defined in this notebook contain, so the output
# appears to be left over from an earlier run — TODO confirm and clear it.
# print(automl.create_model_str_desc())

Final prediction for new objects (level 0) = 
	 0.81410 * (1 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) +
	 0.18590 * (1 averaged models Lvl_0_Pipe_1_Mod_2_CatBoost) 


In [16]:
%%time
# Predict on the hold-out part of the split (test_data, not the competition
# test set). The bare expression on the last line makes the notebook display
# the result.
test_pred = automl.predict(test_data)
test_pred

CPU times: user 59.8 s, sys: 50.6 ms, total: 59.9 s
Wall time: 15.8 s


array([[0.00825326, 0.986689  , 0.00505775],
       [0.9461671 , 0.01724216, 0.03659075],
       [0.01965619, 0.90459335, 0.07575045],
       ...,
       [0.00714429, 0.9740584 , 0.0187973 ],
       [0.0310234 , 0.5439169 , 0.4250597 ],
       [0.98786855, 0.00714381, 0.00498765]], dtype=float32)

In [23]:
# Translate predicted class indices back into the original target labels.
#
# The reader stores the label -> index encoding it applied to the target
# during fit. Inverting that mapping keeps the decoding in sync with the
# fitted model, instead of relying on a hand-written dictionary that would
# silently mislabel predictions if the encoding were ever different.
# The original hard-coded mapping is the fallback when the reader exposes no
# mapping (attribute missing, None, or empty).
_label_to_index = getattr(automl.reader, 'class_mapping', None)
if _label_to_index:
    class_mapping = {index: label for label, index in _label_to_index.items()}
else:
    class_mapping = {0: 'Graduate', 1: 'Dropout', 2: 'Enrolled'}

# Hold-out predictions: index of the highest-scoring class per row -> label.
te_pred = np.argmax(np.array(test_pred.data), axis=1)
final_test_pred = [class_mapping[value] for value in te_pred]

# Out-of-fold predictions for the training part, decoded the same way.
oof_pred_ = np.argmax(np.array(oof_pred.data), axis=1)
final_oof_pred = [class_mapping[value] for value in oof_pred_]

In [24]:
 %%time
# `test_pred` is not recomputed here; it is the result of the earlier
# automl.predict(test_data) call.
# "TEST score" below is the accuracy on the hold-out split (test_data), not on
# the competition test set.
print(
    'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
)
print('Check scores...')
print('OOF score: {}'.format(
    accuracy_score(y_true=train_data[TARGET_NAME].values, y_pred=final_oof_pred)
))
print('TEST score: {}'.format(
    accuracy_score(y_true=test_data[TARGET_NAME].values, y_pred=final_test_pred)
))


Prediction for test data:
array([[0.00825326, 0.986689  , 0.00505775],
       [0.9461671 , 0.01724216, 0.03659075],
       [0.01965619, 0.90459335, 0.07575045],
       ...,
       [0.00714429, 0.9740584 , 0.0187973 ],
       [0.0310234 , 0.5439169 , 0.4250597 ],
       [0.98786855, 0.00714381, 0.00498765]], dtype=float32)
Shape = (15304, 3)
Check scores...
OOF score: 0.8309537034011827
TEST score: 0.8309592263460533
CPU times: user 130 ms, sys: 2.02 ms, total: 132 ms
Wall time: 129 ms


## Final Prediction on test set

In [25]:
# Run the fitted AutoML on the competition test table loaded from test.csv.
preds = automl.predict(test)

In [26]:
# Per row, take the column index with the highest predicted value.
final_preds = np.asarray(preds.data).argmax(axis=1)

In [27]:
# Decode each predicted class index into its label via class_mapping.
final_prediction = [
    class_mapping[class_index]
    for class_index in final_preds
]

In [28]:
# Build the submission file from the competition's sample submission.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv, since predictions are assigned positionally — TODO confirm.
submit = pd.read_csv('/kaggle/input/playground-series-s4e6/sample_submission.csv')

# Bracket assignment always writes the column. Attribute assignment
# (`submit.Target = ...`) only does so when the column already exists and
# otherwise sets a plain Python attribute, leaving the frame unchanged.
submit['Target'] = final_prediction

submit.to_csv('submission.csv', index=False)
submit

Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Enrolled
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout
