In [1]:
%%capture
!pip install -U lightautoml

In [2]:
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch

from sklearn.preprocessing import LabelEncoder

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from lightautoml.automl.base import AutoML
from lightautoml.ml_algo.boost_lgbm import BoostLGBM
from lightautoml.ml_algo.tuning.optuna import OptunaTuner
from lightautoml.pipelines.features.lgb_pipeline import LGBSimpleFeatures
from lightautoml.pipelines.ml.base import MLPipeline
from lightautoml.pipelines.selection.importance_based import ImportanceCutoffSelector,ModelBasedImportanceEstimator
from lightautoml.reader.base import PandasToPandasReader
from lightautoml.tasks import Task
from lightautoml.automl.blend import WeightedBlender

In [3]:
# --- Run configuration ---
N_THREADS = 4            # forwarded to every LightGBM model as `num_threads`
N_FOLDS = 20             # `cv` argument of the reader
RANDOM_STATE = 42        # seed for the hold-out split and for the reader
TEST_SIZE = 0.2          # share of train.csv kept aside as a local hold-out
TIMEOUT = 60 * 60 * 5    # 18000; not referenced by the cells visible in this notebook
TARGET_NAME = 'Target'   # label column

In [4]:
# Read train.csv (contains the target column) and test.csv (scored later for
# the submission) in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s4e6/train.csv',
        '/kaggle/input/playground-series-s4e6/test.csv',
    )
)

In [5]:
%%time
# Stratified hold-out split of train.csv. Note that `test_data` is a labelled
# validation slice of the training file, not the competition test file.
train_data, test_data = train_test_split(
    train,
    test_size=TEST_SIZE,
    stratify=train[TARGET_NAME],
    random_state=RANDOM_STATE,
)

CPU times: user 171 ms, sys: 5.9 ms, total: 177 ms
Wall time: 186 ms


In [6]:
# Multiclass task and the reader that will be handed to AutoML, configured
# with cv=N_FOLDS and a fixed random state.
task = Task('multiclass')
reader = PandasToPandasReader(
    task,
    cv=N_FOLDS,
    random_state=RANDOM_STATE,
)

In [7]:
 %%time
# Importance-based feature selector: its own feature pipeline, a dedicated
# LightGBM model, and a model-based importance estimator, with cutoff=0.
pipe0 = LGBSimpleFeatures()
mbie = ModelBasedImportanceEstimator()
model0 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 78,
        'seed': 42,
        'num_threads': N_THREADS,
    },
)
selector = ImportanceCutoffSelector(pipe0, model0, mbie, cutoff=0)

CPU times: user 403 µs, sys: 0 ns, total: 403 µs
Wall time: 409 µs


In [8]:
# Level-1 pipeline: two LightGBM models sharing one feature pipeline and the
# pre-selection step defined by `selector`.
pipe = LGBSimpleFeatures()

# Optuna budget for model1: n_trials=100 with timeout=3000.
params_tuner1 = OptunaTuner(n_trials=100, timeout=3000)

model1 = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 128,
        'seed': 1,
        'num_threads': N_THREADS,
    },
)
model2 = BoostLGBM(
    default_params={
        'learning_rate': 0.025,
        'num_leaves': 64,
        'seed': 2,
        'num_threads': N_THREADS,
    },
)

# model1 is paired with its tuner; model2 is passed on its own.
pipeline_lvl1 = MLPipeline(
    [(model1, params_tuner1), model2],
    pre_selection=selector,
    features_pipeline=pipe,
    post_selection=None,
)

In [9]:
%%time
# Level-2 pipeline: a single LightGBM model with frozen default parameters
# and no feature selection before or after it.
pipe1 = LGBSimpleFeatures()
model = BoostLGBM(
    default_params={
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_bin': 1024,
        'seed': 3,
        'num_threads': N_THREADS,
    },
    freeze_defaults=True,
)
pipeline_lvl2 = MLPipeline(
    [model],
    pre_selection=None,
    features_pipeline=pipe1,
    post_selection=None,
)

CPU times: user 97 µs, sys: 5 µs, total: 102 µs
Wall time: 108 µs


In [10]:
 %%time
# Assemble the two-level model: one pipeline per level, skip_conn disabled.
automl = AutoML(
    reader,
    [[pipeline_lvl1], [pipeline_lvl2]],
    skip_conn=False,
)

CPU times: user 57 µs, sys: 3 µs, total: 60 µs
Wall time: 65.3 µs


In [11]:
 %%time
# Fit on the training split. Roles mark the target column and drop 'id'.
# The returned object is printed together with its shape.
oof_pred = automl.fit_predict(
    train_data,
    roles={'target': TARGET_NAME, 'drop': ['id']},
)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))


oof_pred:
array([[0.9112515 , 0.03437404, 0.05437445],
       [0.00329686, 0.98463106, 0.01207209],
       [0.8570297 , 0.06512046, 0.07784985],
       ...,
       [0.8357799 , 0.02477043, 0.13944964],
       [0.66141105, 0.08205832, 0.25653064],
       [0.00748962, 0.89495575, 0.0975546 ]], dtype=float32)
Shape = (61214, 3)
CPU times: user 1h 48min 58s, sys: 1min 24s, total: 1h 50min 22s
Wall time: 28min 32s


In [12]:
%%time
# Print feature scores for the selector, the first algorithm of the last
# level, and the first two algorithms of the first level, each followed by a
# 70-character divider.
divider = '=' * 70

print('Feature importances of selector:\n{}'.format(selector.get_features_score()))
print(divider)

top_level_algo = automl.levels[-1][0].ml_algos[0]
print('Feature importances of top level algorithm:\n{}'.format(top_level_algo.get_features_score()))
print(divider)

lowest_level_first = automl.levels[0][0].ml_algos[0]
print('Feature importances of lowest level algorithm - model 0:\n{}'.format(lowest_level_first.get_features_score()))
print(divider)

lowest_level_second = automl.levels[0][0].ml_algos[1]
print('Feature importances of lowest level algorithm - model 1:\n{}'.format(lowest_level_second.get_features_score()))
print(divider)


Feature importances of selector:
Curricular units 2nd sem (approved)               260680.312033
Curricular units 1st sem (approved)                75381.769048
Curricular units 2nd sem (grade)                   70587.104404
Curricular units 2nd sem (evaluations)             30740.492733
Tuition fees up to date                            22856.546170
Curricular units 1st sem (grade)                   17384.426910
Admission grade                                    16707.729537
Curricular units 1st sem (evaluations)             12817.875477
Course                                             11800.132346
Curricular units 2nd sem (enrolled)                11047.054627
Age at enrollment                                  10469.078682
Previous qualification (grade)                     10053.551858
Scholarship holder                                  9775.860710
Father's occupation                                 7254.622168
Mother's occupation                                 6061.635454
GDP    

In [13]:
# Optional: uncomment to print a text description of the fitted model
# (presumably the pipeline structure — confirm against the LightAutoML docs).
# print(automl.create_model_str_desc())

In [14]:
%%time
# Predict on the labelled hold-out split (`test_data`), not on the
# competition test file; the bare name on the last line displays the result.
test_pred = automl.predict(test_data)
test_pred

CPU times: user 2min 16s, sys: 104 ms, total: 2min 16s
Wall time: 35.1 s


array([[0.00933286, 0.9864626 , 0.00420451],
       [0.9561636 , 0.015925  , 0.02791139],
       [0.02271964, 0.92311734, 0.054163  ],
       ...,
       [0.00718003, 0.9765776 , 0.01624238],
       [0.02236639, 0.6327946 , 0.34483898],
       [0.9873146 , 0.00790529, 0.00478013]], dtype=float32)

In [15]:
%%time
# Build the index -> label lookup from the encoding the fitted reader
# actually applied to the target, instead of hard-coding the class order.
# The reader chooses that encoding when it is fitted, so a hand-written dict
# can silently mislabel every prediction if the data or split changes.
label_to_index = automl.reader.class_mapping
if label_to_index is None:
    # No encoding was applied: probability column i already stands for class i.
    class_mapping = {i: i for i in range(np.array(oof_pred.data).shape[1])}
else:
    class_mapping = {index: label for label, index in label_to_index.items()}

# Most probable class per row on the hold-out split, decoded to labels.
te_pred = np.argmax(np.array(test_pred.data), axis=1)
final_test_pred = [class_mapping[value] for value in te_pred]

# Same decoding for the out-of-fold predictions on the training split.
oof_pred_ = np.argmax(np.array(oof_pred.data), axis=1)
final_oof_pred = [class_mapping[value] for value in oof_pred_]

CPU times: user 29.9 ms, sys: 6 µs, total: 29.9 ms
Wall time: 28.9 ms


In [16]:
 %%time
# Show the hold-out predictions, then report accuracy on the out-of-fold
# predictions (training split) and on the hold-out split.
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))
print('Check scores...')

oof_accuracy = accuracy_score(train_data[TARGET_NAME].values, final_oof_pred)
print('OOF score: {}'.format(oof_accuracy))

holdout_accuracy = accuracy_score(test_data[TARGET_NAME].values, final_test_pred)
print('TEST score: {}'.format(holdout_accuracy))


Prediction for test data:
array([[0.00933286, 0.9864626 , 0.00420451],
       [0.9561636 , 0.015925  , 0.02791139],
       [0.02271964, 0.92311734, 0.054163  ],
       ...,
       [0.00718003, 0.9765776 , 0.01624238],
       [0.02236639, 0.6327946 , 0.34483898],
       [0.9873146 , 0.00790529, 0.00478013]], dtype=float32)
Shape = (15304, 3)
Check scores...
OOF score: 0.8316888293527624
TEST score: 0.8314166231050706
CPU times: user 220 ms, sys: 3.02 ms, total: 223 ms
Wall time: 221 ms


## Final prediction on the competition test set (`test.csv`)

In [17]:
%%time
# Score the frame loaded from test.csv with the fitted model; this is a
# different frame from `test_data`, the hold-out split scored earlier.
preds = automl.predict(test)

CPU times: user 7min 37s, sys: 390 ms, total: 7min 38s
Wall time: 1min 58s


In [18]:
%%time
# Column index of the highest value in each prediction row.
final_preds = np.array(preds.data).argmax(axis=1)

CPU times: user 1.24 ms, sys: 1 ms, total: 2.24 ms
Wall time: 1.5 ms


In [19]:
%%time
# Translate each predicted class index into its label via `class_mapping`.
final_prediction = [class_mapping[class_index] for class_index in final_preds]

CPU times: user 18.3 ms, sys: 1.93 ms, total: 20.2 ms
Wall time: 19.7 ms


In [20]:
%%time
# Fill the sample-submission template with the decoded predictions and
# write it to submission.csv.
submit = pd.read_csv('/kaggle/input/playground-series-s4e6/sample_submission.csv')

# Predictions are assigned by position, so the template must list the same
# ids in the same order as `test`; fail loudly rather than write a
# misaligned submission.
if not np.array_equal(submit['id'].values, test['id'].values):
    raise ValueError('sample_submission.csv ids do not match test.csv ids')

# Item assignment instead of `submit.Target = ...`: attribute assignment only
# updates a column when one with that name already exists.
submit[TARGET_NAME] = final_prediction
submit.to_csv('submission.csv', index=False)
submit

CPU times: user 126 ms, sys: 7 ms, total: 133 ms
Wall time: 156 ms


Unnamed: 0,id,Target
0,76518,Dropout
1,76519,Graduate
2,76520,Graduate
3,76521,Graduate
4,76522,Enrolled
...,...,...
51007,127525,Dropout
51008,127526,Dropout
51009,127527,Dropout
51010,127528,Dropout
