In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import datatable as dt
import time
from flaml import AutoML

from numba import jit #to speed up loops

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Getting Data from folder
start = time.time()
train = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\train.csv").to_pandas()
test = dt.fread(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Oct 2021\test.csv").to_pandas()
end = time.time()
total_time = end-start
print("DONE LOADING! Time taken:{:.2f}".format(total_time))

DONE LOADING! Time taken:40.31


Reducing memeory usage (change data type)

In [3]:
# this function will help to reduce momory 
# data will be samller with the same value

@jit(forceobj=True)
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
        
            
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
#reducing the memory of data types
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 1878.74 MB
Memory usage after optimization is: 549.32 MB
Decreased by 70.8%
Memory usage of dataframe is 938.89 MB
Memory usage after optimization is: 273.70 MB
Decreased by 70.8%


splitting the data

In [5]:
# seperate data and target labels
X, y = train.iloc[:,1:-1], train.iloc[:,-1:]

In [6]:
# seperate test into ID and Data
ID_test = test.iloc[:,:1]
X_test = test.iloc[:,1:]

In [7]:
y = np.ravel(y)

AUTOML MODEL FIT

In [8]:
model = AutoML()

In [9]:
model.fit(X,y,metric='roc_auc', time_budget=5*3600,verbose=2)

[flaml.automl: 10-06 22:49:47] {1432} INFO - Evaluation method: holdout
[flaml.automl: 10-06 22:49:53] {1478} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 10-06 22:49:53] {1515} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 10-06 22:49:53] {1748} INFO - iteration 0, current learner lgbm
[flaml.tune.tune: 10-06 22:49:54] {383} INFO - trial 1 config: {'n_estimators': 4, 'num_leaves': 4, 'min_child_samples': 20, 'learning_rate': 0.09999999999999995, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.0, 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:49:54] {1865} INFO - Estimated sufficient time budget=906091s. Estimated necessary time budget=16446s.
[flaml.automl: 10-06 22:49:54] {1938} INFO -  at 105.9s,	estimator lgbm's best error=0.2080,	best estimator lgbm's best error=0.2080
[flaml.automl: 10-06 22:49:54] {1748} INFO - iteration 1, current learner lgbm
[flaml.tun

[flaml.automl: 10-06 22:50:09] {1748} INFO - iteration 16, current learner rf
[flaml.tune.tune: 10-06 22:50:09] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_features': 0.9692029582222275, 'max_leaves': 13, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:50:10] {1938} INFO -  at 121.9s,	estimator rf's best error=0.1868,	best estimator lgbm's best error=0.1737
[flaml.automl: 10-06 22:50:10] {1748} INFO - iteration 17, current learner extra_tree
[flaml.tune.tune: 10-06 22:50:10] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_features': 0.9692029582222269, 'max_leaves': 13, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:50:11] {1938} INFO -  at 122.5s,	estimator extra_tree's best error=0.1815,	best estimator lgbm's best error=0.1737
[flaml.automl: 10-06 22:50:11] {1748} INFO - iteration 18, current learner lgbm
[flaml.tune.tune: 10-06 22:50:11] {383} INFO - trial 1 config: {'n_estimators': 15, 'num_leaves': 4, 'min_child_sa

[flaml.automl: 10-06 22:50:47] {1748} INFO - iteration 33, current learner xgboost
[flaml.tune.tune: 10-06 22:50:47] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 8.551327855112987, 'learning_rate': 0.0794265994730958, 'subsample': 0.8866076531034812, 'colsample_bylevel': 0.7164309072871456, 'colsample_bytree': 0.788744259288662, 'reg_alpha': 0.0017273290408679647, 'reg_lambda': 19.450171169544824, 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:50:49] {1938} INFO -  at 159.9s,	estimator xgboost's best error=0.1896,	best estimator lgbm's best error=0.1608
[flaml.automl: 10-06 22:50:49] {1748} INFO - iteration 34, current learner lgbm
[flaml.tune.tune: 10-06 22:50:49] {383} INFO - trial 1 config: {'n_estimators': 367, 'num_leaves': 4, 'min_child_samples': 9, 'learning_rate': 0.011408787540202704, 'log_max_bin': 9, 'colsample_bytree': 0.4540242457578447, 'reg_alpha': 0.001858538296879656, 'reg_lambda': 0.022504063052032567, 'FLAML_sample_size':

[flaml.automl: 10-06 22:52:02] {1938} INFO -  at 233.8s,	estimator xgboost's best error=0.1592,	best estimator lgbm's best error=0.1570
[flaml.automl: 10-06 22:52:02] {1748} INFO - iteration 49, current learner lgbm
[flaml.tune.tune: 10-06 22:52:02] {383} INFO - trial 1 config: {'n_estimators': 87, 'num_leaves': 8, 'min_child_samples': 7, 'learning_rate': 0.12253425750631962, 'log_max_bin': 10, 'colsample_bytree': 0.7700247033676194, 'reg_alpha': 0.0015564673105246886, 'reg_lambda': 0.014120003302197788, 'FLAML_sample_size': 40000}
[flaml.automl: 10-06 22:52:07] {1938} INFO -  at 238.7s,	estimator lgbm's best error=0.1565,	best estimator lgbm's best error=0.1565
[flaml.automl: 10-06 22:52:07] {1748} INFO - iteration 50, current learner rf
[flaml.tune.tune: 10-06 22:52:07] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_features': 0.9692029582222269, 'max_leaves': 13, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:52:09] {1938} INFO -  at 240.5s,	estim

[flaml.automl: 10-06 22:54:04] {1938} INFO -  at 355.5s,	estimator lgbm's best error=0.1522,	best estimator lgbm's best error=0.1522
[flaml.automl: 10-06 22:54:04] {1748} INFO - iteration 66, current learner rf
[flaml.tune.tune: 10-06 22:54:04] {383} INFO - trial 1 config: {'n_estimators': 4, 'max_features': 0.6410790755454192, 'max_leaves': 20, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 22:54:05] {1938} INFO -  at 356.6s,	estimator rf's best error=0.1814,	best estimator lgbm's best error=0.1522
[flaml.automl: 10-06 22:54:05] {1748} INFO - iteration 67, current learner xgboost
[flaml.tune.tune: 10-06 22:54:05] {383} INFO - trial 1 config: {'n_estimators': 160, 'max_leaves': 9, 'min_child_weight': 93.72300482528745, 'learning_rate': 0.08283108041569484, 'subsample': 0.8895588746662894, 'colsample_bylevel': 0.5471471897703085, 'colsample_bytree': 0.7705186512218506, 'reg_alpha': 0.0015245843735931768, 'reg_lambda': 12.241523485667024, 'FLAML_sample_size': 40000

[flaml.automl: 10-06 22:59:45] {1748} INFO - iteration 83, current learner catboost
[flaml.tune.tune: 10-06 22:59:45] {383} INFO - trial 1 config: {'early_stopping_rounds': 10, 'learning_rate': 0.2, 'FLAML_sample_size': 40000}
[flaml.automl: 10-06 23:00:01] {1938} INFO -  at 712.6s,	estimator catboost's best error=0.1537,	best estimator lgbm's best error=0.1455
[flaml.automl: 10-06 23:00:01] {1748} INFO - iteration 84, current learner lrl1
No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'.
[flaml.tune.tune: 10-06 23:00:01] {383} INFO - trial 1 config: {'C': 1.0, 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 23:00:06] {1938} INFO -  at 717.0s,	estimator lrl1's best error=0.1695,	best estimator lgbm's best error=0.1455
[flaml.automl: 10-06 23:00:06] {1748} INFO - iteration 85, current learner lrl1
[flaml.tune.tune: 10-06 23:00:06] {383} INFO - trial 1 config: {'C': 0.2

[flaml.tune.tune: 10-06 23:07:04] {383} INFO - trial 1 config: {'n_estimators': 2164, 'num_leaves': 11, 'min_child_samples': 2, 'learning_rate': 0.42439733414465625, 'log_max_bin': 7, 'colsample_bytree': 0.7243710802709122, 'reg_alpha': 0.0009765625, 'reg_lambda': 0.1300144275114188, 'FLAML_sample_size': 900000}
[flaml.automl: 10-06 23:12:26] {1938} INFO -  at 1457.5s,	estimator lgbm's best error=0.1455,	best estimator lgbm's best error=0.1455
[flaml.automl: 10-06 23:12:26] {1748} INFO - iteration 105, current learner extra_tree
[flaml.tune.tune: 10-06 23:12:26] {383} INFO - trial 1 config: {'n_estimators': 93, 'max_features': 0.614156489203111, 'max_leaves': 15, 'criterion': 'gini', 'FLAML_sample_size': 10000}
[flaml.automl: 10-06 23:12:28] {1938} INFO -  at 1459.6s,	estimator extra_tree's best error=0.1752,	best estimator lgbm's best error=0.1455
[flaml.automl: 10-06 23:12:28] {1748} INFO - iteration 106, current learner xgboost
[flaml.tune.tune: 10-06 23:12:28] {383} INFO - trial 1 

[flaml.automl: 10-06 23:23:14] {1748} INFO - iteration 123, current learner xgboost
[flaml.tune.tune: 10-06 23:23:14] {383} INFO - trial 1 config: {'n_estimators': 125, 'max_leaves': 190, 'min_child_weight': 20.807842099430747, 'learning_rate': 0.02380640489504616, 'subsample': 0.8996608306054059, 'colsample_bylevel': 0.5249573810809469, 'colsample_bytree': 0.799097902538782, 'reg_alpha': 0.0009765625, 'reg_lambda': 140.80395051314565, 'FLAML_sample_size': 40000}
[flaml.automl: 10-06 23:23:46] {1938} INFO -  at 2137.3s,	estimator xgboost's best error=0.1526,	best estimator lgbm's best error=0.1446
[flaml.automl: 10-06 23:23:46] {1748} INFO - iteration 124, current learner catboost
[flaml.tune.tune: 10-06 23:23:46] {383} INFO - trial 1 config: {'early_stopping_rounds': 11, 'learning_rate': 0.005866846556775201, 'FLAML_sample_size': 160000}
[flaml.automl: 10-06 23:24:40] {1938} INFO -  at 2191.8s,	estimator catboost's best error=0.1487,	best estimator lgbm's best error=0.1446
[flaml.auto

[flaml.automl: 10-07 00:29:52] {1938} INFO -  at 6103.0s,	estimator xgboost's best error=0.1482,	best estimator lgbm's best error=0.1442
[flaml.automl: 10-07 00:29:52] {1748} INFO - iteration 141, current learner rf
[flaml.tune.tune: 10-07 00:29:52] {383} INFO - trial 1 config: {'n_estimators': 36, 'max_features': 0.5229242759531132, 'max_leaves': 74, 'criterion': 'entropy', 'FLAML_sample_size': 10000}
[flaml.automl: 10-07 00:30:03] {1938} INFO -  at 6114.3s,	estimator rf's best error=0.1753,	best estimator lgbm's best error=0.1442
[flaml.automl: 10-07 00:30:03] {1748} INFO - iteration 142, current learner xgboost
[flaml.tune.tune: 10-07 00:30:03] {383} INFO - trial 1 config: {'n_estimators': 277, 'max_leaves': 39, 'min_child_weight': 20.531012353518616, 'learning_rate': 0.020890565272497986, 'subsample': 0.93649839568779, 'colsample_bylevel': 0.23338384524947003, 'colsample_bytree': 0.741299553682497, 'reg_alpha': 0.0009765625, 'reg_lambda': 14.451665100581856, 'FLAML_sample_size': 16

[flaml.automl: 10-07 03:07:08] {1938} INFO -  at 15539.4s,	estimator lrl1's best error=0.1603,	best estimator lgbm's best error=0.1442
[flaml.automl: 10-07 03:07:08] {1748} INFO - iteration 158, current learner catboost
[flaml.tune.tune: 10-07 03:07:08] {383} INFO - trial 1 config: {'early_stopping_rounds': 10, 'learning_rate': 0.005, 'FLAML_sample_size': 900000}
[flaml.automl: 10-07 03:09:31] {1938} INFO -  at 15682.5s,	estimator catboost's best error=0.1467,	best estimator lgbm's best error=0.1442
[flaml.automl: 10-07 03:09:31] {1748} INFO - iteration 159, current learner rf
[flaml.tune.tune: 10-07 03:09:31] {383} INFO - trial 1 config: {'n_estimators': 59, 'max_features': 0.6772875021918483, 'max_leaves': 83, 'criterion': 'gini', 'FLAML_sample_size': 40000}
[flaml.automl: 10-07 03:10:22] {1938} INFO -  at 15733.7s,	estimator rf's best error=0.1729,	best estimator lgbm's best error=0.1442
[flaml.automl: 10-07 03:10:22] {1748} INFO - iteration 160, current learner rf
[flaml.tune.tune:

In [10]:
import pickle

In [11]:
filename = 'finalized_flaml_model_{}.sav'.format(time.time())
pickle.dump(model, open(filename, 'wb'))

In [12]:
submission_pred = model.predict(X_test)

In [13]:
submission_pred.shape

(500000,)

In [14]:
submit_final = pd.concat([ID_test,pd.DataFrame(submission_pred)],axis=1)
submit_final.shape

(500000, 2)

In [15]:
submit_final.to_csv('oct_AutoML_{}.csv'.format(time.time()),index_label=False)

In [17]:
from sklearn.metrics import roc_auc_score
small_pred = model.predict(X[:50000])
roc_auc = roc_auc_score(y[:50000],small_pred)
roc_auc

0.770607072308618