In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train 2.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test 2.csv


In [2]:
import numpy as np
import pandas as pd
import optuna
import lightgbm as lgb
from path import Path
from sklearn.model_selection import StratifiedKFold

In [3]:
class Config:
    """Paths, run switches and hyper-parameters shared by the cells of this notebook."""

    # Directory holding the competition CSV files.
    input_path = Path('../input/porto-seguro-safe-driver-prediction')

    # When True the Optuna search cell runs; otherwise `params` below is used as-is.
    optuna_lgb = False

    # Cross-validation and boosting budget.
    random_state = 0
    cv_folds = 5
    n_estimators = 1500
    early_stopping_round = 150

    # Fixed LightGBM parameters used when the Optuna search is switched off.
    params = dict(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.01,
        max_bin=25,
        num_leaves=31,
        min_child_samples=1500,
        colsample_bytree=0.7,
        subsample_freq=1,
        subsample=0.7,
        reg_alpha=1.0,
        reg_lambda=1.0,
        verbosity=0,
        random_state=0,
    )


config = Config()

In [4]:
# Echo the config object. Config defines no __repr__, so only the default object repr is shown.
config

<__main__.Config at 0x7cf48e612dd0>

In [5]:
# Load the three competition files, each indexed by its `id` column.
train, test, submission = (
    pd.read_csv(config.input_path / csv_name, index_col='id')
    for csv_name in ('train 2.csv', 'test 2.csv', 'sample_submission.csv')
)

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [7]:
# Group columns by the substring in their name: "_calc" and "_cat".
calc_features = list(filter(lambda col: "_calc" in col, train.columns))
cat_features = list(filter(lambda col: "_cat" in col, train.columns))

In [8]:
# Show the column names collected in cat_features.
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [9]:
# Keep the label column as its own Series.
target = train.loc[:, "target"]

In [10]:
# Inspect the extracted target labels.
target

id
7          0
9          0
13         0
16         0
17         0
          ..
1488013    0
1488016    0
1488017    0
1488021    0
1488027    0
Name: target, Length: 595212, dtype: int64

In [11]:
# Remove the label column so `train` holds features only.
train = train.drop(columns="target")

In [12]:
# List the columns currently in the training frame.
train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

In [13]:
# Drop every "_calc" column from the training features.
train = train.drop(columns=calc_features)

In [14]:
# Drop the same "_calc" columns from the test features.
test = test.drop(columns=calc_features)

In [16]:
# List the training columns again to check which ones remain.
train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15'],
      dtype='object')

In [17]:
# One-hot encode the categorical columns of the training features.
train = train.pipe(pd.get_dummies, columns=cat_features)

In [20]:
# One-hot encode the same categorical columns of the test features.
test = test.pipe(pd.get_dummies, columns=cat_features)

*This assert statement checks that the columns of the training and test datasets are identical,
both in name and in order. If the columns don't match exactly, the cell stops with an error instead of letting the model train and predict on misaligned features.*

In [22]:
# Train and test are one-hot encoded separately, so verify that they ended up with the
# same columns in the same order. Index.equals is used instead of an element-wise `==`
# because `==` raises ValueError (not AssertionError) when the column counts differ.
assert train.columns.equals(test.columns), "train and test columns differ after one-hot encoding"

In [23]:
try:
    from numba import jit
except ImportError:
    # numba only speeds up the loop below; fall back to plain Python when it is missing.
    def jit(func):
        """No-op stand-in for numba.jit when numba is not installed."""
        return func


@jit
def eval_gini(y_true, y_pred):
    """Gini score of `y_pred` used as a ranking of the binary labels `y_true`.

    The labels are sorted by ascending prediction and walked from the highest
    prediction down, counting for every positive label how many negatives are
    ranked above it. The result is ``1 - 2 * count / (n_pos * n_neg)``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of scores, same length as `y_true`.

    Returns
    -------
    float
        The score, or 0.0 when `y_true` is empty or contains a single class
        (the ratio is undefined there and would otherwise divide by zero).
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_pred)]
    n = len(y_true)
    ntrue = 0   # positives seen so far
    gini = 0    # running count of (positive, higher-ranked negative) pairs
    delta = 0   # negatives seen so far, i.e. ranked above the current element
    for i in range(n - 1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    # Guard the degenerate case: with no positives or no negatives the denominator is zero.
    if ntrue == 0 or ntrue == n:
        return 0.0
    return 1 - 2 * gini / (ntrue * (n - ntrue))

def gini_lgb(y_true, y_pred):
    """Wrap eval_gini as a (name, value, is_higher_better) tuple for use as an eval metric."""
    return 'normalized_gini_coef', eval_gini(y_true, y_pred), True

*A custom Gini coefficient evaluation metric, which can be integrated with LightGBM to assess the quality of model predictions.*

In [24]:
# Choose the LightGBM parameters: either search for them with Optuna or take the
# fixed set from the config.
if config.optuna_lgb:

    def objective(trial):
        """Return the mean cross-validated Gini score for one sampled parameter set."""
        params = {
                'learning_rate': trial.suggest_float("learning_rate", 0.01, 1.0),
                'num_leaves': trial.suggest_int("num_leaves", 3, 255),
                'min_child_samples': trial.suggest_int("min_child_samples", 3, 3000),
                'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
                'subsample_freq': trial.suggest_int("subsample_freq", 0, 10),
                'subsample': trial.suggest_float("subsample", 0.1, 1.0),
                # suggest_loguniform is deprecated; suggest_float(log=True) is its replacement.
                'reg_alpha': trial.suggest_float("reg_alpha", 1e-9, 10.0, log=True),
                'reg_lambda': trial.suggest_float("reg_lambda", 1e-9, 10.0, log=True),
        }

        score = list()
        skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True, random_state=config.random_state)

        for train_idx, valid_idx in skf.split(train, target):
            X_train, y_train = train.iloc[train_idx], target.iloc[train_idx]
            X_valid, y_valid = train.iloc[valid_idx], target.iloc[valid_idx]

            # Budget comes from the config so the search and the final training stay in sync.
            model = lgb.LGBMClassifier(**params,
                                    n_estimators=config.n_estimators,
                                    force_row_wise=True)

            # Early stopping is registered once, through the callback only.
            callbacks=[lgb.early_stopping(stopping_rounds=config.early_stopping_round, verbose=False)]
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=gini_lgb, callbacks=callbacks)
            score.append(model.best_score_['valid_0']['normalized_gini_coef'])

        return np.mean(score)

    # Seed the sampler so that the sequence of sampled trials is reproducible.
    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=config.random_state))
    study.optimize(objective, n_trials=300)

    print("Best Gini Normalized Score", study.best_value)
    print("Best parameters", study.best_params)

    # Fixed settings that are not part of the search space, completed with the best trial.
    params = {'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbosity': 0,
            'random_state': 0}

    params.update(study.best_params)

else:
    params = config.params

In [25]:
# Stratified K-fold training. Each fold's model contributes 1/n_splits of the test
# prediction and fills the out-of-fold (OOF) predictions for its validation rows.
preds = np.zeros(len(test))
oof = np.zeros(len(train))
metric_evaluations = list()

skf = StratifiedKFold(n_splits=config.cv_folds, shuffle=True, random_state=config.random_state)

for idx, (train_idx, valid_idx) in enumerate(skf.split(train, target)):
    print(f"CV fold {idx}")
    X_train, y_train = train.iloc[train_idx], target.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], target.iloc[valid_idx]

    model = lgb.LGBMClassifier(**params,
                               n_estimators=config.n_estimators,
                               force_row_wise=True)

    # Early stopping is configured in one place only (this callback) and reads its patience
    # from the config instead of repeating a hard-coded 150 next to a constructor argument.
    # NOTE(review): the training log shows binary_logloss is tracked alongside the Gini
    # metric, so either one may trigger the stop — confirm that this is intended.
    callbacks=[lgb.early_stopping(stopping_rounds=config.early_stopping_round),
               lgb.log_evaluation(period=100, show_stdv=False)]

    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=gini_lgb, callbacks=callbacks)
    metric_evaluations.append(model.best_score_['valid_0']['normalized_gini_coef'])
    # Predict with the best iteration found by early stopping.
    preds += model.predict_proba(test, num_iteration=model.best_iteration_)[:,1] / skf.n_splits
    oof[valid_idx] = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:,1]

CV fold 0
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.153243	valid_0's normalized_gini_coef: 0.271457
[200]	valid_0's binary_logloss: 0.15228	valid_0's normalized_gini_coef: 0.280599
[300]	valid_0's binary_logloss: 0.15185	valid_0's normalized_gini_coef: 0.286829
[400]	valid_0's binary_logloss: 0.151651	valid_0's normalized_gini_coef: 0.289906
[500]	valid_0's binary_logloss: 0.151543	valid_0's normalized_gini_coef: 0.291906
[600]	valid_0's binary_logloss: 0.151473	valid_0's normalized_gini_coef: 0.293377
[700]	valid_0's binary_logloss: 0.151437	valid_0's normalized_gini_coef: 0.293827
[800]	valid_0's binary_logloss: 0.151417	valid_0's normalized_gini_coef: 0.294276
[900]	valid_0's binary_logloss: 0.15142	valid_0's normalized_gini_coef: 0.294119
CV fold 1
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.153553	valid_0's normalized_gini_coef: 0.255568
[200]	valid_0's binary_logloss: 0.15277

In [26]:
# Summarise the per-fold Gini scores as mean (standard deviation).
cv_mean, cv_std = np.mean(metric_evaluations), np.std(metric_evaluations)
print(f"LightGBM CV Gini Normalized Score: {cv_mean:0.3f} ({cv_std:0.3f})")

LightGBM CV Gini Normalized Score: 0.289 (0.015)


In [27]:
# Put the averaged test predictions into the submission frame and write it out.
submission = submission.assign(target=preds)
submission.to_csv('lgb_submission.csv')

In [28]:
# Save the out-of-fold predictions under the `target` column, keyed by the index of `target`.
oofs = pd.DataFrame({'target': oof}, index=target.index)
oofs.to_csv('lgb_oof.csv')