In [85]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

In [3]:
train_data = pd.read_csv('train.csv')

In [4]:
test_data = pd.read_csv('test.csv')

In [5]:
train_data.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,957919.0,942672.0,942729.0,942428.0,942359.0,942514.0,942398.0,942415.0,942546.0,942670.0,...,942554.0,942420.0,942509.0,942686.0,942481.0,942360.0,942330.0,942512.0,942707.0,957919.0
mean,478959.0,0.090201,0.345964,4068.744207,0.201214,0.304869,-0.071458,1620.843815,377164.2,1806054000000000.0,...,-19.926398,2.07453,23.885245,1.748777,63152.97354,1.208876,4.276905e+16,3959.204669,0.559267,0.498492
std,276527.540591,0.043564,0.146251,6415.82944,0.21251,0.145343,2.123777,1276.281403,345432.5,2335204000000000.0,...,18.578439,0.895793,45.58136,10.088848,92435.016241,0.114959,6.732441e+16,3155.991777,0.408426,0.499998
min,0.0,-0.14991,-0.019044,-9421.7,-0.082122,-0.00699,-12.791,-224.8,-29843.0,-1153300000000000.0,...,-105.86,0.27704,-27.691,-26.589,-81977.0,0.90527,-8944400000000000.0,-415.24,-0.15124,0.0
25%,239479.5,0.070227,0.28305,418.43,0.035086,0.24052,-1.1207,481.545,91209.0,11531000000000.0,...,-28.812,1.4877,-0.62888,-4.473975,2443.2,1.1468,232110000000000.0,1306.2,0.27656,0.0
50%,478959.0,0.090135,0.3891,1279.5,0.137,0.32779,-0.38011,1446.1,289670.0,504305000000000.0,...,-14.636,1.6621,1.7277,0.88571,19479.0,1.1772,1.3275e+16,3228.0,0.47344,0.0
75%,718438.5,0.1165,0.45845,4444.4,0.2971,0.41283,0.92194,2495.9,560560.0,3103100000000000.0,...,-5.3253,2.522325,18.991,6.840775,88488.0,1.242,5.2787e+16,6137.9,0.74621,1.0
max,957918.0,0.41517,0.51899,39544.0,1.3199,0.55475,11.202,5426.6,1913700.0,1.0424e+16,...,1.6134,4.5659,217.84,47.757,526050.0,1.8867,3.2499e+17,13151.0,2.7436,1.0


In [6]:
train_data.shape

(957919, 120)

In [7]:
test_data.shape

(493474, 119)

In [8]:
train_data.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [9]:
train_data.isnull().sum()

id           0
f1       15247
f2       15190
f3       15491
f4       15560
         ...  
f115     15559
f116     15589
f117     15407
f118     15212
claim        0
Length: 120, dtype: int64

In [10]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [11]:
impured_train = pd.DataFrame(imp_mean.fit_transform(train_data), columns=train_data.columns)
impured_test = pd.DataFrame(imp_mean.fit_transform(test_data), columns=test_data.columns)

In [12]:
impured_train.isnull().sum()

id       0
f1       0
f2       0
f3       0
f4       0
        ..
f115     0
f116     0
f117     0
f118     0
claim    0
Length: 120, dtype: int64

In [13]:
impured_test.isnull().sum()

id      0
f1      0
f2      0
f3      0
f4      0
       ..
f114    0
f115    0
f116    0
f117    0
f118    0
Length: 119, dtype: int64

In [14]:
impured_train.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0.0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1.0
1,1.0,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0.0
2,2.0,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1.0
3,3.0,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1.0
4,4.0,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,3959.204669,0.23049,1.0


In [161]:
impured_train_X = impured_train.iloc[:, 1:-1]
impured_test = impured_test.iloc[:, 1:]

impured_train_y = pd.DataFrame(impured_train.iloc[:, 119])

In [16]:
impured_train_X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,0.11093,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,0.97673,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,0.20102,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-0.01182,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,0.92739,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,3959.204669,0.23049


In [17]:
train_X, valid_X, train_y, valid_y = train_test_split(impured_train_X, impured_train_y, test_size=0.33)

In [18]:
train_X.shape

(641805, 118)

In [19]:
train_X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
952720,0.09951,0.35222,-83.961,0.18424,0.22877,-1.7152,2104.8,17045.0,4453700000000000.0,925.43,...,0.42962,-19.926398,4.0049,-1.3367,15.956,56259.0,1.1868,4.276905e+16,4296.4,0.18248
663490,0.06652,0.25686,5780.6,0.005926,-0.000764,1.3942,1848.6,72084.0,600710000000000.0,4952.3,...,0.2715,-62.015,2.07453,0.2325,-3.3878,81037.0,1.1375,4345100000000000.0,5660.8,0.82403
105947,0.080324,0.46275,1583.1,0.22006,0.25641,1.6383,80.008,116310.0,8304600000000000.0,293.18,...,0.80136,-25.375,1.4659,8.2292,5.8778,10523.0,1.1715,3.7637e+16,2457.3,0.1956
627427,0.14254,0.29078,4665.3,0.35881,0.304869,-1.4051,584.93,796670.0,6189400000000000.0,97.834,...,0.11802,-24.976,2.3195,2.7838,-3.5508,322.56,1.1812,-213310000000000.0,3959.204669,0.24272
413352,0.090645,0.28649,313.52,0.37275,-0.000543,-0.67566,184.08,243370.0,1889500000000000.0,586.38,...,0.45009,-9.3349,1.6471,-1.579,29.838,168890.0,1.0343,2.0074e+16,1454.5,0.63143


In [20]:
train_y.head()

Unnamed: 0,claim
952720,0.0
663490,1.0
105947,1.0
627427,1.0
413352,0.0


In [21]:
train_y.shape

(641805, 1)

In [183]:
class MyRandomForestClassifier:
    def __init__(self, n_estimators=100, criterion='gini',
                 max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, random_state=None, max_samples=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        # self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.max_samples = max_samples

        self.estimators = []
        for i in range(self.n_estimators):
            self.estimators.append(
                DecisionTreeClassifier(criterion=self.criterion, max_depth=self.max_depth,
                                       min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf,
                                       min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                                       max_features=self.max_features,
                                       max_leaf_nodes=self.max_leaf_nodes,
                                       min_impurity_decrease=self.min_impurity_decrease,
                                       random_state=self.random_state))

    def fit(self, x, y):
        samples_count = x.shape[0]
        if self.max_samples is not None:
            samples_count = self.max_samples

        bootstrapped_x = pd.DataFrame()
        bootstrapped_y = pd.DataFrame()

        bootstrapped_rows = np.random.choice(x.shape[0], samples_count, replace=self.bootstrap)
        bootstrapped_x = x.iloc[bootstrapped_rows]
        bootstrapped_y = y.iloc[bootstrapped_rows]

        for clf in self.estimators:
            clf.fit(bootstrapped_x, bootstrapped_y)

    def predict(self, x):
        ones = [0 for i in range(x.shape[0])]

        for i, clf in enumerate(self.estimators):
            predictions = list(clf.predict(x))
            for val in predictions:
                if val == 1:
                    ones[i] += 1

        return pd.Series(ones)

    def predict_proba(self, x):
        ones = [0 for i in range(x.shape[0])]

        for i, clf in enumerate(self.estimators):
            predictions = list(clf.predict(x))
            for val in predictions:
                if val == 1:
                    ones[i] += 1

        ones_probabilities = [p / x.shape[0] for p in ones]
        zeroes_probabilities = [1 - p for p in ones_probabilities]
        probabilities = np.array([zeroes_probabilities, ones_probabilities])
        return probabilities.transpose()

In [184]:
my_clf = MyRandomForestClassifier(n_estimators=10, bootstrap=True, max_samples=1000)

In [185]:
my_clf.fit(train_X, train_y)

In [89]:
y_pred = my_clf.predict(valid_X)

In [90]:
accuracy_score(valid_y, y_pred)

0.5009395344717412

In [189]:
y_pred = my_clf.predict_proba(valid_X)

In [190]:
y_pred

array([[0.47268391, 0.52731609],
       [0.50104709, 0.49895291],
       [0.49413503, 0.50586497],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [191]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5000127136518436

In [94]:
rfc_sklearn = RandomForestClassifier(n_estimators=10, criterion='gini', 
                                     max_depth=None, min_samples_split=2, 
                                     min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                     max_features='auto', max_leaf_nodes=None, 
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     bootstrap=True, random_state=None, max_samples=None, verbose=2)

In [95]:
rfc_sklearn.fit(train_X, np.ravel(train_y))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.4s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.2min finished


RandomForestClassifier(n_estimators=10, verbose=2)

In [96]:
y_pred = rfc_sklearn.predict(valid_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.6s finished


In [97]:
accuracy_score(valid_y, y_pred)

0.5159467786937624

In [98]:
y_pred = rfc_sklearn.predict_proba(valid_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.2s finished


In [106]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5230826553899438

In [109]:
lightgbm_rfc = LGBMClassifier(n_estimators=100, criterion='gini', 
                              max_depth=None, min_samples_split=2, 
                              min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                              max_features='auto', max_leaf_nodes=None, 
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              bootstrap=True, random_state=None, max_samples=None, verbose=0)

In [110]:
lightgbm_rfc.fit(train_X, np.ravel(train_y))

You can set `force_col_wise=true` to remove the overhead.


LGBMClassifier(bootstrap=True, criterion='gini', max_depth=None,
               max_features='auto', max_leaf_nodes=None, max_samples=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, verbose=0)

In [111]:
y_pred = lightgbm_rfc.predict(valid_X)

In [112]:
accuracy_score(valid_y, y_pred)

0.700260032772987

In [113]:
y_pred = lightgbm_rfc.predict_proba(valid_X)

In [116]:
roc_auc_score(valid_y, y_pred[:, 1])

0.7626460481768772

In [33]:
params = {
    'n_estimators': range(1, 300),
    'max_depth': range(1, 10),
    'min_samples_split': range(2, 50),
    'min_samples_leaf': range(1, 100),
    'max_leaf_nodes': range(10, 200),
    'max_samples': range(10, 10000)
}

In [123]:
clf = RandomizedSearchCV(lightgbm_rfc, params, cv=5, verbose=2)

In [122]:
clf.fit(train_X, np.ravel(train_y))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
You can set `force_col_wise=true` to remove the overhead.


Traceback (most recent call last):
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 74, in inner_f
    return f(**kwargs)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 522, in roc_auc_score
    y_type = type_of_target(y_true)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/multiclass.py", line 254, in type_of_target
    raise ValueError('Expected array-like (array or non-string sequence), '
ValueError: Expected array-like (array or non-string sequence), got LGBMClassifier(bootstrap=True, criterion='gini', max_depth=3,
               max_features='auto', max_leaf_nodes=140, max_samples=3694,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples

[CV] END max_depth=3, max_leaf_nodes=140, max_samples=3694, min_samples_leaf=10, min_samples_split=20, n_estimators=261; total time=  29.0s
You can set `force_col_wise=true` to remove the overhead.


Traceback (most recent call last):
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 74, in inner_f
    return f(**kwargs)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 522, in roc_auc_score
    y_type = type_of_target(y_true)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/multiclass.py", line 254, in type_of_target
    raise ValueError('Expected array-like (array or non-string sequence), '
ValueError: Expected array-like (array or non-string sequence), got LGBMClassifier(bootstrap=True, criterion='gini', max_depth=3,
               max_features='auto', max_leaf_nodes=140, max_samples=3694,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples

[CV] END max_depth=3, max_leaf_nodes=140, max_samples=3694, min_samples_leaf=10, min_samples_split=20, n_estimators=261; total time=  25.1s
You can set `force_col_wise=true` to remove the overhead.


Traceback (most recent call last):
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py", line 74, in inner_f
    return f(**kwargs)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 522, in roc_auc_score
    y_type = type_of_target(y_true)
  File "/Users/fliahin/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/multiclass.py", line 254, in type_of_target
    raise ValueError('Expected array-like (array or non-string sequence), '
ValueError: Expected array-like (array or non-string sequence), got LGBMClassifier(bootstrap=True, criterion='gini', max_depth=3,
               max_features='auto', max_leaf_nodes=140, max_samples=3694,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples

[CV] END max_depth=3, max_leaf_nodes=140, max_samples=3694, min_samples_leaf=10, min_samples_split=20, n_estimators=261; total time=  25.7s


KeyboardInterrupt: 

In [125]:
clf.best_params_

{'n_estimators': 260,
 'min_samples_split': 34,
 'min_samples_leaf': 56,
 'max_samples': 8940,
 'max_leaf_nodes': 187,
 'max_depth': 7}

In [211]:
my_clf = MyRandomForestClassifier(**clf.best_params_, bootstrap=True)
my_clf.fit(train_X, train_y)

In [212]:
rfc_sklearn = RandomForestClassifier(**clf.best_params_, verbose=2, bootstrap=True)
rfc_sklearn.fit(train_X, np.ravel(train_y))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


building tree 1 of 260
building tree 2 of 260
building tree 3 of 260
building tree 4 of 260
building tree 5 of 260
building tree 6 of 260
building tree 7 of 260
building tree 8 of 260
building tree 9 of 260
building tree 10 of 260
building tree 11 of 260
building tree 12 of 260
building tree 13 of 260
building tree 14 of 260
building tree 15 of 260
building tree 16 of 260
building tree 17 of 260
building tree 18 of 260
building tree 19 of 260
building tree 20 of 260
building tree 21 of 260
building tree 22 of 260
building tree 23 of 260
building tree 24 of 260
building tree 25 of 260
building tree 26 of 260
building tree 27 of 260
building tree 28 of 260
building tree 29 of 260
building tree 30 of 260
building tree 31 of 260
building tree 32 of 260
building tree 33 of 260
building tree 34 of 260
building tree 35 of 260
building tree 36 of 260
building tree 37 of 260
building tree 38 of 260
building tree 39 of 260
building tree 40 of 260
building tree 41 of 260
building tree 42 of 260
b

[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:   42.2s finished


RandomForestClassifier(max_depth=7, max_leaf_nodes=187, max_samples=8940,
                       min_samples_leaf=56, min_samples_split=34,
                       n_estimators=260, verbose=2)

In [213]:
lightgbm_rfc = LGBMClassifier(**clf.best_params_, verbose=2, bootstrap=True)
lightgbm_rfc.fit(train_X, np.ravel(train_y))

[LightGBM] [Info] Number of positive: 319758, number of negative: 322047
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000000
[LightGBM] [Debug] init for col-wise cost 0.000024 seconds, init for row-wise cost 0.700145 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 30074
[LightGBM] [Info] Number of data points in the train set: 641805, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498217 -> initscore=-0.007133
[LightGBM] [Info] Start training from score -0.007133
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with

LGBMClassifier(bootstrap=True, max_depth=7, max_leaf_nodes=187,
               max_samples=8940, min_samples_leaf=56, min_samples_split=34,
               n_estimators=260, verbose=2)

In [48]:
y_pred = my_clf.predict(valid_X)
accuracy_score(valid_y, y_pred)

0.500518800179682

In [192]:
y_pred = my_clf.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

0.5000127136518436

In [49]:
y_pred = rfc_sklearn.predict(valid_X)
accuracy_score(valid_y, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:    6.1s finished


0.5596873279892697

In [131]:
y_pred = rfc_sklearn.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:    6.7s finished


0.5857196154081727

In [50]:
y_pred = lightgbm_rfc.predict(valid_X)
accuracy_score(valid_y, y_pred)

0.7227582454430996

In [155]:
y_pred = lightgbm_rfc.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

0.7871992319241768

In [53]:
%timeit my_clf.predict_proba(impured_test_X)

1min 27s ± 3.49 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
%timeit rfc_sklearn.predict_proba(impured_test_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:    9.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:    9.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:    9.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:   10.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

9.87 s ± 258 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


[Parallel(n_jobs=1)]: Done 260 out of 260 | elapsed:   10.0s finished


In [55]:
%timeit lightgbm_rfc.predict_proba(impured_test_X)

5.03 s ± 782 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [197]:
class MyEnsemble:
    def __init__(self, n_estimators=100, criterion='gini',
                 max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, random_state=None, max_samples=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        # self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.max_samples = max_samples

        self.my_random_forest = MyRandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                                         max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                                         min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                                         max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                                         min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                                         random_state=self.random_state, max_samples=self.max_samples)
        
        self.sklearn_rfc = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                                  max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                                  min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                                  max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                                  min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                                  random_state=self.random_state, max_samples=self.max_samples)
        
        self.lightgbm_rfc = LGBMClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                           max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                           min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                           max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                           min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                           random_state=self.random_state, max_samples=self.max_samples)
        
    def fit(self, x, y):
        samples_count = x.shape[0]
        if self.max_samples is not None:
            samples_count = self.max_samples

        bootstrapped_x = pd.DataFrame()
        bootstrapped_y = pd.DataFrame()

        bootstrapped_rows = np.random.choice(x.shape[0], samples_count, replace=self.bootstrap)
        bootstrapped_x = x.iloc[bootstrapped_rows]
        bootstrapped_y = y.iloc[bootstrapped_rows]
        
        self.my_random_forest.fit(bootstrapped_x, bootstrapped_y)
        self.sklearn_rfc.fit(bootstrapped_x, np.ravel(bootstrapped_y))
        self.lightgbm_rfc.fit(bootstrapped_x, np.ravel(bootstrapped_y))

    def predict(self, x):
        ones = [0 for i in range(x.shape[0])]

        predictions = list(self.my_random_forest.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val
            
        predictions = list(self.sklearn_rfc.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val

        predictions = list(self.lightgbm_rfc.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val
    
        predictions = [1 if x > 1 else 0 for x in ones]
        return pd.Series(predictions)
    
    def predict_proba(self, x):
        ones_probabilities = [0 for i in range(x.shape[0])]

        predictions = self.my_random_forest.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val
            
        predictions = self.sklearn_rfc.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val

        predictions = self.lightgbm_rfc.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val
        
        ones_probabilities = [x / 3 for x in ones_probabilities]
        zeroes_probabilities = [1 - p for p in ones_probabilities]
        probabilities = np.array([zeroes_probabilities, ones_probabilities])
        return probabilities.transpose()

In [198]:
my_ensemble = MyEnsemble(**clf.best_params_, bootstrap=True)

In [200]:
my_ensemble.fit(train_X, train_y)

In [140]:
y_pred = my_ensemble.predict(valid_X)

In [141]:
accuracy_score(valid_y, y_pred)

0.5494188805304415

In [201]:
y_pred = my_ensemble.predict_proba(valid_X)

In [202]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5877384629348268

In [214]:
# y_pred = lightgbm_rfc.predict_proba(impured_test)
# y_pred = rfc_sklearn.predict_proba(impured_test)
y_pred = my_clf.predict_proba(impured_test)
# y_pred = my_ensemble.predict_proba(impured_test)

In [215]:
y_pred = pd.DataFrame(data={
    'id': test_data.id, 
    'claim': y_pred[:, 1]}, index=impured_test.index)

In [216]:
y_pred

Unnamed: 0,id,claim
0,957919,0.588181
1,957920,0.493084
2,957921,0.540083
3,957922,0.588266
4,957923,0.556272
...,...,...
493469,1451388,0.000000
493470,1451389,0.000000
493471,1451390,0.000000
493472,1451391,0.000000


In [217]:
pd.DataFrame(y_pred).to_csv('my_clf_solution.csv',
        columns=['id', 'claim'], index=False)