In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
test_data = pd.read_csv('test.csv')

In [4]:
train_data.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,957919.0,942672.0,942729.0,942428.0,942359.0,942514.0,942398.0,942415.0,942546.0,942670.0,...,942554.0,942420.0,942509.0,942686.0,942481.0,942360.0,942330.0,942512.0,942707.0,957919.0
mean,478959.0,0.090201,0.345964,4068.744207,0.201214,0.304869,-0.071458,1620.843815,377164.2,1806054000000000.0,...,-19.926398,2.07453,23.885245,1.748777,63152.97354,1.208876,4.276905e+16,3959.204669,0.559267,0.498492
std,276527.540591,0.043564,0.146251,6415.82944,0.21251,0.145343,2.123777,1276.281403,345432.5,2335204000000000.0,...,18.578439,0.895793,45.58136,10.088848,92435.016241,0.114959,6.732441e+16,3155.991777,0.408426,0.499998
min,0.0,-0.14991,-0.019044,-9421.7,-0.082122,-0.00699,-12.791,-224.8,-29843.0,-1153300000000000.0,...,-105.86,0.27704,-27.691,-26.589,-81977.0,0.90527,-8944400000000000.0,-415.24,-0.15124,0.0
25%,239479.5,0.070227,0.28305,418.43,0.035086,0.24052,-1.1207,481.545,91209.0,11531000000000.0,...,-28.812,1.4877,-0.62888,-4.473975,2443.2,1.1468,232110000000000.0,1306.2,0.27656,0.0
50%,478959.0,0.090135,0.3891,1279.5,0.137,0.32779,-0.38011,1446.1,289670.0,504305000000000.0,...,-14.636,1.6621,1.7277,0.88571,19479.0,1.1772,1.3275e+16,3228.0,0.47344,0.0
75%,718438.5,0.1165,0.45845,4444.4,0.2971,0.41283,0.92194,2495.9,560560.0,3103100000000000.0,...,-5.3253,2.522325,18.991,6.840775,88488.0,1.242,5.2787e+16,6137.9,0.74621,1.0
max,957918.0,0.41517,0.51899,39544.0,1.3199,0.55475,11.202,5426.6,1913700.0,1.0424e+16,...,1.6134,4.5659,217.84,47.757,526050.0,1.8867,3.2499e+17,13151.0,2.7436,1.0


In [5]:
train_data.shape

(957919, 120)

In [6]:
test_data.shape

(493474, 119)

In [7]:
train_data.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [8]:
train_data.isnull().sum()

id           0
f1       15247
f2       15190
f3       15491
f4       15560
         ...  
f115     15559
f116     15589
f117     15407
f118     15212
claim        0
Length: 120, dtype: int64

In [9]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [10]:
impured_train = pd.DataFrame(imp_mean.fit_transform(train_data), columns=train_data.columns)
impured_test = pd.DataFrame(imp_mean.fit_transform(test_data), columns=test_data.columns)

In [11]:
impured_train.isnull().sum()

id       0
f1       0
f2       0
f3       0
f4       0
        ..
f115     0
f116     0
f117     0
f118     0
claim    0
Length: 120, dtype: int64

In [12]:
impured_test.isnull().sum()

id      0
f1      0
f2      0
f3      0
f4      0
       ..
f114    0
f115    0
f116    0
f117    0
f118    0
Length: 119, dtype: int64

In [13]:
impured_train.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0.0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1.0
1,1.0,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0.0
2,2.0,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1.0
3,3.0,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1.0
4,4.0,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,3959.204669,0.23049,1.0


In [14]:
impured_train_X = impured_train.iloc[:, 1:-1]
impured_test = impured_test.iloc[:, 1:]

impured_train_y = pd.DataFrame(impured_train.iloc[:, 119])

In [15]:
impured_train_X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,0.11093,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,0.97673,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,0.20102,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-0.01182,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,0.92739,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,3959.204669,0.23049


In [16]:
train_X, valid_X, train_y, valid_y = train_test_split(impured_train_X, impured_train_y, test_size=0.33)

In [17]:
train_X.shape

(641805, 118)

In [18]:
train_X.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
280372,0.058115,0.45691,-110.91,0.25558,0.44703,2.0247,-43.882,40589.0,10385000000000.0,179.11,...,0.9385,-2.27,1.9577,1.6057,-14.496,348550.0,1.1217,5.4102e+16,6058.2,1.7811
226091,0.070141,0.39961,-331.66,0.036449,0.42966,-1.2863,853.01,944100.0,3036500000000000.0,8964.8,...,0.026138,-87.694,2.07453,114.76,-0.60993,38896.0,1.2051,1.2916e+17,8423.7,0.62378
626495,0.12129,0.50441,816.7,-0.000257,0.48801,0.7869,3306.0,492940.0,3820700000000000.0,398.32,...,0.84292,-1.3555,3.5227,0.91394,0.90474,1713.3,1.2325,997640000000000.0,2447.8,0.17523
582250,0.054117,0.26968,13.114,0.22083,0.002199,5.2,1324.7,671710.0,568000000000000.0,12883.0,...,0.50776,-36.696,3.9833,93.291,-5.5308,61873.0,1.1564,1.2309e+16,69.47,0.61345
621518,0.10825,0.49983,-104.6,0.30163,0.32869,1.1009,162.75,532770.0,1787400000000000.0,85.526,...,0.58246,-2.9728,3.105,0.55572,-1.1647,11172.0,1.1391,3.4795e+16,799.94,0.40529


In [19]:
train_y.head()

Unnamed: 0,claim
280372,0.0
226091,1.0
626495,0.0
582250,1.0
621518,1.0


In [20]:
train_y.shape

(641805, 1)

In [21]:
class MyRandomForestClassifier:
    def __init__(self, n_estimators=100, criterion='gini',
                 max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, random_state=None, max_samples=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        # self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.max_samples = max_samples

        self.estimators = []
        for i in range(self.n_estimators):
            self.estimators.append(
                DecisionTreeClassifier(criterion=self.criterion, max_depth=self.max_depth,
                                       min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf,
                                       min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                                       max_features=self.max_features,
                                       max_leaf_nodes=self.max_leaf_nodes,
                                       min_impurity_decrease=self.min_impurity_decrease,
                                       random_state=self.random_state))

    def fit(self, x, y):
        samples_count = x.shape[0]
        if self.max_samples is not None:
            samples_count = self.max_samples

        bootstrapped_x = pd.DataFrame()
        bootstrapped_y = pd.DataFrame()

        bootstrapped_rows = np.random.choice(x.shape[0], samples_count, replace=self.bootstrap)
        bootstrapped_x = x.iloc[bootstrapped_rows]
        bootstrapped_y = y.iloc[bootstrapped_rows]

        for clf in self.estimators:
            clf.fit(bootstrapped_x, bootstrapped_y)

    def predict(self, x):
        ones = [0 for i in range(x.shape[0])]

        for i, clf in enumerate(self.estimators):
            predictions = list(clf.predict(x))
            for val in predictions:
                if val == 1:
                    ones[i] += 1

        return pd.Series(ones)

    def predict_proba(self, x):
        ones = [0 for i in range(x.shape[0])]

        for i, clf in enumerate(self.estimators):
            predictions = list(clf.predict(x))
            for val in predictions:
                if val == 1:
                    ones[i] += 1

        ones_probabilities = [p / x.shape[0] for p in ones]
        zeroes_probabilities = [1 - p for p in ones_probabilities]
        probabilities = np.array([zeroes_probabilities, ones_probabilities])
        return probabilities.transpose()

In [22]:
my_clf = MyRandomForestClassifier(n_estimators=10, bootstrap=True, max_samples=1000)

In [23]:
my_clf.fit(train_X, train_y)

In [24]:
y_pred = my_clf.predict(valid_X)

In [25]:
accuracy_score(valid_y, y_pred)

0.5008825929886054

In [26]:
y_pred = my_clf.predict_proba(valid_X)

In [27]:
y_pred

array([[0.49722568, 0.50277432],
       [0.50409662, 0.49590338],
       [0.51706346, 0.48293654],
       ...,
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ]])

In [28]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5000253633456776

In [29]:
rfc_sklearn = RandomForestClassifier(n_estimators=100, criterion='gini', 
                                     max_depth=None, min_samples_split=2, 
                                     min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                     max_features='auto', max_leaf_nodes=None, 
                                     min_impurity_decrease=0.0, min_impurity_split=None,
                                     bootstrap=True, random_state=None, max_samples=None, verbose=2)

In [30]:
rfc_sklearn.fit(train_X, np.ravel(train_y))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.0s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 26.6min finished


RandomForestClassifier(verbose=2)

In [31]:
y_pred = rfc_sklearn.predict(valid_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   29.4s finished


In [32]:
accuracy_score(valid_y, y_pred)

0.5445250763964898

In [33]:
y_pred = rfc_sklearn.predict_proba(valid_X)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   26.5s finished


In [34]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5621270993070389

In [35]:
lightgbm_rfc = LGBMClassifier(n_estimators=100, criterion='gini', 
                              max_depth=None, min_samples_split=2, 
                              min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                              max_features='auto', max_leaf_nodes=None, 
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              bootstrap=True, random_state=None, max_samples=None, verbose=0)

In [36]:
lightgbm_rfc.fit(train_X, np.ravel(train_y))

You can set `force_col_wise=true` to remove the overhead.


LGBMClassifier(bootstrap=True, criterion='gini', max_depth=None,
               max_features='auto', max_leaf_nodes=None, max_samples=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, verbose=0)

In [37]:
y_pred = lightgbm_rfc.predict(valid_X)

In [38]:
accuracy_score(valid_y, y_pred)

0.6975394952453862

In [39]:
y_pred = lightgbm_rfc.predict_proba(valid_X)

In [40]:
roc_auc_score(valid_y, y_pred[:, 1])

0.7604868440278877

In [41]:
params = {
    'n_estimators': range(1, 300),
    'max_depth': range(1, 10),
    'min_samples_split': range(2, 50),
    'min_samples_leaf': range(1, 100),
    'max_leaf_nodes': range(10, 200),
    'max_samples': range(10, 10000)
}

In [42]:
clf = RandomizedSearchCV(lightgbm_rfc, params, cv=5, verbose=2)

In [43]:
clf.fit(train_X, np.ravel(train_y))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
You can set `force_col_wise=true` to remove the overhead.
[CV] END max_depth=1, max_leaf_nodes=178, max_samples=4591, min_samples_leaf=19, min_samples_split=26, n_estimators=284; total time=  28.4s
You can set `force_col_wise=true` to remove the overhead.
[CV] END max_depth=1, max_leaf_nodes=178, max_samples=4591, min_samples_leaf=19, min_samples_split=26, n_estimators=284; total time=  28.0s
You can set `force_col_wise=true` to remove the overhead.
[CV] END max_depth=1, max_leaf_nodes=178, max_samples=4591, min_samples_leaf=19, min_samples_split=26, n_estimators=284; total time=  28.3s
You can set `force_col_wise=true` to remove the overhead.
[CV] END max_depth=1, max_leaf_nodes=178, max_samples=4591, min_samples_leaf=19, min_samples_split=26, n_estimators=284; total time=  26.1s
You can set `force_col_wise=true` to remove the overhead.
[CV] END max_depth=1, max_leaf_nodes=178, max_samples=4591, min_samples_leaf=19, min_samp

RandomizedSearchCV(cv=5,
                   estimator=LGBMClassifier(bootstrap=True, criterion='gini',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            max_samples=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            verbose=0),
                   param_distributions={'max_depth': range(1, 10),
                                        'max_leaf_nodes': range(10, 200),
                                        'max_samples': range(10, 10000),
                                        'min_samples_leaf': range(1, 100),
 

In [44]:
clf.best_params_

{'n_estimators': 243,
 'min_samples_split': 28,
 'min_samples_leaf': 4,
 'max_samples': 1939,
 'max_leaf_nodes': 158,
 'max_depth': 8}

In [45]:
my_clf = MyRandomForestClassifier(**clf.best_params_, bootstrap=True)
my_clf.fit(train_X, train_y)

In [46]:
rfc_sklearn = RandomForestClassifier(**clf.best_params_, verbose=2, bootstrap=True)
rfc_sklearn.fit(train_X, np.ravel(train_y))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


building tree 1 of 243
building tree 2 of 243
building tree 3 of 243
building tree 4 of 243
building tree 5 of 243
building tree 6 of 243
building tree 7 of 243
building tree 8 of 243
building tree 9 of 243
building tree 10 of 243
building tree 11 of 243
building tree 12 of 243
building tree 13 of 243
building tree 14 of 243
building tree 15 of 243
building tree 16 of 243
building tree 17 of 243
building tree 18 of 243
building tree 19 of 243
building tree 20 of 243
building tree 21 of 243
building tree 22 of 243
building tree 23 of 243
building tree 24 of 243
building tree 25 of 243
building tree 26 of 243
building tree 27 of 243
building tree 28 of 243
building tree 29 of 243
building tree 30 of 243
building tree 31 of 243
building tree 32 of 243
building tree 33 of 243
building tree 34 of 243
building tree 35 of 243
building tree 36 of 243
building tree 37 of 243
building tree 38 of 243
building tree 39 of 243
building tree 40 of 243
building tree 41 of 243
building tree 42 of 243
b

[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:   22.2s finished


RandomForestClassifier(max_depth=8, max_leaf_nodes=158, max_samples=1939,
                       min_samples_leaf=4, min_samples_split=28,
                       n_estimators=243, verbose=2)

In [47]:
lightgbm_rfc = LGBMClassifier(**clf.best_params_, verbose=2, bootstrap=True)
lightgbm_rfc.fit(train_X, np.ravel(train_y))

[LightGBM] [Info] Number of positive: 319738, number of negative: 322067
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.000000
[LightGBM] [Debug] init for col-wise cost 0.002173 seconds, init for row-wise cost 0.800146 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 30074
[LightGBM] [Info] Number of data points in the train set: 641805, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498186 -> initscore=-0.007258
[LightGBM] [Info] Start training from score -0.007258
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and dept

LGBMClassifier(bootstrap=True, max_depth=8, max_leaf_nodes=158,
               max_samples=1939, min_samples_leaf=4, min_samples_split=28,
               n_estimators=243, verbose=2)

In [48]:
y_pred = my_clf.predict(valid_X)
accuracy_score(valid_y, y_pred)

0.5004871660223843

In [49]:
y_pred = my_clf.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

0.4999728768227981

In [50]:
y_pred = rfc_sklearn.predict(valid_X)
accuracy_score(valid_y, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:    6.8s finished


0.5403936554534123

In [51]:
y_pred = rfc_sklearn.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:    7.1s finished


0.5575579520992007

In [52]:
y_pred = lightgbm_rfc.predict(valid_X)
accuracy_score(valid_y, y_pred)

0.7195189077358168

In [53]:
y_pred = lightgbm_rfc.predict_proba(valid_X)
roc_auc_score(valid_y, y_pred[:, 1])

0.7853847066619809

In [55]:
%timeit my_clf.predict_proba(impured_test)

1min 40s ± 5.91 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [56]:
%timeit rfc_sklearn.predict_proba(impured_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:    9.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:   11.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:   14.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:    9.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent work

11.4 s ± 1.82 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed:   10.1s finished


In [57]:
%timeit lightgbm_rfc.predict_proba(impured_test)

5.6 s ± 1.72 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
class MyEnsemble:
    def __init__(self, n_estimators=100, criterion='gini',
                 max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                 max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=True, random_state=None, max_samples=None):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        # self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.random_state = random_state
        self.max_samples = max_samples

        self.my_random_forest = MyRandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                                         max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                                         min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                                         max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                                         min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                                         random_state=self.random_state, max_samples=self.max_samples)
        
        self.sklearn_rfc = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                                  max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                                  min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                                  max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                                  min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                                  random_state=self.random_state, max_samples=self.max_samples)
        
        self.lightgbm_rfc = LGBMClassifier(n_estimators=self.n_estimators, criterion=self.criterion, 
                                           max_depth=self.max_depth, min_samples_split=self.min_samples_split, 
                                           min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, 
                                           max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, 
                                           min_impurity_decrease=self.min_impurity_decrease, bootstrap=self.bootstrap, 
                                           random_state=self.random_state, max_samples=self.max_samples)
        
    def fit(self, x, y):
        samples_count = x.shape[0]
        if self.max_samples is not None:
            samples_count = self.max_samples

        bootstrapped_x = pd.DataFrame()
        bootstrapped_y = pd.DataFrame()

        bootstrapped_rows = np.random.choice(x.shape[0], samples_count, replace=self.bootstrap)
        bootstrapped_x = x.iloc[bootstrapped_rows]
        bootstrapped_y = y.iloc[bootstrapped_rows]
        
        self.my_random_forest.fit(bootstrapped_x, bootstrapped_y)
        self.sklearn_rfc.fit(bootstrapped_x, np.ravel(bootstrapped_y))
        self.lightgbm_rfc.fit(bootstrapped_x, np.ravel(bootstrapped_y))

    def predict(self, x):
        ones = [0 for i in range(x.shape[0])]

        predictions = list(self.my_random_forest.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val
            
        predictions = list(self.sklearn_rfc.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val

        predictions = list(self.lightgbm_rfc.predict(x))
        for i, val in enumerate(predictions):
            ones[i] += val
    
        predictions = [1 if x > 1 else 0 for x in ones]
        return pd.Series(predictions)
    
    def predict_proba(self, x):
        ones_probabilities = [0 for i in range(x.shape[0])]

        predictions = self.my_random_forest.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val
            
        predictions = self.sklearn_rfc.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val

        predictions = self.lightgbm_rfc.predict_proba(x)
        for i, val in enumerate(predictions[:, 1]):
            ones_probabilities[i] += val
        
        ones_probabilities = [x / 3 for x in ones_probabilities]
        zeroes_probabilities = [1 - p for p in ones_probabilities]
        probabilities = np.array([zeroes_probabilities, ones_probabilities])
        return probabilities.transpose()

In [59]:
my_ensemble = MyEnsemble(**clf.best_params_, bootstrap=True)

In [61]:
my_ensemble.fit(train_X, train_y)

In [62]:
y_pred = my_ensemble.predict(valid_X)

In [63]:
accuracy_score(valid_y, y_pred)

0.5178827891203807

In [64]:
y_pred = my_ensemble.predict_proba(valid_X)

In [65]:
roc_auc_score(valid_y, y_pred[:, 1])

0.5229143593613285

In [66]:
# y_pred = lightgbm_rfc.predict_proba(impured_test)
# y_pred = rfc_sklearn.predict_proba(impured_test)
y_pred = my_clf.predict_proba(impured_test)
# y_pred = my_ensemble.predict_proba(impured_test)

In [67]:
y_pred = pd.DataFrame(data={
    'id': test_data.id, 
    'claim': y_pred[:, 1]}, index=impured_test.index)

In [68]:
y_pred

Unnamed: 0,id,claim
0,957919,0.542774
1,957920,0.513999
2,957921,0.640871
3,957922,0.465769
4,957923,0.560972
...,...,...
493469,1451388,0.000000
493470,1451389,0.000000
493471,1451390,0.000000
493472,1451391,0.000000


In [69]:
pd.DataFrame(y_pred).to_csv('my_clf_solution.csv',
        columns=['id', 'claim'], index=False)