In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel,RFECV
from sklearn.svm import SVC, LinearSVC
import joblib
from sklearn.model_selection import StratifiedKFold

In [2]:
# Directory holding the pre-split feather files. Forward slashes resolve on
# both Windows and POSIX; the earlier backslash literals depended on invalid
# escape sequences (e.g. "\d"), which newer Python versions warn about.
DATA_DIR = 'datasets/dataset_noiseoof2'

# Train / validation features and labels, plus the unlabeled test features.
X_train = pd.read_feather(f'{DATA_DIR}/X_train.feather')
X_val = pd.read_feather(f'{DATA_DIR}/X_val.feather')
y_train = pd.read_feather(f'{DATA_DIR}/y_train.feather')
y_val = pd.read_feather(f'{DATA_DIR}/y_val.feather')
X_test = pd.read_feather(f'{DATA_DIR}/X_test.feather')

In [22]:
# Keep the test-set identifiers so they can be attached to the predictions.
# NOTE(review): this name shadows the builtin id(); it is left unchanged
# because a later cell reads it when building the submission frame.
id = X_test.loc[:, 'ID']

## Models

In [5]:
# Seed shared by every learner, and the settings common to both LightGBM variants.
RANDOM_SEED = 42
LGBM_SHARED = dict(
    num_leaves=64,
    random_state=RANDOM_SEED,
    objective='multiclass',
    metric='multi_error',
)

# 1. CatBoost, GPU task type, scored by accuracy
clf_cat = CatBoostClassifier(
    task_type='GPU',
    eval_metric='Accuracy',
    n_estimators=1000,
    learning_rate=0.05,
    random_seed=RANDOM_SEED,
)

# 2. LightGBM with the standard GBDT booster
clf_lgb1 = lgb.LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=1000,
    learning_rate=0.05,
    **LGBM_SHARED,
)

# 3. LightGBM with the DART booster (fewer rounds, smaller learning rate)
clf_lgb2 = lgb.LGBMClassifier(
    boosting_type='dart',
    n_estimators=200,
    learning_rate=0.035,
    **LGBM_SHARED,
    verbosity=1,
)

# 4. XGBoost with the GPU histogram tree method; early stopping is set on the estimator
clf_xgb1 = xgb.XGBClassifier(
    tree_method='gpu_hist',
    n_estimators=500,
    learning_rate=0.05,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    eval_metric='merror',
    early_stopping_rounds=50,
    disable_default_eval_metric=True,
)

In [6]:
# Validation pair handed to every fit() call for monitoring / early stopping.
# The labels are loaded as a one-column DataFrame; flattening them to 1-D
# avoids the "column-vector y was passed" conversion warning during fitting.
eval_set = [(X_val, y_val.to_numpy().ravel())]

In [7]:
# Train CatBoost while monitoring the validation pair, with an
# early-stopping patience of 50 rounds.
clf_cat.fit(
    X=X_train,
    y=y_train,
    eval_set=eval_set,
    early_stopping_rounds=50,
)

0:	learn: 0.8161850	test: 0.8165891	best: 0.8165891 (0)	total: 14.4ms	remaining: 14.4s
1:	learn: 0.8164574	test: 0.8171012	best: 0.8171012 (1)	total: 30.4ms	remaining: 15.2s
2:	learn: 0.8166426	test: 0.8167961	best: 0.8171012 (1)	total: 48.7ms	remaining: 16.2s
3:	learn: 0.8166181	test: 0.8170358	best: 0.8171012 (1)	total: 64.8ms	remaining: 16.1s
4:	learn: 0.8169967	test: 0.8177767	best: 0.8177767 (4)	total: 81.1ms	remaining: 16.1s
5:	learn: 0.8169586	test: 0.8178312	best: 0.8178312 (5)	total: 96.7ms	remaining: 16s
6:	learn: 0.8170594	test: 0.8176351	best: 0.8178312 (5)	total: 114ms	remaining: 16.1s
7:	learn: 0.8170975	test: 0.8177004	best: 0.8178312 (5)	total: 132ms	remaining: 16.3s
8:	learn: 0.8171084	test: 0.8178312	best: 0.8178312 (5)	total: 151ms	remaining: 16.7s
9:	learn: 0.8171356	test: 0.8178857	best: 0.8178857 (9)	total: 165ms	remaining: 16.3s
10:	learn: 0.8172010	test: 0.8180164	best: 0.8180164 (10)	total: 178ms	remaining: 16s
11:	learn: 0.8171955	test: 0.8180382	best: 0.81803

<catboost.core.CatBoostClassifier at 0x13daa5dba60>

In [8]:
# Train the GBDT LightGBM model with a 200-round early-stopping patience.
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` fit() keyword is deprecated in LightGBM 3.3 and
# removed in 4.0, whereas the callback works on both.
clf_lgb1.fit(
    X_train,
    y_train.to_numpy().ravel(),  # 1-D labels avoid the column-vector warning
    eval_set=eval_set,
    callbacks=[lgb.early_stopping(stopping_rounds=200)],
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's multi_error: 0.258937
[2]	valid_0's multi_error: 0.258937
[3]	valid_0's multi_error: 0.258937
[4]	valid_0's multi_error: 0.258937
[5]	valid_0's multi_error: 0.258937
[6]	valid_0's multi_error: 0.258937
[7]	valid_0's multi_error: 0.258937
[8]	valid_0's multi_error: 0.248074
[9]	valid_0's multi_error: 0.237179
[10]	valid_0's multi_error: 0.232592
[11]	valid_0's multi_error: 0.228888
[12]	valid_0's multi_error: 0.227068
[13]	valid_0's multi_error: 0.225063
[14]	valid_0's multi_error: 0.21772
[15]	valid_0's multi_error: 0.211477
[16]	valid_0's multi_error: 0.205496
[17]	valid_0's multi_error: 0.2008
[18]	valid_0's multi_error: 0.197597
[19]	valid_0's multi_error: 0.195036
[20]	valid_0's multi_error: 0.192312
[21]	valid_0's multi_error: 0.190743
[22]	valid_0's multi_error: 0.18936
[23]	valid_0's multi_error: 0.18802
[24]	valid_0's multi_error: 0.187181
[25]	valid_0's multi_error: 0.186178
[26]	valid_0's multi_error: 0.185808
[27]	valid_0's multi_error: 0.185481
[28]	valid_0's 

In [9]:
# Train the DART LightGBM model for its full number of boosting rounds.
# LightGBM does not support early stopping in dart mode, so the former
# early_stopping_rounds=200 argument had no effect (and that fit() keyword
# no longer exists in LightGBM 4); the validation pair is still passed so
# the metric keeps being evaluated on it.
clf_lgb2.fit(
    X_train,
    y_train.to_numpy().ravel(),  # 1-D labels avoid the column-vector warning
    eval_set=eval_set,
)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20464
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 208
[LightGBM] [Info] Start training from score -0.299664
[LightGBM] [Info] Start training from score -2.587864
[LightGBM] [Info] Start training from score -2.291048
[LightGBM] [Info] Start training from score -2.493842
[1]	valid_0's multi_error: 0.258937




[2]	valid_0's multi_error: 0.258937
[3]	valid_0's multi_error: 0.258937
[4]	valid_0's multi_error: 0.258937
[5]	valid_0's multi_error: 0.258937
[6]	valid_0's multi_error: 0.258937
[7]	valid_0's multi_error: 0.258937
[8]	valid_0's multi_error: 0.258937
[9]	valid_0's multi_error: 0.258937
[10]	valid_0's multi_error: 0.258937
[11]	valid_0's multi_error: 0.258937
[12]	valid_0's multi_error: 0.258937
[13]	valid_0's multi_error: 0.258937
[14]	valid_0's multi_error: 0.258937
[15]	valid_0's multi_error: 0.258937
[16]	valid_0's multi_error: 0.248118
[17]	valid_0's multi_error: 0.258937
[18]	valid_0's multi_error: 0.242224
[19]	valid_0's multi_error: 0.243422
[20]	valid_0's multi_error: 0.244501
[21]	valid_0's multi_error: 0.250624
[22]	valid_0's multi_error: 0.230021
[23]	valid_0's multi_error: 0.227853
[24]	valid_0's multi_error: 0.204918
[25]	valid_0's multi_error: 0.2064
[26]	valid_0's multi_error: 0.200996
[27]	valid_0's multi_error: 0.197433
[28]	valid_0's multi_error: 0.199438
[29]	valid_

In [10]:
# Train XGBoost against the validation pair; the early-stopping patience
# comes from the estimator's own constructor arguments.
clf_xgb1.fit(
    X=X_train,
    y=y_train,
    eval_set=eval_set,
)

[0]	validation_0-merror:0.18219
[1]	validation_0-merror:0.18213
[2]	validation_0-merror:0.18173
[3]	validation_0-merror:0.18179
[4]	validation_0-merror:0.18175
[5]	validation_0-merror:0.18186
[6]	validation_0-merror:0.18180
[7]	validation_0-merror:0.18179
[8]	validation_0-merror:0.18173
[9]	validation_0-merror:0.18178
[10]	validation_0-merror:0.18202
[11]	validation_0-merror:0.18209
[12]	validation_0-merror:0.18215
[13]	validation_0-merror:0.18215
[14]	validation_0-merror:0.18231
[15]	validation_0-merror:0.18220
[16]	validation_0-merror:0.18220
[17]	validation_0-merror:0.18225
[18]	validation_0-merror:0.18210
[19]	validation_0-merror:0.18227
[20]	validation_0-merror:0.18219
[21]	validation_0-merror:0.18225
[22]	validation_0-merror:0.18216
[23]	validation_0-merror:0.18218
[24]	validation_0-merror:0.18206
[25]	validation_0-merror:0.18204
[26]	validation_0-merror:0.18205
[27]	validation_0-merror:0.18202
[28]	validation_0-merror:0.18195
[29]	validation_0-merror:0.18185
[30]	validation_0-me

## Voting

In [11]:
# (name, classifier) pairs, in the order the voting weights are applied.
estimators = [
    ('clf_cat', clf_cat),
    ('clf_lgb1', clf_lgb1),
    ('clf_lgb2', clf_lgb2),
    ('clf_xgb1', clf_xgb1),
]

In [12]:
# Report each base learner's accuracy on the validation split.
for name, model in estimators:
    val_accuracy = accuracy_score(y_val, model.predict(X_val))
    print(f"{name} :", val_accuracy)

clf_cat : 0.8186265430417398
clf_lgb1 : 0.8184086377651635
clf_lgb2 : 0.8185829619864246
clf_xgb1 : 0.8184631140843075


In [13]:
class Vote:
    '''
    Soft-voting ensemble over classifiers that have already been fitted.

    The class only calls ``predict_proba`` on the supplied models, so they
    can be trained separately beforehand and the weights can be changed
    without refitting anything.

    Parameters
    ----------
    estimators : sequence of (name, classifier) pairs
        Each classifier must expose ``predict_proba(X)``; all of them must
        return arrays of the same shape for the same ``X``.
    use_ : sequence of numbers or bools, optional
        One weight per estimator (``True``/``False`` act as 1/0).
        ``None`` (the default) gives every estimator equal weight.

    Raises
    ------
    ValueError
        If ``use_`` does not have exactly one entry per estimator.
    '''
    def __init__(self, estimators, use_=None):
        self.estimators = estimators
        # Build the default here: a default written in the signature is
        # evaluated once at definition time and cannot see the
        # ``estimators`` argument (it would read a global of that name).
        if use_ is None:
            use_ = [True] * len(estimators)
        if len(use_) != len(estimators):
            raise ValueError(
                "use_ must contain one weight per estimator: got "
                f"{len(use_)} weights for {len(estimators)} estimators"
            )
        self.use_ = use_

    def predict_proba(self, X):
        '''
        Return the weighted average of the estimators' class probabilities.

        The number of classes is taken from the estimators' own output
        rather than being hard-coded.

        Raises
        ------
        ValueError
            If there are no estimators or the weights sum to zero, since
            the average is undefined in both cases.
        '''
        weighted_sum = None
        denom = 0
        for (_, model), weight in zip(self.estimators, self.use_):
            # Force float64 so the result dtype does not depend on which
            # library produced the probabilities.
            contribution = weight * np.asarray(model.predict_proba(X), dtype=float)
            weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
            denom += weight

        if weighted_sum is None or denom == 0:
            raise ValueError("Vote needs at least one estimator with a non-zero weight")

        return weighted_sum / denom

    def predict(self, X):
        '''
        Return, per row, the column index with the highest averaged probability.

        The averaged probabilities are also stored on ``self.ans``.
        NOTE(review): the index equals the class label only if the labels
        are 0..k-1 in the estimators' column order -- confirm for new data.
        '''
        self.ans = self.predict_proba(X)
        return np.argmax(self.ans, axis=1)

In [14]:
# use_ carries one weight per classifier; all ones means every model counts equally.
vote = Vote(estimators, use_=[1] * 4)

In [15]:
# Ensemble predictions for the validation split.
y_valpred = vote.predict(X=X_val)

In [16]:
# Ensemble accuracy on the 20 percent validation data, then on the training data.
val_accuracy = accuracy_score(y_val, y_valpred)
train_accuracy = accuracy_score(y_train, vote.predict(X_train))
val_accuracy, train_accuracy

(0.8185938572502534, 0.8226486530656716)

## Submission

In [23]:
# Pair every test identifier with the ensemble's predicted class.
submission = pd.DataFrame(
    {
        'ID': id,
        'Default_Flag': vote.predict(X_test),
    }
)

In [24]:
# Display the assembled submission frame as a visual check before it is written to disk.
submission

Unnamed: 0,ID,Default_Flag
0,3337446730,0
1,7888784125,0
2,9871378905,2
3,8891869609,0
4,2006443827,2
...,...,...
456650,2739966666,0
456651,9557081325,0
456652,3306267917,0
456653,2090730014,1


In [25]:
# Write the predictions with neither the index nor a header row.
# header=False is the documented boolean form; the previous header=None
# relied on None being treated as a falsy value.
submission.to_csv(
    'New_test oofsquare_4classifierensemble.csv',
    index=False,
    header=False,
)

In [None]:
# joblib.dump(clf_cat,'Classifier Models/clf_cat.pkl')
# joblib.dump(clf_lgb1,'Classifier Models/clf_lgb1.pkl')
# joblib.dump(clf_lgb2,'Classifier Models/clf_lgb2.pkl')
# joblib.dump(clf_xgb1, 'Classifier Models/clf_xgb.pkl')
# joblib.dump(vote,'Classifier Models/voting_classifier.pkl')

In [6]:
# clf_cat = joblib.load('best_model/clf_cat.pkl')
# clf_lgb1 = joblib.load('best_model/clf_lgb1.pkl')
# clf_lgb2 = joblib.load('best_model/clf_lgb2.pkl')
# clf_xgb1 = joblib.load('best_model/clf_xgb.pkl')
# vote = joblib.load('best_model/voting_classifier.pkl')