In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import StackingClassifier

In [2]:
import os
os.chdir('/content/drive/MyDrive/sasha_babuin/data_santander')

In [3]:
train_df = pd.read_csv('data/train 2.csv')
test_df = pd.read_csv('data/test.csv')
X = train_df.drop(['ID_code', 'target'], axis = 1)
y = train_df['target']
test_df.drop(['ID_code'], axis = 1, inplace = True)
scaler = MinMaxScaler()
scaler.fit(X)
scaler.fit(test_df)
X = pd.DataFrame(scaler.transform(X), columns = X.columns)
test_df = pd.DataFrame(scaler.transform(test_df), columns = test_df.columns)


In [4]:
params_rf = {'max_depth' : 3,
          'criterion' : 'entropy',
          'min_samples_split' : 2,
          'n_estimators' : 500,
    'n_jobs' : -1}

    
params_svc = {'kernel' : 'rbf',
              'C':0.01,
              'max_iter' : 7000,
              'probability' : True}

params_knn = {
    'n_neighbors' : 5,
    'n_jobs' : -1
}

params_gbt = {
    'bagging_freq': 5,
    'bagging_fraction': 0.5, ## 
    'boost_from_average': False, ## 
    'boost': 'gbdt', ## 
    'feature_fraction': 0.1, ##
    'learning_rate': 0.01, 
    'max_depth': 5,  
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 16,
    'tree_learner': 'serial',
    'objective': 'binary',
    'num_round' : 500_000, 
    'early_stopping_round' : 3000,
    'n_jobs' : -1
  
}

params_nb = {
    'priors':None, 
    'var_smoothing':1e-09
}

In [5]:
X_train = X.values

y_train = y.values

X_test = test_df.values

In [6]:
SEED = 0

In [7]:
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None, bayes = False):
        # params['random_state'] = seed
        self.clf = clf(**params)
        self.bayes = bayes

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        if self.bayes:
          return self.clf.predict_proba(x)[:, 1].reshape(1,-1)
        return self.clf.predict_proba(x)[:, 1].reshape(1,-1)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    

In [8]:
NFOLDS = 5

kf = StratifiedKFold(n_splits = NFOLDS, shuffle = True, random_state=SEED)
from IPython.display import clear_output

def get_oof(clf, x_train, y_train, x_test, save_results = True, model_name = 'model'):
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        print('Starting {} validation'.format(i))
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]
        print(x_te.shape)
        clf.fit(x_tr, y_tr)
        
        y_test_pred = clf.predict(x_te)
        oof_train[test_index] = y_test_pred
        oof_test_skf[i, :] = clf.predict(x_test)
        
    


    oof_test[:] = oof_test_skf.mean(axis=0)

    if save_results:
      oof_test_pd = pd.DataFrame(oof_test)
      oof_test_pd.to_csv('test_{}.csv'.format(model_name))
      oof_train_pd = pd.DataFrame(oof_train)
      oof_train_pd.to_csv('train_{}.csv'.format(model_name))
    return oof_train, oof_test

In [9]:

rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=params_rf)
# lr = SklearnHelper(clf = LogisticRegression, seed=SEED)
gb = SklearnHelper(clf=lgb.LGBMClassifier, seed=SEED, params=params_gbt)
svc = SklearnHelper(clf=SVC, seed=SEED, params=params_svc)
## knn = SklearnHelper(clf=KNeighborsClassifier, seed=SEED, params=params_knn)
nb = SklearnHelper(clf=GaussianNB, seed=SEED, params = params_nb, bayes = True)


In [10]:
lr = LogisticRegression()

In [11]:
ntrain = X_train.shape[0]
ntest = X_test.shape[0]
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((5, ntest))
svc.fit(X_train, y_train)

y_test_pred = svc.predict(X_test)




In [12]:
y_train_pred = svc.predict(X_train)[0]
y_test_pred = svc.predict(X_test)[0]

In [13]:
roc_auc_score(y_train, y_train_pred)

0.8787149867186069

In [14]:

pd.DataFrame(y_train_pred).to_csv('train_svc.csv')

pd.DataFrame(y_test_pred).to_csv('test_svc.csv')

In [None]:
oof_train[test_index] = y_test_pred
oof_test_skf[i, :] = clf.predict(x_test)




oof_test[:] = oof_test_skf.mean(axis=0)

In [None]:
et_oof_train, et_oof_test = get_oof(svc, X_train, y_train, X_test, model_name = 'svc')

Starting 0 validation
(40000, 200)




AttributeError: ignored

In [None]:
et_oof_train

array([0.08810551, 0.09458323, 0.08832082, ..., 0.10496526, 0.11245042,
       0.09824582])

In [None]:
roc_auc_score(y, et_oof_train)

0.7757138676420438

In [None]:
et_oof_test

array([0.19040552, 0.24665543, 0.04741382, ..., 0.00187707, 0.05012756,
       0.1393923 ])

In [None]:
svc.fit(X_train, y_train)
results = svc.predict(X_train)
results_test = svc.predict(X_test)

In [None]:
pd.DataFrame(results).to_csv('train_svc.csv')
pd.DataFrame(results_test).to_csv('test_svc.csv')

In [None]:
pd.DataFrame(results)

In [None]:
pd.DataFrame(et_oof_train).value_counts()

0.0    200000
dtype: int64

In [None]:
pd.DataFrame(et_oof_train).value_counts()

0.0    189762
1.0     10238
dtype: int64

In [None]:
et_oof_train, et_oof_test = get_oof(rf, X_train, y_train, X_test, model_name = 'rf')


Starting 0 validation
(40000, 200)


KeyboardInterrupt: ignored

In [None]:
et_oof_test = pd.DataFrame(et_oof_test)
et_oof_test.to_csv('test_{}'.format('rf'))

In [None]:
accuracy_score(y_train, et_oof_test.values)

ValueError: ignored

In [None]:
x_cut = np.random.choice(np.arange(len(X)), size = 50_000)

(50000, 200)

In [None]:
et_oof_train, et_oof_test = get_oof(svc, X_train[x_cut], y_train[x_cut], X_test, model_name = 'svc')

Starting 0 validation
(10000, 200)
Starting 1 validation
(10000, 200)
Starting 2 validation
(10000, 200)
Starting 3 validation
(10000, 200)
Starting 4 validation
(10000, 200)


In [None]:
accuracy_score(y_train[x_cut], et_oof_train)

0.89802

In [None]:
np.unique(et_oof_train.reshape(1, -1)[0])


array([0.])

In [None]:
et_oof_train, et_oof_test = get_oof(rf, X_train, y_train, X_test, model_name = 'rf')

Starting 0 validation
(40000, 200)
Starting 1 validation
(40000, 200)
Starting 2 validation
(40000, 200)
Starting 3 validation
(40000, 200)
Starting 4 validation
(40000, 200)
