In [8]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import * 
from sklearn.pipeline import *
from sklearn.model_selection import * 
from sklearn.compose import ColumnTransformer, make_column_transformer
from category_encoders import BinaryEncoder

In [3]:
### Switch to the project directory so the relative data paths used below resolve.
### The location can be overridden with the PROJECT_II_DIR environment variable;
### without it the original machine-specific path is used.
os.chdir(os.environ.get("PROJECT_II_DIR", "D:/Project_II"))

In [4]:
### load cleaned pickle file (relative path, resolved against the current working directory)
### NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source
df = pd.read_pickle("reduced_df.pkl") 

In [5]:
# Preview the first rows to sanity-check the loaded frame.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,EmployeeBank,ResidenceCountry,Sex,Age,first_contract_date,new_customer,seniority_time,cust_type,cust_relation_type,foreigner_birth,...,Credit_Card_target,Securities_target,Home_Account_target,Payroll_target,Pension_target,Direct_Debit_target,new_ones,leave_ones,num_of_added_products,Target
Date,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-12-28,1471649.0,2,0.0,1,20.0,82.0,1.0,2.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-28,1433168.0,2,0.0,1,20.0,262.0,0.0,8.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-28,1441129.0,2,0.0,1,32.0,239.0,0.0,8.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2015-02-28,1338660.0,2,0.0,0,42.0,129.0,0.0,9.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-28,1328177.0,2,0.0,0,27.0,568.0,0.0,18.0,1.0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#### Split df into features / targets, then into train / test sets.

# Features: the first 39 columns plus "new_ones" and "leave_ones".
col = [*df.columns[:39], "new_ones", "leave_ones"]
X = df[col].copy()
# Targets: every column from "Saving_Account_target" through "Direct_Debit_target"
# (label-based slice, both ends inclusive).
Y = df.loc[:, "Saving_Account_target":"Direct_Debit_target"].copy()

# Hold out roughly one third of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.333,
    random_state=123,
)

In [15]:
### define transformers compatible with sklearn pipelines
### Discretizer takes continuous features as input and bins each one into quantile buckets (quintiles for cuts=5)
from sklearn.base import BaseEstimator, TransformerMixin

class Discretizer(BaseEstimator,TransformerMixin):
    """Quantile-bin every column of a frame into ``cuts`` labelled buckets.

    The bin edges are learned in ``fit`` and re-used in ``transform``, so data
    seen later (cross-validation folds, the test set) is bucketed with the
    quantiles of the data the transformer was fitted on instead of its own.

    Parameters
    ----------
    cuts : int
        Number of quantile buckets per column (5 gives quintiles). The
        buckets are labelled "0" ... str(cuts - 1).

    Attributes
    ----------
    bin_edges_ : dict
        Maps each fitted column label to its array of ``cuts + 1`` bin edges.
        The two outer edges are widened to -inf / +inf so that unseen values
        outside the fitted range fall into the first / last bucket instead of
        becoming NaN.
    """

    def __init__(self,cuts):
        self.cuts = cuts

    def fit(self,X,y=None):
        """Learn the quantile edges of every column of ``X``.

        Raises ValueError (from ``pd.qcut``) when a column yields duplicate
        quantile edges.
        """
        frame = pd.DataFrame(X)
        edges_by_column = {}
        for column in frame.columns:
            # retbins=True returns the computed quantile edges next to the binned data
            _, edges = pd.qcut(frame[column], self.cuts, retbins=True)
            edges = np.array(edges, dtype=float)
            edges[0], edges[-1] = -np.inf, np.inf
            edges_by_column[column] = edges
        self.bin_edges_ = edges_by_column
        return self

    def transform(self,X,y=None):
        """Return a frame in which every column is replaced by its bucket label."""
        labelS = [str(ii) for ii in range(0,self.cuts)]
        frame = pd.DataFrame(X)

        # pd.cut with the stored edges reproduces the qcut assignment on the
        # fitted data (right-closed intervals) and applies it to new data
        out = frame.apply(
            lambda column: pd.cut(column, bins=self.bin_edges_[column.name], labels=labelS),
            axis=0,
        )
        return out


In [19]:
### Preprocessing + model pipeline (this is only an example of the pipelines):
###   1. discretize the continuous features into categoricals,
###   2. binary-encode those categorical columns,
###   3. fit a random forest on the result.

# The estimator used to be bound to `cfl` while the pipeline referenced `clf`,
# which raises NameError on a fresh kernel; both names now point to the same object.
# NOTE(review): the forest has no random_state, so results vary between runs.
clf = RandomForestClassifier(n_jobs=-1)
cfl = clf  # alias kept for any later cell that still uses the old spelling
preprocess = make_column_transformer(
        (Discretizer(cuts=5),["Age","gross_income","first_contract_date","seniority_time"]),remainder="passthrough")
# The column transformer emits the discretized columns first and the passthrough
# remainder after them, so positions 0-3 are the columns to binary-encode.
pipe = make_pipeline(preprocess, BinaryEncoder(cols=[0,1,2,3]),clf)



In [20]:
### Inputs for GridSearchCV: the hyper-parameter grid and the scoring functions.

# Grid over the number of trees of the random forest step of the pipeline.
parameters = {
    "randomforestclassifier__n_estimators": [3, 5, 10, 100],
}
# Two metrics are recorded for every candidate: ROC AUC and accuracy.
scoring1 = {
    'AUC': 'roc_auc',
    'Accuracy': make_scorer(accuracy_score),
}

In [21]:
## Initialize GridSearchCV with 2-fold cross-validation; the final model is refit using the AUC metric.
cc = GridSearchCV(
    pipe,
    param_grid=parameters,
    cv=2,
    scoring=scoring1,
    refit="AUC",
    n_jobs=-1,
)

In [22]:
### Fit GridSearchCV on the training data for one selected target (Direct_Debit).
ys = Y_train["Direct_Debit_target"]

cc.fit(x_train, ys)


GridSearchCV(cv=2, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('discretizer',
                                                                         Discretizer(cuts=5),
                                                                         ['Age',
                                                                          'gross_income',
                                                                          'first_contract_date',
                                                                          'seniority_time'

In [23]:
### inspect the parameter setting selected by the grid search
cc.best_params_

{'randomforestclassifier__n_estimators': 5}

In [29]:
### Replace the plain K-fold with a custom, down-sampling CV generator.
### custom_cv_1 draws stratified shuffle splits; for every split it keeps all positive
### training samples and pairs them with a random draw of non-positive training samples
### (alternation2 times as many as there are positives). The draw is repeated stop_crit
### times per split, each time yielding the rebalanced train indices together with the
### unchanged test indices of that split.
def custom_cv_1(X,y,n_splits=10,test_size=0.3,alternation1=1,alternation2=1.2,stop_crit=2,random_state=123):
    """Yield (train_indices, test_indices) pairs with a down-sampled majority class.

    Parameters
    ----------
    X : array-like
        Feature matrix; only handed to the splitter.
    y : array-like of shape (n_samples,)
        Target vector; entries equal to 1 form the positive class.
    n_splits : int
        Number of stratified shuffle splits to draw.
    test_size : float or int
        Forwarded to StratifiedShuffleSplit.
    alternation1 : number
        Currently unused; kept so existing calls keep working.
    alternation2 : float
        Number of sampled non-positive training rows per positive training row.
    stop_crit : number
        How many re-sampled training sets are yielded per split.
    random_state : int, numpy RandomState instance or None
        Seeds both the splitter and the sampling of non-positive rows
        (previously it was ignored and the sampling was unseeded).

    Yields
    ------
    tuple of (ndarray, ndarray)
        Positional train indices (all positives followed by the sampled
        non-positives) and the positional test indices of the split.
    """
    # The splitter returns positional indices; index a plain array so that a
    # pandas Series is not accidentally queried by label.
    y_values = np.asarray(y)
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    sss = StratifiedShuffleSplit(n_splits=n_splits,random_state=random_state,test_size=test_size)
    for train_index, test_index in sss.split(X,y):
        is_positive = y_values[train_index] == 1
        targets_ix = train_index[is_positive]
        remainIX = train_index[~is_positive]
        n_sampled = int(is_positive.sum() * alternation2)
        i = 0
        while i < stop_crit:
            # drawn with replacement (the default of choice), as before
            comIX = rng.choice(remainIX, size=n_sampled)
            new_train = np.concatenate((targets_ix, comIX))
            yield new_train, test_index
            i += 1




In [31]:
# Build the down-sampling splits and hand them to a fresh grid search.
# custom_cv_1 is a generator function, so cva is single-use: re-run this cell
# before fitting a second time.
cva = custom_cv_1(
    x_train,
    ys,
    n_splits=3,
    test_size=0.3,
    alternation1=1,
    alternation2=2.3,
    random_state=123,
)
cc = GridSearchCV(
    pipe,
    param_grid=parameters,
    cv=cva,
    scoring=scoring1,
    refit="AUC",
    n_jobs=-1,
)

In [None]:

# Run the grid search again, this time evaluated on the splits produced by custom_cv_1.
cc.fit(x_train,ys)