In [None]:
import itertools
import math
import random
import sys
import time
import warnings

# Silence library warnings before the heavy third-party imports run.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xgboost as xgb
from mlxtend.classifier import StackingCVClassifier,StackingClassifier
from mlxtend.regressor import StackingCVRegressor,StackingRegressor
from pygam import GAM,LogisticGAM
from scipy import stats
from scipy.special import expit
from sklearn.ensemble import  AdaBoostClassifier,AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neural_network import MLPClassifier,MLPRegressor

xgb.set_config(verbosity=0)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
class DRModel():
    def __init__(self):
        self.binary_model_lst = None
        self.continue_model_lst = None
        self.train_df = None
        self.test_df = None 
    def data_loadin(self,train_df,test_df):
        self.train_df = train_df
        self.test_df = test_df 
    def ml_models(self,binary_models,continue_models):
        self.binary_model_lst = binary_models
        self.continue_model_lst = continue_models
        
    def KFold_val(self,train=None,indep_var=None,dep_var=None,model=None,model_type = 'binary'):
        X, Y = pd.DataFrame(train[indep_var]), pd.DataFrame(train[dep_var])
        if model_type == 'binary':
            scores = cross_val_score(model, X, Y, cv=3, scoring='accuracy')
        elif model_type == 'continue':
            scores = cross_val_score(model, X, Y, cv=3, scoring='neg_mean_squared_error')
#         print (str(model)+ str(np.mean(scores)))
        return np.mean(scores)      
    
    def stacking(self,formula,train=None,test=None,return_proba=False,model_type = 'binary'):
        indep_var = formula.split('~')[-1].replace(' ','').split('+')
        dep_var = formula.split('~')[0].replace(' ','')
        stacking_df = pd.DataFrame()
        stacking_test_df = pd.DataFrame()
        if model_type == 'binary':
            model_lst = self.binary_model_lst
        elif model_type == 'continue':
            model_lst = self.continue_model_lst 
        for model in model_lst:
            result = np.array([])
            for train_idx,text_idx in KFold(n_splits=3).split(train):
                model1 = model
                X_train, X_test = train.loc[train_idx,indep_var], train.loc[text_idx,indep_var]
                y_train, y_test = train.loc[train_idx,dep_var], train.loc[text_idx,dep_var]
                result = np.append(result,model1.fit(X_train,y_train).predict(X_test))
            stacking_df[str(model).split('(')[0]] = result
            model.fit(train[indep_var],train[dep_var])
            stacking_test_df[str(model).split('(')[0]] = model.predict(test[indep_var])
        stacking_df['label'] = train[dep_var]
        
        if model_type == 'binary':
            meta_model = LogisticRegression()
            meta_model.fit(stacking_df[stacking_df.columns[:-1]],stacking_df['label'])
            if return_proba==True:
                res = meta_model.predict_proba(stacking_test_df)[:,1]
            else:
                res = meta_model.predict(stacking_test_df)
        elif model_type == 'continue':
            meta_model = LinearRegression()
            meta_model.fit(stacking_df[stacking_df.columns[:-1]],stacking_df['label'])
            res = meta_model.predict(stacking_test_df)
        return res
            
        
    def ps_Model(self,formula,ml_method=False,model_type='binary'):
        indep_var = formula.split('~')[-1].replace(' ','').split('+')
        treatment_var = formula.split('~')[0].replace(' ','')
        # calculate propensity score
        start = time.perf_counter()
        if ml_method==True:
            if model_type=='binary':
                mods = self.binary_model_lst.copy()[1:]
            elif model_type == 'continue':
                mods = self.continue_model_lst.copy()[1:]
            ml_val_lst = []
            for model1 in mods:
                ml_val_lst.append(dr.KFold_val(self.train_df,indep_var,treatment_var,model1))
            model = mods[np.argmax(ml_val_lst)]
#             model = BPNN_psmodel
            print ('ps'+str(model)[:8])
            model.fit(self.train_df[indep_var],self.train_df[treatment_var])
            try:
                try:
                    propensity_score = model.predict_proba(self.test_df[indep_var])[:,1]
                except:
                    propensity_score = model.predict_proba(self.test_df[indep_var])
            except:
                propensity_score = model.predict(self.test_df[indep_var])[:,1]
            propensity_score = np.where(propensity_score<0.005,0.005,propensity_score)
            propensity_score = np.where(propensity_score>0.995,0.995,propensity_score)
        else:
            if model_type=='binary':
                model = LogisticRegression()
            elif model_type == 'continue':
                model = LinearRegression()
            model.fit(self.train_df[indep_var],self.train_df[treatment_var])
            propensity_score = model.predict_proba(self.test_df[indep_var])[:,1]
        return propensity_score,time.perf_counter()-start 
            
    def rsp_Model(self,formula,ml_method=False,model_type='binary'):
        indep_var = formula.split('~')[-1].replace(' ','').split('+')
        response_var = formula.split('~')[0].replace(' ','')
        train = self.train_df.copy()
        test = self.test_df.copy()
        start = time.perf_counter()
        if ml_method==True:
            if model_type=='binary':
                mods = self.binary_model_lst.copy()[1:]
                stacking_model = StackingClassifier(self.binary_model_lst[1:],LG_psmodel)
            elif model_type == 'continue':
                mods = self.continue_model_lst.copy()[1:]
                stacking_model = StackingRegressor(self.continue_model_lst[1:],Linear_rspmodel)            
            mods.append(stacking_model)
            ml_val_lst = []
            for model1 in mods:
                ml_val_lst.append(dr.KFold_val(train,indep_var,response_var,model1,model_type=model_type))
            model = mods[np.argmax(ml_val_lst)]
        else:
            if model_type=='binary':
                model = LogisticRegression()
            elif model_type == 'continue':
                model = LinearRegression()
        model.fit(train[indep_var],train[response_var])
        test['Treatment']=0
        test.reset_index(drop=True,inplace=True)
        u0_x = model.predict(test[indep_var])
        test['Treatment']=1
        test.reset_index(drop=True,inplace=True)
        u1_x = model.predict(test[indep_var])        
        return u0_x,u1_x,time.perf_counter()-start 
