In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2,f_classif,SelectFromModel
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier

In [None]:
# Allow up to 500 columns to be shown when a wide frame is displayed.
pd.set_option('display.max_columns', 500)

# Read the train and test CSVs in one pass; the generator variable does not
# leak into the notebook namespace.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/santander-customer-satisfaction/train.csv',
        '../input/santander-customer-satisfaction/test.csv',
    )
)


In [None]:
# Frequency of each distinct value in var3.
train['var3'].value_counts()

In [None]:
# Overwrite the -999999 entries of var3 with 2 (presumably a missing-value
# placeholder - confirm against the data dictionary), then re-check the counts.
train.loc[train['var3'].eq(-999999), 'var3'] = 2
train['var3'].value_counts()

In [None]:
# Histogram (100 bins) of the natural log of var38.
plt.hist(x=np.log(train['var38']), bins=100)

In [None]:
# Pairwise Pearson correlation (the pandas default) between the columns of train.
mat = train.corr(method='pearson')

In [None]:
# Table of each column's correlation with TARGET, strongest positive first.
x = (
    pd.DataFrame({'columns': train.columns, 'corr': mat['TARGET'].values})
    .set_index('columns', drop=True)
    .sort_values(by='corr', ascending=False)
)
# Show the 30 highest correlations.
x.head(30)

In [None]:
# Overlaid histograms of var36, one colour per TARGET class.
# `height` replaces the old `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
sns.FacetGrid(train, hue='TARGET', height=10).map(plt.hist, 'var36', bins=100).add_legend()

In [None]:
# Frequency of each distinct value in var38.
train.loc[:, 'var38'].value_counts()

In [None]:
# Histogram of log(var38) for the rows whose var38 is NOT (approximately)
# 117310.979016, the value this notebook treats as the most common one.
(
    train.loc[np.logical_not(np.isclose(train['var38'], 117310.979016)), 'var38']
    .map(np.log)
    .hist(bins=100)
)

In [None]:
# log_var38 holds log(var38), except that rows sitting at 117310.979016 get 0.
train['log_var38'] = (
    train['var38']
    .map(np.log)
    .where(~np.isclose(train['var38'], 117310.979016), 0)
)

In [None]:
# Boolean flag: True where var38 is (approximately) 117310.979016.
train['var38_hasmostcommon'] = np.isclose(train['var38'], 117310.979016)

In [None]:
# Separate the label column from the features.
y = train.loc[:, 'TARGET']
train = train.drop(columns=['TARGET'])

In [None]:
# Fit a 100-tree random forest on every remaining column of train.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X=train, y=y)

In [None]:
# Random-forest feature importances, largest first.
feat = (
    pd.DataFrame({'columns': train.columns, 'imp': rf.feature_importances_})
    .set_index('columns', drop=True)
    .sort_values('imp', ascending=False)
)
# Bar chart of the 25 most important features.
x = feat.iloc[:25]
x.plot.bar()

In [None]:
from sklearn.feature_selection import VarianceThreshold


In [None]:
# Histogram of log(var38) on the test set, excluding rows whose var38 is
# (approximately) 117310.979016.
(
    test.loc[np.logical_not(np.isclose(test['var38'], 117310.979016)), 'var38']
    .map(np.log)
    .hist(bins=100)
)

In [None]:
# Build the same two derived var38 columns on the test set as on train:
# log_var38 is log(var38) with rows at 117310.979016 set to 0 ...
test['log_var38'] = (
    test['var38']
    .map(np.log)
    .where(~np.isclose(test['var38'], 117310.979016), 0)
)
# ... and var38_hasmostcommon flags those same rows.
test['var38_hasmostcommon'] = np.isclose(test['var38'], 117310.979016)

In [None]:
# Row counts of the two frames.
n_train, n_test = train.shape[0], test.shape[0]

In [None]:
# Stack train on top of test; reset_index moves the old row labels into an
# 'index' column and renumbers the rows from 0.
all_df = pd.concat([train, test], axis=0).reset_index(drop=False)

In [None]:
# Discard the 'index' column (modifies all_df in place).
all_df.drop(columns=['index'], inplace=True)

In [None]:
# Overwrite the -999999 entries of var3 with 2 across the combined frame.
all_df.loc[all_df['var3'].eq(-999999), 'var3'] = 2

In [None]:
# Fit a zero-variance filter on the combined frame.
sel = VarianceThreshold(threshold=0).fit(all_df)
# Columns the filter rejects, i.e. those outside its support mask.
cols = all_df.columns[~sel.get_support()].tolist()

In [None]:
# Remove the columns listed in cols (modifies all_df in place).
all_df.drop(columns=cols, inplace=True)

In [None]:
# Keep the test IDs before the column is removed. The name shadows the builtin
# `id`; it is kept because other cells may refer to it.
id = test['ID']

# Cut the combined frame back into its train and test rows by position, and
# drop ID from these frames too. They are the ones split and passed to the
# model in the following cells, and previously they still carried the row
# identifier as a feature because only `train`/`test` had it removed.
train1 = all_df.iloc[:n_train, :].drop('ID', axis=1)
test1 = all_df.iloc[n_train:, :].drop('ID', axis=1)

train = train.drop('ID', axis=1)
test = test.drop('ID', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of train1 / y with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    train1,
    y,
    test_size=0.2,
    random_state=8,
    stratify=y,
)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
# Wrap both parts of the split as xgboost DMatrix objects with their labels.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

In [None]:
def modelfit(alg, dtrain, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain``/``y`` and print its training accuracy and AUC.

    Parameters
    ----------
    alg
        Estimator providing ``fit``, ``predict`` and ``predict_proba``; when
        ``useTrainCV`` is true it must also provide ``get_xgb_params``,
        ``get_params`` and ``set_params``.
    dtrain
        Training features, passed to ``alg.fit`` / ``alg.predict``.
    y
        Training labels.
    useTrainCV
        When true, run ``xgb.cv`` first and set ``n_estimators`` on ``alg`` to
        the number of rows in the CV result.
    cv_folds
        Number of folds given to ``xgb.cv``.
    early_stopping_rounds
        Early-stopping patience given to ``xgb.cv``.

    Returns
    -------
    None. The report is printed.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()

        # xgb.cv works on a DMatrix, while alg.fit/predict below take the raw
        # features, so build the DMatrix here unless one was passed in.
        if isinstance(dtrain, xgb.DMatrix):
            cv_data = dtrain
        else:
            cv_data = xgb.DMatrix(dtrain, label=y)

        cvresult = xgb.cv(
            xgb_param,
            cv_data,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # NOTE(review): recent xgboost releases expect eval_metric on the estimator
    # constructor rather than fit() - confirm against the installed version.
    alg.fit(dtrain, y, eval_metric='auc')

    # Predict on the training set.
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:, 1]

    # Print the model report. The placeholders are str.format fields; the
    # previous '%'-style placeholders were never substituted by .format().
    print("\nModel Report")
    print("Accuracy : {:.4g}".format(metrics.accuracy_score(y, dtrain_predictions)))
    print("AUC Score (Train): {:f}".format(metrics.roc_auc_score(y, dtrain_predprob)))

In [None]:
# Starting booster parameters for the xgboost runs below.
params = dict(
    max_depth=5,
    min_child_weight=1,
    eta=0.1,
    subsample=0.80,
    colsample_bytree=0.80,
    reg_alpha=0,
    reg_lambda=0,
    # Other parameters
    objective='binary:logistic',
)

In [None]:
# Score with AUC and train with early stopping against the held-out DMatrix.
params.update(eval_metric='auc')
num_boost_round = 4000
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=50,
)
# best_iteration is reported here as a count, hence the +1.
print("Best Auc : {:.2f} with {} rounds".format(model.best_score, model.best_iteration + 1))

In [None]:
# Reuse the best round count from the run above for the CV calls that follow.
num_boost_round = 1 + model.best_iteration

In [None]:
# 5-fold cross-validated AUC for the current parameters.
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    nfold=5,
    metrics='auc',
    early_stopping_rounds=50,
    seed=10,
)

In [None]:
# Highest mean test AUC reached across the CV rounds.
cv_results.loc[:, 'test-auc-mean'].max()

In [None]:
# Coarse grid search over max_depth and min_child_weight, scored by the best
# mean test AUC from 5-fold xgb.cv.
gs_params = [
    (max_depth, min_child_weight)
    for max_depth in range(4, 13, 2)
    for min_child_weight in range(0, 5, 1)
]

# AUC is maximised, so the running best must start at -inf. Starting at +inf
# meant no candidate could ever beat it, best_params stayed None and the final
# print raised a TypeError.
max_auc = float("-Inf")
best_params = None
for max_depth, min_child_weight in gs_params:
    print('Max depth: {} ,min_child_weight : {}'.format(max_depth, min_child_weight))
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    cv = xgb.cv(params, dtrain, num_boost_round=num_boost_round, nfold=5,
                metrics='auc', early_stopping_rounds=40, seed=10)
    maxauc = cv['test-auc-mean'].max()
    boost_round = cv['test-auc-mean'].idxmax()
    # idxmax gives the row label of the best round (presumably 0-based, as in
    # the default xgb.cv result), so the number of rounds is one more.
    print("\tAUC {} for {} rounds".format(maxauc, boost_round + 1))
    if maxauc > max_auc:
        max_auc = maxauc
        best_params = (max_depth, min_child_weight)
print('best params : {} , {} , AUC : {:.4f}'.format(best_params[0], best_params[1], max_auc))
    

In [None]:
# Finer grid search over max_depth (3-5) and min_child_weight (1-2), scored by
# the best mean test AUC from 5-fold xgb.cv.
gs_params = [
    (max_depth, min_child_weight)
    for max_depth in range(3, 6, 1)
    for min_child_weight in range(1, 3, 1)
]

# AUC is maximised, so the running best must start at -inf. Starting at +inf
# meant no candidate could ever beat it, best_params stayed empty and the
# final print raised an IndexError.
max_auc = float("-Inf")
best_params = ()
for max_depth, min_child_weight in gs_params:
    print('Max depth: {} ,min_child_weight : {}'.format(max_depth, min_child_weight))
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    cv = xgb.cv(params, dtrain, num_boost_round=num_boost_round, nfold=5,
                metrics='auc', early_stopping_rounds=40, seed=10)
    maxauc = cv['test-auc-mean'].max()
    boost_round = cv['test-auc-mean'].idxmax()
    # idxmax gives the row label of the best round (presumably 0-based, as in
    # the default xgb.cv result), so the number of rounds is one more.
    print("\tAUC {} for {} rounds".format(maxauc, boost_round + 1))
    if maxauc > max_auc:
        max_auc = maxauc
        best_params = (max_depth, min_child_weight)
print('best params : {} , {} , AUC : {:.4f}'.format(best_params[0], best_params[1], max_auc))
    

In [None]:
# Fix the tree-shape parameters for the remaining runs.
params.update(max_depth=5, min_child_weight=2)


In [None]:
# Retrain with early stopping against the held-out DMatrix using the updated params.
num_boost_round = 4000
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=50,
)
# best_iteration is reported here as a count, hence the +1.
print("Best Auc : {:.2f} with {} rounds".format(model.best_score, model.best_iteration + 1))

In [None]:
# Reuse the best round count from the run above for the CV calls that follow.
num_boost_round = 1 + model.best_iteration

In [None]:
# Grid search over subsample and colsample_bytree (0.6 .. 1.0 in steps of 0.1),
# scored by the best mean test AUC from 5-fold xgb.cv.
gs_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(6, 11)]
    for colsample in [i/10. for i in range(6, 11)]
]

# AUC is maximised, so the running best must start at -inf. Starting at +inf
# meant no candidate could ever beat it, best_params stayed empty and the
# final print raised an IndexError.
max_auc = float("-Inf")
best_params = ()
for subsample, colsample in gs_params:
    # Label the values with the parameters actually being searched here.
    print('subsample: {} ,colsample_bytree : {}'.format(subsample, colsample))
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    cv = xgb.cv(params, dtrain, num_boost_round=num_boost_round, nfold=5,
                metrics='auc', early_stopping_rounds=40, seed=10)
    maxauc = cv['test-auc-mean'].max()
    boost_round = cv['test-auc-mean'].idxmax()
    # idxmax gives the row label of the best round (presumably 0-based, as in
    # the default xgb.cv result), so the number of rounds is one more.
    print("\tAUC {} for {} rounds".format(maxauc, boost_round + 1))
    if maxauc > max_auc:
        max_auc = maxauc
        best_params = (subsample, colsample)
print('best params : {} , {} , AUC : {:.4f}'.format(best_params[0], best_params[1], max_auc))
    