In [1]:
import os

# Switch the working directory two levels up so that the relative paths used
# by the following cells (the `scr` package import and the 'data/...' CSV
# files) are resolved from there.
# The absolute target is computed only once per kernel session: re-running
# this cell re-enters the same directory instead of climbing two more levels
# on every execution.
if "_NOTEBOOK_PROJECT_ROOT" not in globals():
    _NOTEBOOK_PROJECT_ROOT = os.path.abspath("../../")
os.chdir(_NOTEBOOK_PROJECT_ROOT)

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from scr.util import *

In [4]:
# Read the scaled train and test feature tables, in that order. Both paths
# are relative to the current working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/feature_engineered/null_cat/train_scaled_for_not_gbdt.csv',
        'data/feature_engineered/null_cat/test_scaled_for_not_gbdt.csv',
    )
)

In [5]:
# Names of the df_train columns used as model inputs; the list is applied as
# df_train[feature] right after this definition, so every active entry must
# match a column header of the loaded CSV exactly.
# Entries that are commented out are deliberately excluded from the model.
feature = [
    'Age', 
    'DurationOfPitch', 
    'NumberOfPersonVisiting',
    'NumberOfFollowups', 
    'NumberOfTrips', 
    'MonthlyIncome', 
    # 'ProdTaken' is the prediction target (it is assigned to y below), so it
    # must stay out of the input features.
    #'ProdTaken',
    'Motivation', 
    'EconomicPower', 
    'TripEasier', 
    'SalesPerformance',
    'LivingCost', 
    'EconomicStability', 
    'NumberOfTrips_log', 
    'TravelCost',
    'EconomicSegment', 
    'PackageMatch', 
    'Monetary', 
    # ContractRate_* columns are disabled; the reason is not recorded here.
    # NOTE(review): presumably target-derived rate encodings — confirm before
    # re-enabling any of them.
    #'ContractRate_FM',
    #'ContractRate_G1',
    #'ContractRate_G2', 
    #'ContractRate_G3',
    #'ContractRate_G4',
    #'ContractRate_G5', 
    #'ContractRate_G6',
    # NOTE(review): the remaining names look like dummy/indicator columns
    # derived from categorical fields — TODO confirm against the feature
    # engineering step.
    'TypeofContact_No',
    'TypeofContact_Self Enquiry',
    'CityTier_2',
    'CityTier_3',
    'Occupation_Salaried',
    'Occupation_Small Business',
    'Gender_male', 
    'ProductPitched_Deluxe', 
    'ProductPitched_King',
    'ProductPitched_Standard',
    'ProductPitched_Super Deluxe',
    'PreferredPropertyStar_4',
    'PreferredPropertyStar_5', 
    'Passport_1',
    'PitchSatisfactionScore_2', 
    'PitchSatisfactionScore_3',
    'PitchSatisfactionScore_4',
    'PitchSatisfactionScore_5',
    'Designation_Executive',
    'Designation_Manager',
    'Designation_Senior Manager', 
    'Designation_VP',
    'Marry_Married',
    'Marry_Single', 
    'Car_No Car', 
    'Child_1_child', 
    'Child_2_child',
    'Child_3_child',
    'AgeGroup_20s',
    'AgeGroup_30s',
    'AgeGroup_40s',
    'AgeGroup_50s', 
    'AgeGroup_60s', 
    'TypeofContactNULL_1',
    'Child01_1',
    'IsFamily_1',
    # NOTE(review): 'Freaqency' looks like a misspelling of 'Frequency'. These
    # strings are looked up as column names, so only change the spelling
    # together with the column header in the source data.
    'FreaqencySeg_1',
    'FreaqencySeg_2',
    'MonetarySeg_2',
    'MonetarySeg_3', 
    'MonetarySeg_4', 
    # NOTE(review): the *Null names are presumably missing-value indicator
    # flags for the corresponding raw columns — TODO confirm.
    'AgeNull', 
    'DurationOfPitchNull',
    'NumberOfTripsNull', 
    'MonthlyIncomeNull'
]

# Model inputs are the selected feature columns of the training table; the
# target is its 'ProdTaken' column.
X, y = df_train[feature], df_train['ProdTaken']

In [10]:
# Stratified 5-fold cross-validation of an XGBoost binary classifier.
# `models` collects the trained Booster of each fold and `scores` the matching
# validation ROC-AUC, both in fold order.
models = []
scores = []

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # Very small step size, paired with the large num_boost_round used below.
    'learning_rate': 0.001,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Fixed random_state keeps the fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for tr_idx, va_idx in skf.split(X, y):
    tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
    tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

    # Plain arrays (.values) are handed to XGBoost, so the column names of X
    # are not attached to the DMatrix objects.
    dtrain = xgb.DMatrix(tr_x.values, label=tr_y.values)
    dvalid = xgb.DMatrix(va_x.values, label=va_y.values)
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # No early stopping is configured: every fold trains all 5000 rounds and
    # logs train/valid AUC every 100 rounds.
    # NOTE(review): consider early_stopping_rounds if the validation AUC
    # plateaus; not added here because it would change the trained models.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        evals=watchlist,
        verbose_eval=100)

    # Reuse dvalid instead of building a second DMatrix from the same rows;
    # the labels stored in it are not used for prediction.
    pred = model.predict(dvalid)
    models.append(model)
    score = roc_auc_score(va_y, pred)
    scores.append(score)

print('AUC : ', scores)
# Single summary figure for comparing runs, in addition to the per-fold list.
print('AUC mean : ', np.mean(scores))

[0]	train-auc:0.85294	valid-auc:0.77487


[100]	train-auc:0.91371	valid-auc:0.83426
[200]	train-auc:0.91622	valid-auc:0.83258
[300]	train-auc:0.91885	valid-auc:0.83242
[400]	train-auc:0.92111	valid-auc:0.83324
[500]	train-auc:0.92368	valid-auc:0.83299
[600]	train-auc:0.92583	valid-auc:0.83291
[700]	train-auc:0.92807	valid-auc:0.83349
[800]	train-auc:0.93034	valid-auc:0.83388
[900]	train-auc:0.93217	valid-auc:0.83383
[1000]	train-auc:0.93441	valid-auc:0.83413
[1100]	train-auc:0.93643	valid-auc:0.83440
[1200]	train-auc:0.93848	valid-auc:0.83480
[1300]	train-auc:0.94055	valid-auc:0.83497
[1400]	train-auc:0.94246	valid-auc:0.83538
[1500]	train-auc:0.94451	valid-auc:0.83605
[1600]	train-auc:0.94645	valid-auc:0.83654
[1700]	train-auc:0.94836	valid-auc:0.83701
[1800]	train-auc:0.95026	valid-auc:0.83756
[1900]	train-auc:0.95211	valid-auc:0.83766
[2000]	train-auc:0.95385	valid-auc:0.83834
[2100]	train-auc:0.95555	valid-auc:0.83863
[2200]	train-auc:0.95719	valid-auc:0.83935
[2300]	train-auc:0.95868	valid-auc:0.83941
[2400]	train-auc:0.9