In [2]:
import pandas as pd
from sklearn import metrics
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler, QuantileTransformer, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np
import xgboost
import seaborn as sns
from mdesc.eval import ErrorViz, SensitivityViz
import viztracer

## EDA

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train.shape

(10738, 12)

In [5]:
df_train.isnull().sum()

customer_id                          0
customer_visit_score                 0
customer_product_search_score       42
customer_ctr_score                   0
customer_stay_score                 37
customer_frequency_score             0
customer_product_variation_score    46
customer_order_score                66
customer_affinity_score              0
customer_active_segment             23
X1                                  37
customer_category                    0
dtype: int64

In [6]:
df_train.dtypes

customer_id                          object
customer_visit_score                float64
customer_product_search_score       float64
customer_ctr_score                  float64
customer_stay_score                 float64
customer_frequency_score            float64
customer_product_variation_score    float64
customer_order_score                float64
customer_affinity_score             float64
customer_active_segment              object
X1                                   object
customer_category                     int64
dtype: object

In [7]:
target = df_train['customer_category']
df_train.drop(['customer_category'], axis=1, inplace=True)

In [8]:
def preprocess_data(df):
    df.drop(['customer_id'], inplace=True, axis=1)
    df = pd.get_dummies(df, columns=['customer_active_segment', 'X1'])
    df.fillna(method='ffill', inplace=True)
    
    return df

In [9]:
df_train = preprocess_data(df_train)
df_test = preprocess_data(df_test)

### Analysis 
To reduce class imbalance there are two ways:
- 1. Upsampling/downsampling
- 2. Different metrics

We can use standard scaler as there is not much of variation in the data

In [13]:
# target = df_train['ACTION']
# df_train = df_train.iloc[:,3:]

In [14]:
cols = df_train.columns

In [15]:
ss = RobustScaler((10, 90))
df_tr = ss.fit_transform(df_train) 
df_te = ss.transform(df_test[cols])

In [16]:
df_te = pd.DataFrame(df_te, columns=cols)
df = pd.DataFrame(df_tr, columns=cols)

In [17]:
# df

In [18]:
tr_x, va_x, tr_y, va_y = train_test_split(df, 
                                          target, 
                                          test_size=.3,
                                          random_state=4311, 
                                          stratify=target)

## Modeling

### Xgboost

In [19]:
param = {'max_depth': 10, 'eta': 1, 'objective': 'binary:logistic', 
         'subsample':0.6, 'sampling_method':'gradient_based'}
param['eval_metric'] = ['auc']
num_round = 10
bst = xgboost.XGBRFClassifier(n_estimators=100,
                              max_depth=40, 
                              eta=1, 
                              objective= 'binary:logistic', 
                              subsample=0.2, 
                              sampling_method='uniform')
clf_model = bst.fit(tr_x, tr_y)

In [20]:
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 0.7,
    'colsample_bytree': 1,
    'objective':'binary:logistic',
    'eval_metric':'auc'
}

In [23]:
# num_boost_round = 800
# xgboost.train(tr_x, tr_y,
#     params,
#     dtrain,
#     num_boost_round=num_boost_round,
#     evals=[(dtest, "Test")],
#     early_stopping_rounds=10)

In [24]:
va_preds = clf_model.predict(va_x)
fpr, tpr, thresholds = metrics.roc_curve(va_preds, va_y)
auc = metrics.auc(fpr,tpr)
print(auc)

0.951710582235444


In [25]:
we_prec_score = metrics.precision_score(va_preds, va_y, average='macro')

In [26]:
we_prec_score

0.9243641547425359

In [27]:
predictions = clf_model.predict(df_te)
predict_df = pd.DataFrame(predictions, columns=['customer_category'])

In [80]:
submission = pd.read_csv('test.csv')
submission = pd.concat([submission, predict_df], axis=1)
submission = submission[['customer_id',  'customer_category']]
submission.to_csv('submit.csv', index=None)

### Random forest

In [28]:
RFclassifier = ensemble.RandomForestClassifier(n_estimators=50, max_depth=6, max_features=15)
rfmodel = RFclassifier.fit(tr_x, tr_y)

In [29]:
va_preds_rf = rfmodel.predict(va_x)
fpr, tpr, thresholds = metrics.roc_curve(va_preds_rf, va_y)
auc = metrics.auc(fpr,tpr)

In [30]:
we_score = metrics.precision_score(va_preds_rf, va_y, average='macro')
print(we_score)

0.9236082817546054


In [31]:
tr_x.columns

Index(['customer_visit_score', 'customer_product_search_score',
       'customer_ctr_score', 'customer_stay_score', 'customer_frequency_score',
       'customer_product_variation_score', 'customer_order_score',
       'customer_affinity_score', 'customer_active_segment_A',
       'customer_active_segment_AA', 'customer_active_segment_B',
       'customer_active_segment_C', 'customer_active_segment_D', 'X1_A',
       'X1_AA', 'X1_BA', 'X1_E', 'X1_F'],
      dtype='object')

## HPO

In [32]:
params = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 0.7,
    'colsample_bytree': 1,
    'objective':'binary:logistic',
    'eval_metric':'auc'
}

In [33]:
dtrain = xgboost.DMatrix(tr_x, label=tr_y)
dtest = xgboost.DMatrix(va_x, label=va_y)

In [34]:
num_boost_round = 800
tr = xgboost.train(params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtest, "Test")],
            early_stopping_rounds=30)

[0]	Test-auc:0.96096
Will train until Test-auc hasn't improved in 30 rounds.
[1]	Test-auc:0.96104
[2]	Test-auc:0.96350
[3]	Test-auc:0.96534
[4]	Test-auc:0.96591
[5]	Test-auc:0.96739
[6]	Test-auc:0.96755
[7]	Test-auc:0.97074
[8]	Test-auc:0.97042
[9]	Test-auc:0.96996
[10]	Test-auc:0.96961
[11]	Test-auc:0.96850
[12]	Test-auc:0.96801
[13]	Test-auc:0.96723
[14]	Test-auc:0.96765
[15]	Test-auc:0.96693
[16]	Test-auc:0.96734
[17]	Test-auc:0.96748
[18]	Test-auc:0.96880
[19]	Test-auc:0.96890
[20]	Test-auc:0.96879
[21]	Test-auc:0.96935
[22]	Test-auc:0.96866
[23]	Test-auc:0.96922
[24]	Test-auc:0.96942
[25]	Test-auc:0.96963
[26]	Test-auc:0.97021
[27]	Test-auc:0.97086
[28]	Test-auc:0.97059
[29]	Test-auc:0.97047
[30]	Test-auc:0.97050
[31]	Test-auc:0.97089
[32]	Test-auc:0.97031
[33]	Test-auc:0.96950
[34]	Test-auc:0.96985
[35]	Test-auc:0.97053
[36]	Test-auc:0.97017
[37]	Test-auc:0.97027
[38]	Test-auc:0.97012
[39]	Test-auc:0.97001
[40]	Test-auc:0.96957
[41]	Test-auc:0.96939
[42]	Test-auc:0.96907
[43]	Tes

In [151]:
df_tst = xgboost.DMatrix(df_te, label=None)
values = tr.predict(df_tst)
vals = []
for x in values:
    if x < 0.5:
        vals.append(0)
    else:
        vals.append(1)

In [153]:
predict_df = pd.DataFrame(vals, columns=['customer_category'])
submission = pd.read_csv('test.csv')
submission = pd.concat([submission, predict_df], axis=1)
submission = submission[['customer_id',  'customer_category']]
submission.to_csv('submit.csv', index=None)