In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from scipy.stats import t as t_dist
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from scipy.stats import sem
from scipy import stats
from scipy.stats import chi2

from imblearn.over_sampling import SMOTE
from mlxtend.evaluate import paired_ttest_kfold_cv
from statsmodels.stats.contingency_tables import mcnemar
from mlxtend.evaluate import mcnemar_table

In [39]:
# Read the dataset and drop two columns in a single chained step
ageGroup = (
    pd.read_csv('ageGroup.csv', sep=',')
      .drop(columns=['communication_onlinerate', 'uid'])
)
# Overwrite every negative entry with 0
# NOTE(review): presumably negative values encode "missing/unknown" — confirm
ageGroup[ageGroup < 0] = 0
ageGroup.head()

Unnamed: 0,label,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,tags,...,emui_dev,list_time,device_price,up_life_duration,up_membership_grade,membership_life_duration,consume_purchase,communication_avgonline_30d,indu_name,pt_d
0,0,2521,1925,7,207,17,5,21,13,37,...,14,13,2,0,0,0,2,11,17,1
1,0,3456,3720,7,107,15,5,21,46,39,...,14,10,3,14,0,0,2,12,36,1
2,0,3854,3367,7,207,17,5,17,13,37,...,27,17,4,11,0,0,5,13,17,1
3,0,3401,1766,7,156,56,5,14,58,37,...,20,14,5,14,0,0,2,11,17,1
4,0,3219,6128,4,143,60,3,17,78,23,...,12,9,4,0,0,0,2,11,37,1


In [40]:
def test_significant(p):
    if p<=0.05:
        print("Statistically Significant\n")
    else:
        print("Not Statistically Significant\n")
        
def mcnemar_test(y_test, y_1, y_2):
    """McNemar's chi-square test (with continuity correction) for two classifiers.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_1, y_2 : array-like
        Predictions of the first and second model for the same samples,
        in the same order as ``y_test``.

    Returns
    -------
    (statistic, p_value) : tuple of float
        ``statistic = (|b - c| - 1)**2 / (b + c)`` where ``b`` counts samples
        only model 1 got right and ``c`` counts samples only model 2 got
        right. ``p_value`` is the chi-square survival function at
        ``statistic`` with 1 degree of freedom.
        When there are no discordant pairs (``b + c == 0``) the models are
        indistinguishable on this data and ``(0.0, 1.0)`` is returned.
    """
    # Convert up front so plain Python lists compare element-wise.
    y_test = np.asarray(y_test)
    first_ok = np.asarray(y_1) == y_test
    second_ok = np.asarray(y_2) == y_test

    b = int(np.count_nonzero(first_ok & ~second_ok))
    c = int(np.count_nonzero(~first_ok & second_ok))

    # Without discordant pairs the statistic would be a division by zero
    # (previously inf -> p = 0, i.e. a spurious "significant" result).
    if b + c == 0:
        return 0.0, 1.0

    c_ = (abs(b - c) - 1) ** 2 / (b + c)

    p_value = float(chi2.sf(c_, 1))
    return c_, p_value

### All Features

In [28]:
# Feature matrix for the "All Features" run
x = ageGroup[[
    'task_id', 'adv_id', 'creat_type_cd', 'adv_prim_id', 'dev_id',
    'inter_type_cd', 'slot_id', 'spread_app_id', 'tags', 'app_first_class',
    'app_second_class', 'age', 'city', 'city_rank', 'device_name',
    'device_size', 'career', 'gender', 'net_type', 'residence',
    'his_app_size', 'his_on_shelf_time', 'app_score', 'emui_dev',
    'list_time', 'device_price', 'up_life_duration', 'up_membership_grade',
    'membership_life_duration', 'consume_purchase',
    'communication_avgonline_30d', 'indu_name', 'pt_d',
]]

# Target vector: the `label` column passed through a LabelEncoder
y = LabelEncoder().fit_transform(ageGroup["label"])

In [29]:
# LightGBM model for the "All Features" run.
# NOTE(review): max_depth is negative; LightGBM presumably treats that as
# "no depth limit" — confirm this is intended.
lgbm = LGBMClassifier(
    application='binary',
    max_depth=-10,
    metrics='auc',
)
# XGBoost model for the same run.
# NOTE(review): 'mlogloss' looks like a multiclass metric while LightGBM is
# configured as binary — confirm the metric choice.
xgbc = XGBClassifier(
    max_depth=6,
    min_child_weight=2,
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [33]:
# McNemar's test compares two classifiers on ONE shared test set, so a single
# stratified split (1 of 5 folds held out) is drawn explicitly. The previous
# loop walked all five folds, ran SMOTE on each, and only the last fold's
# variables survived to be used for fitting.
# random_state is fixed so the split and the resampling are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(cv.split(x, y))
x_train, y_train = x.iloc[train_idx], y[train_idx]
x_test, y_test = x.iloc[test_idx], y[test_idx]

over = SMOTE(random_state=42)
x_train_oversample, y_train_oversample = over.fit_resample(x_train, y_train)
# NOTE(review): the resampled test set contains synthetic rows, so results on
# it do not describe performance on real data; kept for the comparison below.
x_test_oversample, y_test_oversample = over.fit_resample(x_test, y_test)

lgbm.fit(x_train_oversample, y_train_oversample)
xgbc.fit(x_train_oversample, y_train_oversample)

lgbm_y = lgbm.predict(x_test)
xgbc_y = xgbc.predict(x_test)

lgbm_y_oversample = lgbm.predict(x_test_oversample)
xgbc_y_oversample = xgbc.predict(x_test_oversample)

# Run the same McNemar procedure on both evaluation sets, in this order:
#   1) oversampled train + oversampled test
#   2) oversampled train + original test
for y_target, y_model1, y_model2 in (
    (y_test_oversample, lgbm_y_oversample, xgbc_y_oversample),
    (y_test, lgbm_y, xgbc_y),
):
    table = mcnemar_table(y_target=y_target,
                          y_model1=y_model1,
                          y_model2=y_model2)

    # exact=False selects the chi-square approximation instead of the exact test
    result = mcnemar(table, exact=False)
    chi_2 = result.statistic
    p = result.pvalue

    print(f"Chi-square statistic: {chi_2}, p-value: {p}")
    test_significant(p)

Chi-square statistic: 337.06371191135736, p-value: 2.781954040592237e-75
Statistically Significant

Chi-square statistic: 32.964019851116625, p-value: 9.388029752161137e-09
Statistically Significant



### Top 10 Features

In [41]:
# Feature matrix for the "Top 10 Features" run
x = ageGroup[[
    'task_id', 'adv_id', 'adv_prim_id', 'dev_id', 'slot_id', 'tags',
    'device_size', 'career', 'up_life_duration', 'indu_name',
]]

# Target vector: the `label` column passed through a LabelEncoder
y = LabelEncoder().fit_transform(ageGroup["label"])

In [42]:
# LightGBM model for the "Top 10 Features" run
lgbm = LGBMClassifier(
    application='binary',
    max_depth=19,
    metrics='auc',
    num_leaves=248,
)
# XGBoost model for the same run, library defaults apart from the two flags.
# NOTE(review): 'mlogloss' looks like a multiclass metric while LightGBM is
# configured as binary — confirm the metric choice.
xgbc = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [43]:
# McNemar's test compares two classifiers on ONE shared test set, so a single
# stratified split (1 of 5 folds held out) is drawn explicitly. The previous
# loop walked all five folds, ran SMOTE on each, and only the last fold's
# variables survived to be used for fitting.
# random_state is fixed so the split and the resampling are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(cv.split(x, y))
x_train, y_train = x.iloc[train_idx], y[train_idx]
x_test, y_test = x.iloc[test_idx], y[test_idx]

over = SMOTE(random_state=42)
x_train_oversample, y_train_oversample = over.fit_resample(x_train, y_train)
# NOTE(review): the resampled test set contains synthetic rows, so results on
# it do not describe performance on real data; kept for the comparison below.
x_test_oversample, y_test_oversample = over.fit_resample(x_test, y_test)

lgbm.fit(x_train_oversample, y_train_oversample)
xgbc.fit(x_train_oversample, y_train_oversample)

lgbm_y = lgbm.predict(x_test)
xgbc_y = xgbc.predict(x_test)

lgbm_y_oversample = lgbm.predict(x_test_oversample)
xgbc_y_oversample = xgbc.predict(x_test_oversample)

# Run the same McNemar procedure on both evaluation sets, in this order:
#   1) oversampled train + oversampled test
#   2) oversampled train + original test
for y_target, y_model1, y_model2 in (
    (y_test_oversample, lgbm_y_oversample, xgbc_y_oversample),
    (y_test, lgbm_y, xgbc_y),
):
    table = mcnemar_table(y_target=y_target,
                          y_model1=y_model1,
                          y_model2=y_model2)

    # exact=False selects the chi-square approximation instead of the exact test
    result = mcnemar(table, exact=False)
    chi_2 = result.statistic
    p = result.pvalue

    print(f"Chi-square statistic: {chi_2}, p-value: {p}")
    test_significant(p)

Chi-square statistic: 9.192307692307692, p-value: 0.002430342921624515
Statistically Significant

Chi-square statistic: 222.94010840108402, p-value: 2.0658075721264871e-50
Statistically Significant



### Top 10 Features + Age Column

In [17]:
# Feature matrix for the "Top 10 Features + Age Column" run
x = ageGroup[[
    'task_id', 'adv_id', 'adv_prim_id', 'dev_id', 'slot_id', 'tags',
    'device_size', 'career', 'up_life_duration', 'indu_name', 'age',
]]

# Target vector: the `label` column passed through a LabelEncoder
y = LabelEncoder().fit_transform(ageGroup["label"])

In [18]:
# LightGBM model for the "Top 10 Features + Age Column" run
lgbm = LGBMClassifier(
    application='binary',
    max_depth=16,
    metrics='auc',
    num_leaves=242,
)
# XGBoost model for the same run, library defaults apart from the two flags.
# NOTE(review): 'mlogloss' looks like a multiclass metric while LightGBM is
# configured as binary — confirm the metric choice.
xgbc = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [19]:
# McNemar's test compares two classifiers on ONE shared test set, so a single
# stratified split (1 of 5 folds held out) is drawn explicitly. The previous
# loop walked all five folds, ran SMOTE on each, and only the last fold's
# variables survived to be used for fitting.
# random_state is fixed so the split and the resampling are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(cv.split(x, y))
x_train, y_train = x.iloc[train_idx], y[train_idx]
x_test, y_test = x.iloc[test_idx], y[test_idx]

over = SMOTE(random_state=42)
x_train_oversample, y_train_oversample = over.fit_resample(x_train, y_train)
# NOTE(review): the resampled test set contains synthetic rows, so results on
# it do not describe performance on real data; kept for the comparison below.
x_test_oversample, y_test_oversample = over.fit_resample(x_test, y_test)

lgbm.fit(x_train_oversample, y_train_oversample)
xgbc.fit(x_train_oversample, y_train_oversample)

lgbm_y = lgbm.predict(x_test)
xgbc_y = xgbc.predict(x_test)

lgbm_y_oversample = lgbm.predict(x_test_oversample)
xgbc_y_oversample = xgbc.predict(x_test_oversample)

# Run the same McNemar procedure on both evaluation sets, in this order:
#   1) oversampled train + oversampled test
#   2) oversampled train + original test
for y_target, y_model1, y_model2 in (
    (y_test_oversample, lgbm_y_oversample, xgbc_y_oversample),
    (y_test, lgbm_y, xgbc_y),
):
    table = mcnemar_table(y_target=y_target,
                          y_model1=y_model1,
                          y_model2=y_model2)

    # exact=False selects the chi-square approximation instead of the exact test
    result = mcnemar(table, exact=False)
    chi_2 = result.statistic
    p = result.pvalue

    print(f"Chi-square statistic: {chi_2}, p-value: {p}")
    test_significant(p)

Chi-square statistic: 8.596885245901639, p-value: 0.003367384376418539
Statistically Significant

Chi-square statistic: 209.4277038750723, p-value: 1.8311561960538942e-47
Statistically Significant



### Top 10 Features + Age & Gender Column

In [20]:
# Feature matrix for the "Top 10 Features + Age & Gender Column" run
x = ageGroup[[
    'task_id', 'adv_id', 'adv_prim_id', 'dev_id', 'slot_id', 'tags',
    'device_size', 'career', 'up_life_duration', 'indu_name', 'age', 'gender',
]]

# Target vector: the `label` column passed through a LabelEncoder
y = LabelEncoder().fit_transform(ageGroup["label"])

In [21]:
# LightGBM model for the "Top 10 Features + Age & Gender Column" run
lgbm = LGBMClassifier(
    application='binary',
    max_depth=19,
    metrics='auc',
    num_leaves=234,
)
# XGBoost model for the same run, library defaults apart from the two flags.
# NOTE(review): 'mlogloss' looks like a multiclass metric while LightGBM is
# configured as binary — confirm the metric choice.
xgbc = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [22]:
# McNemar's test compares two classifiers on ONE shared test set, so a single
# stratified split (1 of 5 folds held out) is drawn explicitly. The previous
# loop walked all five folds, ran SMOTE on each, and only the last fold's
# variables survived to be used for fitting.
# random_state is fixed so the split and the resampling are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(cv.split(x, y))
x_train, y_train = x.iloc[train_idx], y[train_idx]
x_test, y_test = x.iloc[test_idx], y[test_idx]

over = SMOTE(random_state=42)
x_train_oversample, y_train_oversample = over.fit_resample(x_train, y_train)
# NOTE(review): the resampled test set contains synthetic rows, so results on
# it do not describe performance on real data; kept for the comparison below.
x_test_oversample, y_test_oversample = over.fit_resample(x_test, y_test)

lgbm.fit(x_train_oversample, y_train_oversample)
xgbc.fit(x_train_oversample, y_train_oversample)

lgbm_y = lgbm.predict(x_test)
xgbc_y = xgbc.predict(x_test)

lgbm_y_oversample = lgbm.predict(x_test_oversample)
xgbc_y_oversample = xgbc.predict(x_test_oversample)

# Run the same McNemar procedure on both evaluation sets, in this order:
#   1) oversampled train + oversampled test
#   2) oversampled train + original test
for y_target, y_model1, y_model2 in (
    (y_test_oversample, lgbm_y_oversample, xgbc_y_oversample),
    (y_test, lgbm_y, xgbc_y),
):
    table = mcnemar_table(y_target=y_target,
                          y_model1=y_model1,
                          y_model2=y_model2)

    # exact=False selects the chi-square approximation instead of the exact test
    result = mcnemar(table, exact=False)
    chi_2 = result.statistic
    p = result.pvalue

    print(f"Chi-square statistic: {chi_2}, p-value: {p}")
    test_significant(p)

Chi-square statistic: 35.563322368421055, p-value: 2.468927055439793e-09
Statistically Significant

Chi-square statistic: 250.93388674770233, p-value: 1.6249703329652802e-56
Statistically Significant



### Top 10 Features (without ID) + Age & Gender Column

In [23]:
# Feature matrix for the "Top 10 Features (without ID) + Age & Gender Column" run
x = ageGroup[[
    'tags', 'device_size', 'career', 'up_life_duration',
    'indu_name', 'age', 'gender',
]]

# Target vector: the `label` column passed through a LabelEncoder
y = LabelEncoder().fit_transform(ageGroup["label"])

In [24]:
# LightGBM model for the "Top 10 Features (without ID) + Age & Gender Column" run
lgbm = LGBMClassifier(
    application='binary',
    max_depth=16,
    metrics='auc',
    num_leaves=242,
)
# XGBoost model for the same run, library defaults apart from the two flags.
# NOTE(review): 'mlogloss' looks like a multiclass metric while LightGBM is
# configured as binary — confirm the metric choice.
xgbc = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [25]:
# McNemar's test compares two classifiers on ONE shared test set, so a single
# stratified split (1 of 5 folds held out) is drawn explicitly. The previous
# loop walked all five folds, ran SMOTE on each, and only the last fold's
# variables survived to be used for fitting.
# random_state is fixed so the split and the resampling are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(cv.split(x, y))
x_train, y_train = x.iloc[train_idx], y[train_idx]
x_test, y_test = x.iloc[test_idx], y[test_idx]

over = SMOTE(random_state=42)
x_train_oversample, y_train_oversample = over.fit_resample(x_train, y_train)
# NOTE(review): the resampled test set contains synthetic rows, so results on
# it do not describe performance on real data; kept for the comparison below.
x_test_oversample, y_test_oversample = over.fit_resample(x_test, y_test)

lgbm.fit(x_train_oversample, y_train_oversample)
xgbc.fit(x_train_oversample, y_train_oversample)

lgbm_y = lgbm.predict(x_test)
xgbc_y = xgbc.predict(x_test)

lgbm_y_oversample = lgbm.predict(x_test_oversample)
xgbc_y_oversample = xgbc.predict(x_test_oversample)

# Run the same McNemar procedure on both evaluation sets, in this order:
#   1) oversampled train + oversampled test
#   2) oversampled train + original test
for y_target, y_model1, y_model2 in (
    (y_test_oversample, lgbm_y_oversample, xgbc_y_oversample),
    (y_test, lgbm_y, xgbc_y),
):
    table = mcnemar_table(y_target=y_target,
                          y_model1=y_model1,
                          y_model2=y_model2)

    # exact=False selects the chi-square approximation instead of the exact test
    result = mcnemar(table, exact=False)
    chi_2 = result.statistic
    p = result.pvalue

    print(f"Chi-square statistic: {chi_2}, p-value: {p}")
    test_significant(p)

Chi-square statistic: 158.2897050365623, p-value: 2.6750960403078604e-36
Statistically Significant

Chi-square statistic: 87.6566091954023, p-value: 7.786393218344766e-21
Statistically Significant

