In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import optuna
from catboost import CatBoostClassifier as CBC
import lightgbm as LGB
from sklearn.ensemble import HistGradientBoostingClassifier as HGBC
from sklearn.preprocessing import OrdinalEncoder as OE
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
train.shape, test.shape

((140700, 20), (93800, 19))

In [3]:
trc = list(train.columns)
tec = list(test.columns)
NUM_COLS, CAT_COLS = [],[]
for i in trc:
    if i not in tec:
        TARGET = i
    elif train[i].dtype in [int,float]:
        NUM_COLS.append(i)
    else:
        CAT_COLS.append(i)

print((len(CAT_COLS) + len(NUM_COLS) + 1) == len(trc))
NUM_COLS.remove('id')

True


In [4]:
nns = train.Name.value_counts()
nns = nns.loc[nns<2]
nns.index
train.loc[train.index[train.Name.isin(nns.index)],'Name'] ='rare'
train.Name.nunique()

245

In [5]:
oe = OE( handle_unknown = 'use_encoded_value', unknown_value= -1, encoded_missing_value= -2)
oe.fit(train[CAT_COLS])
train[CAT_COLS] = oe.transform(train[CAT_COLS])
test[CAT_COLS] = oe.transform(test[CAT_COLS])

In [6]:
for i in CAT_COLS:
    train[i] = train[i].astype(np.int64) + 2   
    test[i] = test[i].astype(np.int64) + 2
    print(f"{i}: ",train[i].isna().sum(),test[i].isna().sum())

Name:  0 0
Gender:  0 0
City:  0 0
Working Professional or Student:  0 0
Profession:  0 0
Sleep Duration:  0 0
Dietary Habits:  0 0
Degree:  0 0
Have you ever had suicidal thoughts ?:  0 0
Family History of Mental Illness:  0 0


In [7]:
BOTH = CAT_COLS + NUM_COLS

sts = train[TARGET].value_counts()
sts[0] / sts.sum(), sts[1]/sts.sum()
wts = {0:0.8182871357498224, 1:0.18171286425017769}
wts

{0: 0.8182871357498224, 1: 0.18171286425017769}

In [8]:
#make HGBC model and preds 
hg_params = {
    'loss': 'log_loss',
    'max_iter': 1512, 
    'learning_rate': 0.08406660355512156,
    'max_leaf_nodes': 69,
    'max_depth': 9,
    'min_samples_leaf':20,
    'l2_regularization': 0,
    'categorical_features': CAT_COLS,
    'early_stopping': 'auto',
    "validation_fraction": 0.1, 
    'verbose':0,
    'class_weight': wts
}
#'learning_rate': 0.14378400577638836, 'max_leaf_nodes': 56, 'l2_regularization': 6}
#Best score: 0.9243529411764705

# Best parameters: {'learning_rate': 0.08406660355512156, 'max_leaf_nodes': 69, 'max_depth': 9, 'l2_regularization': 0}
# Best score: 0.9243529411764705
    
FOLDS = 7
scores = np.zeros(FOLDS)
hgoof = np.zeros(len(train['id']))
hgpred = np.zeros(len(test['id']))

kf = KFold(n_splits = FOLDS)
for i, (trndex,valdex) in enumerate(kf.split(train.id,train[TARGET])):
    xtrain,ytrain = train.loc[trndex,BOTH], train.loc[trndex,TARGET]
    xval, yval    = train.loc[valdex,BOTH], train.loc[valdex,TARGET]

    #make the model
    model = HGBC(**hg_params) 
    model = model.fit(xtrain,ytrain)

    #oof predictions
    oofpred = np.array(model.predict_proba(xval))
    hgoof[valdex] = oofpred[:,1]

    #test set predictions
    testpred = model.predict_proba(test[BOTH])
    hgpred += testpred[:,1]
    if i % 20 == 0:
        print("HGBC#"+str(i))

#average folds on test set predictions
hgpred /= FOLDS

# use oofpred as whole number set of probas (hgoof)
oofpreds = np.array(hgoof)
oofpreds[oofpreds>=.5]=1
oofpreds[oofpreds<.5]=0

# get oof score
ascore = accuracy_score(train[TARGET],oofpreds)
 
print(f"Mean Score: {ascore}") 
 

HGBC#0
Mean Score: 0.9243354655294954


In [9]:
# save oof probas
hgoo = train[['id',TARGET]]
hgoo[TARGET] = np.array(hgoof)
hgoo.to_csv('hgoof.csv', index=False)

# save test probas
hgsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
hgsub[TARGET] = hgpred
hgsub.to_csv('hgsubprobs.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgoo[TARGET] = np.array(hgoof)


In [10]:
# what is the optimal threshold value for these predictions?
top = 0
scored = 0
for i in range(100):
    tester = hgoof.copy() 
    tester[tester >= i/100]=1
    tester[tester < i/100]=0
    score = accuracy_score(train[TARGET],tester)
    if score > scored:
        scored = score
        top = i

# print the optimal threshold value and score
print(top, scored)

# apply optimal threshold and save submission
hgsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
tested = hgpred.copy()
tested[tested>= top/100]=1
tested[tested < top/100]=0
hgsub[TARGET] = tested
hgsub.to_csv('submission.csv',index=False)
hgsub.to_csv('hgsubmission.csv',index=False)

23 0.937093105899076


In [11]:
!head /kaggle/working/hgoof.csv

id,Depression
0,0.0012880711116379962
1,0.4544447436692491
2,0.13372074413409876
3,0.6964394578584504
4,0.1366557038623319
5,0.0008165059374109035
6,0.001440165887448981
7,0.002417239908508628
8,0.011621558010650724


In [12]:
!head /kaggle/working/submission.csv

id,Depression
140700,0.0
140701,0.0
140702,0.0
140703,1.0
140704,0.0
140705,0.0
140706,0.0
140707,0.0
140708,0.0


In [13]:
train.shape,test.shape

((140700, 20), (93800, 19))

In [14]:
# now do the same for catboost
cb_params = {
    'objective': 'CrossEntropy',
    'iterations': 1512, 

    # # From 2 best catboost optuna explorations elsewhere
    # 'learning_rate': 0.10138991939014416,
    # 'depth': 9,
    # 'reg_lambda': 11,
    # # Best score in test: 0.9393529411764705
    # # Best score on LB: 0.9401847903340441
     
    'learning_rate': 0.055905048327263715,
    'depth': 5,
    'reg_lambda': 12,
    # # ??? 
    
    'min_data_in_leaf':20,
    'use_best_model':True,
    #'task_type':"GPU",
    
}
 
kf = KFold(n_splits = FOLDS)
cboof= np.zeros(len(train['id']))
cbpreds = np.zeros(len(test['id']))

for i, (trndex,valdex) in enumerate(kf.split(train.id,train[TARGET])):
    xtrain,ytrain = train.loc[trndex,BOTH], train.loc[trndex,TARGET]
    xval, yval    = train.loc[valdex,BOTH], train.loc[valdex,TARGET]
    model = CBC(cat_features = CAT_COLS, **cb_params)
    model.fit(xtrain,ytrain,
             eval_set=(xval,yval),
             verbose=False,
             early_stopping_rounds = 25,
             use_best_model=True)
    
    ypred =  model.predict_proba(xval) 
    cboof[valdex] = ypred[:,1]
    tpred = model.predict_proba(test[BOTH])
    cbpreds += tpred[:,1]
    if i % 20 == 0:
        print("CBC#"+str(i))
        
# average of preds by folds
cbpreds /= FOLDS

# use oofpred as whole number set of probas (hgoof)
oofpreds = np.array(cboof)
oofpreds[oofpreds>=.5]=1
oofpreds[oofpreds<.5]=0

# get oof score
ascore = accuracy_score(train[TARGET],oofpreds)
 
print(f"Mean Score: {ascore}") 



CBC#0
Mean Score: 0.9402416488983654


In [15]:
# save oof probas
cboo = train[['id',TARGET]].copy()
cboo[TARGET] = np.array(cboof)
cboo.to_csv('cboof.csv', index=False)

# save test probas
cbsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
cbsub[TARGET] = cbpreds
cbsub.to_csv('cbsubprobs.csv',index=False)

In [16]:
top = 0
scored = 0
for i in range(100):
    tester = cboof.copy() 
    tester[tester >= i/100]=1
    tester[tester < i/100]=0
    score = accuracy_score(train[TARGET],tester)
    if score > scored:
        scored = score
        top = i

print(top, scored)

# save submission
cbsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
tested = cbpreds.copy()
tested[tested>= top/100]=1
tested[tested < top/100]=0
cbsub[TARGET] = tested
cbsub.to_csv('submission.csv',index=False)
cbsub.to_csv('cbsubmission.csv',index=False)

48 0.9403056147832267


In [17]:
# # Next up is LGBM
# # Optuna Test 0.9370980392156864
# # Best parameters: {'learning_rate': 0.039237220350979124, 'num_leaves': 55, 'reg_lambda': 21}

lgb_params = {
    'objective': 'cross_entropy',
    'num_iterations': 1512, 
    'learning_rate': 0.039237220350979124,
    'num_leaves': 55,
    'max_depth': 7,
    #'subsample':trial.suggest_float('subsample',0,1),
    'reg_lambda': 3,
    #'bagging_temperature':trial.suggest_float('bagging_temperature',1,2),
    'min_data_in_leaf':20,
    'use_best_model':True,
    'num_threads': 4,
    'early_stopping_round': 25,
    "bagging_freq": 5, 
    "bagging_fraction": 0.75,
    'verbosity':-1
    
}
FOLDS = 3
lgoof= np.zeros(len(train['id']))
lgpreds = np.zeros(len(test['id']))
scores = np.zeros(FOLDS)
kf = KFold(n_splits = FOLDS)
for i, (trndex,valdex) in enumerate(kf.split(train.id,train[TARGET])):
    xtrain,ytrain = train.loc[trndex,BOTH], train.loc[trndex,TARGET]
    xval, yval    = train.loc[valdex,BOTH], train.loc[valdex,TARGET]
    
    train_data = LGB.Dataset(xtrain, label=ytrain, categorical_feature=CAT_COLS)
    val_data = LGB.Dataset(xval, label=yval, categorical_feature=CAT_COLS)
 
    model = LGB.train(lgb_params, train_data, valid_sets=[val_data])
    ypred = np.array(model.predict(xval))
    lgoof[valdex] = ypred
    lgpreds += np.array(model.predict(test[BOTH]))
    if i % 20 == 0:
        print("LGC#"+str(i))  

lgpreds /= FOLDS
lgpreds[:10]





LGC#0




array([0.00178501, 0.00120652, 0.02244175, 0.95882248, 0.01769344,
       0.02498952, 0.0079884 , 0.09230652, 0.05689888, 0.92106107])

In [18]:
top = 0
scored = 0
for i in range(100):
    tester = lgoof.copy() 
    tester[tester >= i/100]=1
    tester[tester < i/100]=0
    score = accuracy_score(train[TARGET],tester)
    if score > scored:
        scored = score
        top = i

print(top, scored)

# save probabilities
lgsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
lgsub[TARGET] = lgpreds.copy()
lgsub.to_csv('lgsubprobs.csv',index=False)

#save oof
lgoofdf = train[['id',TARGET]].copy()
lgoofdf[TARGET] = lgoof
lgoofdf.to_csv('lgoof.csv')

# save submission
lgsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
tested = lgpreds.copy()
tested[tested>= top/100]=1
tested[tested < top/100]=0
lgsub[TARGET] = tested
lgsub.to_csv('submission.csv',index=False)
lgsub.to_csv('lgsubmission.csv',index=False)

51 0.938542999289268


In [19]:
# hillclimb with them:
sco = 0
for i in range(100):
    predout = lgoof * i/100 + cboof * (100-i)/100
    predout[predout>=.5] = 1
    predout[predout<.5] = 0
    score = accuracy_score(train[TARGET],predout)
    if score > sco:
        sco = score
        ii = i
        
# report the results to stdout
print(ii,sco)
newoof = lgoof * ii/100 + (100-ii)/100 * cboof
newpreds = lgpreds * ii/100 + cbpreds * (100-ii)/100

# use the best i to make interim submission
finsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
finsub[TARGET] = newpreds 
finsub[TARGET] = finsub[TARGET].apply(lambda x: 1 if x >=.5 else 0)
finsub.to_csv('submissiona.csv', index=False)

sco = 0
for i in range(100):
    predout = newoof * i/100 + hgoof * (100-i)/100
    predout[predout>=.5] = 1
    predout[predout<.5] = 0
    score = accuracy_score(train[TARGET],predout)
    if score > sco:
        sco = score
        ii = i
        
# report the results to stdout
print(ii,sco)

# use the best i and j to make final submission
finsub = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
finsub[TARGET] = newpreds * ii/100 + hgsub[TARGET] * (100-ii)/100
finsub[TARGET] = finsub[TARGET].apply(lambda x: 1 if x >=.5 else 0)
finsub.to_csv('submission.csv', index=False)

15 0.9403482587064677
98 0.9401776830135039


In [20]:
!head submission.csv


id,Depression
140700,0
140701,0
140702,0
140703,1
140704,0
140705,0
140706,0
140707,0
140708,0
