In [1]:
# import the libraries
import tensorflow
import keras
from sklearn.metrics import accuracy_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder as OE
from catboost import CatBoostClassifier, Pool

In [2]:
# load all the data
train = pd.read_csv('//kaggle/input/playground-series-s4e11/train.csv')
cbo = pd.read_csv('/kaggle/input/s4e11-inference-a/cboof.csv')
hgo = pd.read_csv('/kaggle/input/s4e11-inference-a/hgoof.csv')
lgo = pd.read_csv('/kaggle/input/s4e11-inference-a/lgoof.csv')

test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
cbt = pd.read_csv('/kaggle/input/s4e11-inference-a/cbsubprobs.csv')
hgt = pd.read_csv('/kaggle/input/s4e11-inference-a/hgsubprobs.csv')
lgt = pd.read_csv('/kaggle/input/s4e11-inference-a/lgsubprobs.csv')


In [3]:
# set the target
TARGET = 'Depression'

# make a dataframe of the baselines and predictions
tt = train[['id',TARGET]].copy()
tt['cb']= cbo[TARGET]
tt['hg']= hgo[TARGET]
tt['lg']= lgo[TARGET]


te = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
te['cb']= cbt[TARGET]
te['hg']= hgt[TARGET]
te['lg']= lgt[TARGET]

In [4]:
tt.head()

Unnamed: 0,id,Depression,cb,hg,lg
0,0,0,0.00064,0.000693,0.001523
1,1,1,0.654026,0.662836,0.778344
2,2,1,0.484512,0.126605,0.344253
3,3,1,0.881125,0.660651,0.891852
4,4,0,0.472058,0.082273,0.293054


In [5]:
te.head()

Unnamed: 0,id,Depression,cb,hg,lg
0,140700,0,0.000324,0.00065,0.001785
1,140701,0,0.000216,0.000566,0.001207
2,140702,0,0.032181,0.005036,0.022442
3,140703,0,0.977608,0.884274,0.958822
4,140704,0,0.016718,0.004565,0.017693


In [6]:
# Prepare Cat, Numerical, and Both Columns
trc = list(train.columns)
tec = list(test.columns)
NUM_COLS, CAT_COLS = [],[]
for i in trc:
    if i not in tec:
        TARGET = i
    elif train[i].dtype in [int,float]:
        NUM_COLS.append(i)
    else:
        CAT_COLS.append(i)

# test that they add up
print((len(CAT_COLS) + len(NUM_COLS) + 1) == len(trc))

# make BOTH for CV ease
BOTH = NUM_COLS + CAT_COLS

# remove the ids
NUM_COLS.remove('id')

# filter out rare names
nns = train.Name.value_counts()
nns = nns.loc[nns<2] 
train.loc[train.index[train.Name.isin(nns.index)],'Name'] ='rare'
train.Name.nunique()

# ordinal encode the text
oe = OE( handle_unknown = 'use_encoded_value', unknown_value= -1, encoded_missing_value= -2)
oe.fit(train[CAT_COLS])
train[CAT_COLS] = oe.transform(train[CAT_COLS])
test[CAT_COLS] = oe.transform(test[CAT_COLS])

# Move up the numbers to 0 and make them numpy integers
for i in CAT_COLS:
    train[i] = train[i].astype(np.int64) + 2   
    test[i] = test[i].astype(np.int64) + 2
    print(f"{i}: ",train[i].isna().sum(),test[i].isna().sum())

True
Name:  0 0
Gender:  0 0
City:  0 0
Working Professional or Student:  0 0
Profession:  0 0
Sleep Duration:  0 0
Dietary Habits:  0 0
Degree:  0 0
Have you ever had suicidal thoughts ?:  0 0
Family History of Mental Illness:  0 0


In [7]:
# set up the Cross Validation and Some Variables
FOLDS = 21
kf = KFold(n_splits = FOLDS)
bigpreds, bigoof = [] , []

# cycle through the baselines
for i in [2,3,4]:
    # interim variables for each baseline
    oof = np.zeros(len(train['id']))
    preds = np.zeros(len(test['id']))

    # cycle through the CV for the baseline
    for j, (trndex,valdex) in enumerate(kf.split(train['id'])):
        xtrain, ytrain = train.loc[trndex, BOTH], train.loc[trndex,TARGET]
        xval,   yval   = train.loc[valdex, BOTH], train.loc[valdex,TARGET]

        # make the catboost pools
        trainpool = Pool(xtrain,ytrain,baseline=tt.iloc[trndex,i],feature_names=BOTH,cat_features=CAT_COLS)
        valpool = Pool(xval,yval,baseline=tt.iloc[valdex,i],feature_names=BOTH,cat_features=CAT_COLS)
        testpool = Pool(test[BOTH],pd.Series(np.zeros(len(test['id']))),baseline=te.iloc[:,i],feature_names=BOTH,cat_features=CAT_COLS)

        
        # build the model
        mod = CatBoostClassifier(objective = 'CrossEntropy',
                        iterations= 1512, 
                        learning_rate= 0.10138991939014416,
                        depth= 9,
                        reg_lambda= 11,    
                        min_data_in_leaf=20,
                        use_best_model=True,
                        cat_features= CAT_COLS,
                        task_type="GPU",
                        )
        # fit
        mod.fit(trainpool, 
                eval_set=valpool,
                early_stopping_rounds = 25,
                use_best_model=True,
                verbose= 0)

        # make predictions and keep score
        ypred = mod.predict_proba(valpool)
        oof[valdex] = ypred[:,1]
        tepred = mod.predict_proba(testpool)
        preds += tepred[:,1]

    # post process and record the results
    preds /= FOLDS
    bigpreds.append(preds)
    bigoof.append(oof)
    print(i, ' Completed')

2  Completed
3  Completed
4  Completed


In [8]:
# make a list of the columns we want...
titles = ['cb','hg','lg']

# write out the csvs...
for i in range(len(bigoof)):
    tt[titles[i]+'oof'] = bigoof[i]
    tt[['id',titles[i]+'oof']].to_csv(titles[i]+'oof.csv',index=False)

    te[titles[i]+'preds'] = bigpreds[i]
    te[['id',titles[i]+'preds']].to_csv(titles[i]+'preds.csv',index=False)

In [9]:
tt.head()

Unnamed: 0,id,Depression,cb,hg,lg,cboof,hgoof,lgoof
0,0,0,0.00064,0.000693,0.001523,0.00079,0.000565,0.000395
1,1,1,0.654026,0.662836,0.778344,0.681626,0.682415,0.690321
2,2,1,0.484512,0.126605,0.344253,0.572483,0.489644,0.482529
3,3,1,0.881125,0.660651,0.891852,0.88752,0.847041,0.914803
4,4,0,0.472058,0.082273,0.293054,0.474997,0.409113,0.448942


In [10]:
te.head()

Unnamed: 0,id,Depression,cb,hg,lg,cbpreds,hgpreds,lgpreds
0,140700,0,0.000324,0.00065,0.001785,0.000653,0.000741,0.000738
1,140701,0,0.000216,0.000566,0.001207,0.000475,0.000466,0.000492
2,140702,0,0.032181,0.005036,0.022442,0.044406,0.044758,0.042599
3,140703,0,0.977608,0.884274,0.958822,0.979662,0.978684,0.979847
4,140704,0,0.016718,0.004565,0.017693,0.021465,0.020006,0.02099


In [11]:
# threshold the values for testing in new columns
tt['cbo'] = tt.cboof.apply(lambda x : 1 if x >=.5 else 0)
tt['hgo'] = tt.hgoof.apply(lambda x : 1 if x >=.5 else 0)
tt['lgo'] = tt.lgoof.apply(lambda x : 1 if x >=.5 else 0)

te['cbp'] = te.cbpreds.apply(lambda x : 1 if x >=.5 else 0)
te['hgp'] = te.hgpreds.apply(lambda x : 1 if x >=.5 else 0)
te['lgp'] = te.lgpreds.apply(lambda x : 1 if x >=.5 else 0)

In [12]:
# Compute each models' accuracy with the 50/50 threshold
a = accuracy_score(train[TARGET],tt.cbo)
b = accuracy_score(train[TARGET],tt.hgo)
c = accuracy_score(train[TARGET],tt.lgo)

a,b,c

(0.9402061122956645, 0.9399147121535181, 0.9399786780383795)

In [13]:
# output interim submissions for lb scoring
te[['id','cbp']].to_csv('b_set_cb.csv',index=False) #94243
te[['id','hgp']].to_csv('b_set_hg.csv',index=False) #less - not tested
te[['id','lgp']].to_csv('b_set_lg.csv',index=False) #94227


In [14]:
#hill climb with the models
best = 0
ii = 0

# test the posssibilities
for i in range(100):
    hcoof = tt['cboof'] * i/100 + tt['lgoof'] * (100-i)/100
    hcoof[hcoof >=.5] = 1
    hcoof[hcoof <.5 ] = 0
    hcscore = accuracy_score(train[TARGET],hcoof)
    if hcscore > best:
        best = hcscore
        ii = i

print(f"Best Score: {best} at {ii}%")

# apply the blend rate to an oof out
tt['blend1'] = tt.cboof * ii/100 + tt.lgoof * (100-ii)/100

# apply the blend rate and make a dataframe
te['blend1'] = te.cbpreds * ii/100 + te.lgpreds * (100-ii)/100
blend1 = te[['id','blend1']].copy()

# set up the dataframe to make a submission
blend1.columns = ['id','Depression']
blend1['Depression'] = blend1.Depression.apply(lambda x : 1 if x >=.5 else 0)
blend1.to_csv('b_set_blend1.csv', index=False)
blend1.head()

Best Score: 0.9401847903340441 at 95%


Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [15]:
!head b_set_blend1.csv

  pid, fd = os.forkpty()


id,Depression
140700,0
140701,0
140702,0
140703,1
140704,0
140705,0
140706,0
140707,0
140708,0


In [16]:
te.head()

Unnamed: 0,id,Depression,cb,hg,lg,cbpreds,hgpreds,lgpreds,cbp,hgp,lgp,blend1
0,140700,0,0.000324,0.00065,0.001785,0.000653,0.000741,0.000738,0,0,0,0.000657
1,140701,0,0.000216,0.000566,0.001207,0.000475,0.000466,0.000492,0,0,0,0.000476
2,140702,0,0.032181,0.005036,0.022442,0.044406,0.044758,0.042599,0,0,0,0.044316
3,140703,0,0.977608,0.884274,0.958822,0.979662,0.978684,0.979847,1,1,1,0.979671
4,140704,0,0.016718,0.004565,0.017693,0.021465,0.020006,0.02099,0,0,0,0.021441


In [17]:
#hill climb stage 2 with the models
best = 0
ii = 0

# test the posssibilities
for i in range(100):
    hcoof = tt['blend1'] * i/100 + tt['hgoof'] * (100-i)/100
    hcoof[hcoof >=.5] = 1
    hcoof[hcoof <.5 ] = 0
    hcscore = accuracy_score(train[TARGET],hcoof)
    if hcscore > best:
        best = hcscore
        ii = i

print(f"Best Score: {best} at {ii}%")

#apply the blend to oof
tt['blend2'] = tt.blend1 * ii/100 + tt.hgoof * (100-ii)/100

# apply the blend rate and make a dataframe
te['blend2'] = te.blend1 * ii/100 + te.hgpreds * (100-ii)/100
blend2 = te[['id','blend2']].copy()

# set up the dataframe to make a submission
blend2.columns = ['id','Depression']
blend2['Depression'] = blend2.Depression.apply(lambda x : 1 if x >=.5 else 0)
blend2.to_csv('b_set_blend2.csv', index=False)
blend2.head()

Best Score: 0.940362473347548 at 48%


Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [18]:
# now compute the thresholds for blend 1 and blend 
best, best2 = 0, 0

for i in range(100):
    a = tt['blend1'].copy()
    a[a>=i/100]=1
    a[a<i/100]=0
    score = accuracy_score(train[TARGET],a)
    if score > best:
        best = score
        ii = i
for j in range(100):
    b = tt['blend2'].copy()
    b[b>=j/100]=1
    b[b<j/100]=0
    score = accuracy_score(train[TARGET],b)
    if score > best2:
        best2 = score
        jj = j

print(f"Best of Blend 1 {best} at {ii}%")
print(f"Best of Blend 2 {best2} at {jj}%")

# These are the same values as above. 
# The Thresholding was the best these 3 models can produce.

Best of Blend 1 0.9402203269367448 at 51%
Best of Blend 2 0.940362473347548 at 50%
