In [1]:
# import the libraries
import tensorflow
import keras
from sklearn.metrics import accuracy_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder as OE
from catboost import CatBoostClassifier, Pool

In [2]:
# load all the data
train = pd.read_csv('//kaggle/input/playground-series-s4e11/train.csv')
cbo = pd.read_csv('/kaggle/input/s4e11-inference-a/cboof.csv')
hgo = pd.read_csv('/kaggle/input/s4e11-inference-a/hgoof.csv')
lgo = pd.read_csv('/kaggle/input/s4e11-inference-a/lgoof.csv')
ggo = pd.read_csv('/kaggle/input/ps4e11-mental-health-prediction-classification/oof_predss_v22.csv')

test = pd.read_csv('/kaggle/input/playground-series-s4e11/test.csv')
cbt = pd.read_csv('/kaggle/input/s4e11-inference-a/cbsubprobs.csv')
hgt = pd.read_csv('/kaggle/input/s4e11-inference-a/hgsubprobs.csv')
lgt = pd.read_csv('/kaggle/input/s4e11-inference-a/lgsubprobs.csv')
ggt = pd.read_csv('/kaggle/input/ps4e11-mental-health-prediction-classification/submission_pred_v22.csv')

In [3]:
# set the target
TARGET = 'Depression'

# make a dataframe of the baselines and predictions
tt = train[['id',TARGET]].copy()
tt['cb']= cbo[TARGET]
tt['hg']= hgo[TARGET]
tt['lg']= lgo[TARGET]
tt['gg']= ggo['final_threshold']


te = pd.read_csv('/kaggle/input/playground-series-s4e11/sample_submission.csv')
te['cb']= cbt[TARGET]
te['hg']= hgt[TARGET]
te['lg']= lgt[TARGET]
te['gg']= ggt[TARGET]

In [4]:
tt.head()

Unnamed: 0,id,Depression,cb,hg,lg,gg
0,0,0,0.000483,0.000869,0.001523,0.000558
1,1,1,0.66699,0.497017,0.778344,0.7088
2,2,1,0.529499,0.216119,0.344253,0.475249
3,3,1,0.911831,0.625092,0.891852,0.91552
4,4,0,0.425229,0.097551,0.293054,0.36768


In [5]:
te.head()

Unnamed: 0,id,Depression,cb,hg,lg,gg
0,140700,0,0.000358,0.000696,0.001785,0.000618
1,140701,0,0.000188,0.000607,0.001207,0.000322
2,140702,0,0.031073,0.006772,0.022442,0.014303
3,140703,0,0.97778,0.885894,0.958822,0.975888
4,140704,0,0.016685,0.004565,0.017693,0.014672


In [6]:
# Prepare Cat, Numerical, and Both Columns
trc = list(train.columns)
tec = list(test.columns)
NUM_COLS, CAT_COLS = [],[]
for i in trc:
    if i not in tec:
        TARGET = i
    elif train[i].dtype in [int,float]:
        NUM_COLS.append(i)
    else:
        CAT_COLS.append(i)

# test that they add up
print((len(CAT_COLS) + len(NUM_COLS) + 1) == len(trc))

# make BOTH for CV ease
BOTH = NUM_COLS + CAT_COLS

# remove the ids
NUM_COLS.remove('id')

# filter out rare names
nns = train.Name.value_counts()
nns = nns.loc[nns<2] 
train.loc[train.index[train.Name.isin(nns.index)],'Name'] ='rare'
train.Name.nunique()

# ordinal encode the text
oe = OE( handle_unknown = 'use_encoded_value', unknown_value= -1, encoded_missing_value= -2)
oe.fit(train[CAT_COLS])
train[CAT_COLS] = oe.transform(train[CAT_COLS])
test[CAT_COLS] = oe.transform(test[CAT_COLS])

# Move up the numbers to 0 and make them numpy integers
for i in CAT_COLS:
    train[i] = train[i].astype(np.int64) + 2   
    test[i] = test[i].astype(np.int64) + 2
    print(f"{i}: ",train[i].isna().sum(),test[i].isna().sum())

True
Name:  0 0
Gender:  0 0
City:  0 0
Working Professional or Student:  0 0
Profession:  0 0
Sleep Duration:  0 0
Dietary Habits:  0 0
Degree:  0 0
Have you ever had suicidal thoughts ?:  0 0
Family History of Mental Illness:  0 0


In [7]:
# threshold the values for testing in new columns
tt['cbo'] = tt.cb.apply(lambda x : 1 if x >=.5 else 0)
tt['hgo'] = tt.hg.apply(lambda x : 1 if x >=.5 else 0)
tt['lgo'] = tt.lg.apply(lambda x : 1 if x >=.5 else 0)
tt['ggo'] = tt.gg.apply(lambda x : 1 if x >=.5 else 0)

te['cbp'] = te.cb.apply(lambda x : 1 if x >=.5 else 0)
te['hgp'] = te.hg.apply(lambda x : 1 if x >=.5 else 0)
te['lgp'] = te.lg.apply(lambda x : 1 if x >=.5 else 0)
te['ggp'] = te.gg.apply(lambda x : 1 if x >=.5 else 0)

In [8]:
# Compute each models' accuracy with the 50/50 threshold
a = accuracy_score(train[TARGET],tt.cbo)
b = accuracy_score(train[TARGET],tt.hgo)
c = accuracy_score(train[TARGET],tt.lgo)
d = accuracy_score(train[TARGET],tt.ggo)
a,b,c, d

(0.9402416488983654,
 0.9244633972992182,
 0.9384719260838664,
 0.9403340440653873)

In [9]:
# output interim submissions for lb scoring
te[['id','cbp']].to_csv('b_set_cb.csv',index=False) #94243
te[['id','hgp']].to_csv('b_set_hg.csv',index=False) #less - not tested
te[['id','lgp']].to_csv('b_set_lg.csv',index=False) #94227
te[['id','ggp']].to_csv('b_set_lg.csv',index=False) #94227


In [10]:
#hill climb with the models
best = 0
ii = 0

# test the posssibilities
for i in range(100):
    hcoof = tt['cb'] * i/100 + tt['gg'] * (100-i)/100
    hcoof[hcoof >=.5] = 1
    hcoof[hcoof <.5 ] = 0
    hcscore = accuracy_score(train[TARGET],hcoof)
    if hcscore > best:
        best = hcscore
        ii = i

print(f"Best Score: {best} at {ii}%")

# apply the blend rate to an oof out
tt['blend1'] = tt.cb * ii/100 + tt.gg * (100-ii)/100

# apply the blend rate and make a dataframe
te['blend1'] = te.cb * ii/100 + te.gg * (100-ii)/100
blend1 = te[['id','blend1']].copy()

# set up the dataframe to make a submission
blend1.columns = ['id','Depression']
blend1['Depression'] = blend1.Depression.apply(lambda x : 1 if x >=.5 else 0)
blend1.to_csv('b_set_blend1.csv', index=False)
blend1.head()

Best Score: 0.9406112295664535 at 40%


Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [11]:
!head b_set_blend1.csv

  pid, fd = os.forkpty()


id,Depression
140700,0
140701,0
140702,0
140703,1
140704,0
140705,0
140706,0
140707,0
140708,0


In [12]:
te.head()

Unnamed: 0,id,Depression,cb,hg,lg,gg,cbp,hgp,lgp,ggp,blend1
0,140700,0,0.000358,0.000696,0.001785,0.000618,0,0,0,0,0.000514
1,140701,0,0.000188,0.000607,0.001207,0.000322,0,0,0,0,0.000268
2,140702,0,0.031073,0.006772,0.022442,0.014303,0,0,0,0,0.021011
3,140703,0,0.97778,0.885894,0.958822,0.975888,1,1,1,1,0.976645
4,140704,0,0.016685,0.004565,0.017693,0.014672,0,0,0,0,0.015477


In [13]:
#hill climb stage 2 with the models
best = 0
ii = 0

# test the posssibilities
for i in range(100):
    hcoof = tt['blend1'] * i/100 + tt['lg'] * (100-i)/100
    hcoof[hcoof >=.5] = 1
    hcoof[hcoof <.5 ] = 0
    hcscore = accuracy_score(train[TARGET],hcoof)
    if hcscore > best:
        best = hcscore
        ii = i

print(f"Best Score: {best} at {ii}%")

#apply the blend to oof
tt['blend2'] = tt.blend1 * ii/100 + tt.lg * (100-ii)/100

# apply the blend rate and make a dataframe
te['blend2'] = te.blend1 * ii/100 + te.lg * (100-ii)/100
blend2 = te[['id','blend2']].copy()

# set up the dataframe to make a submission
blend2.columns = ['id','Depression']
blend2['Depression'] = blend2.Depression.apply(lambda x : 1 if x >=.5 else 0)
blend2.to_csv('b_set_blend2.csv', index=False)
blend2.head()

Best Score: 0.9405756929637527 at 85%


Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0


In [14]:
# # now compute the thresholds for blend 1 and blend 
# best, best2 = 0, 0

# for i in range(100):
#     a = tt['blend1'].copy()
#     a[a>=i/100]=1
#     a[a<i/100]=0
#     score = accuracy_score(train[TARGET],a)
#     if score > best:
#         best = score
#         ii = i
# for j in range(100):
#     b = tt['blend2'].copy()
#     b[b>=j/100]=1
#     b[b<j/100]=0
#     score = accuracy_score(train[TARGET],b)
#     if score > best2:
#         best2 = score
#         jj = j

# print(f"Best of Blend 1 {best} at {ii}%")
# print(f"Best of Blend 2 {best2} at {jj}%")

# # These are the same values as above. 
# # The Thresholding was the best these 3 models can produce.