In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
pd.set_option('display.max_columns', 100)
import warnings
warnings.filterwarnings("ignore")
import time
from sklearn import metrics

In [2]:
train=pd.read_csv(r'C:\Users\rishi\dataset/train.csv')
test=pd.read_csv(r'C:\Users\rishi\dataset/test.csv')

### Concatinating

In [3]:
train['train_or_test']='train'
test['train_or_test']='test'
df=pd.concat([train,test])

### Mapping

In [4]:
hospital_type_map={
 'a': 0,
 'b': 1,
 'c': 2,
 'e': 3,
 'd': 4,
 'f': 5,
 'g': 6} 

hospital_region_map = {'X': 0, 'Y': 1, 'Z': 2}

dep_map={'gynecology': 0,
 'anesthesia': 1,
 'radiotherapy': 2,
 'TB & Chest disease': 3,
 'surgery': 4}

ward_type_map ={'R': 0, 'Q': 1, 'S': 2, 'P': 3, 'T': 4, 'U': 5}
ward_fac_map ={'F':0, 'E': 1, 'D':2, 'C': 3, 'B': 4, 'A': 5}
admiss_map = {'Trauma': 0, 'Emergency': 1, 'Urgent': 2}
stay_map = {'21-30': 2,
 '11-20': 1,
 '31-40': 3,
 '51-60': 5,
 '0-10': 0,
 '41-50': 4,
 '71-80': 7,
 'More than 100 Days': 10,
 '81-90': 8,
 '91-100': 9,
 '61-70': 6}

In [5]:
df['Hospital_type_code'] = df['Hospital_type_code'].map(hospital_type_map)
df['Hospital_region_code'] = df['Hospital_region_code'].map(hospital_region_map)
df['Department'] = df['Department'].map(dep_map)
df['Ward_Type'] = df['Ward_Type'].map(ward_type_map)
df['Ward_Facility_Code'] = df['Ward_Facility_Code'].map(ward_fac_map)
df['Type of Admission'] = df['Type of Admission'].map(admiss_map)
df['Stay'] = df['Stay'].map(stay_map)

In [6]:
df['Severity of Illness'] = df['Severity of Illness'].map({'Minor': 1,
                          'Moderate': 2,
                          'Extreme': 3})

In [7]:
df['Age'] = df['Age'].map({'0-10': 5,
                          '11-20': 15,
                          '21-30': 25,
                          '31-40': 35,
                          '41-50': 45,
                          '51-60': 55,
                          '61-70': 65,
                          '71-80': 75,
                          '81-90': 85,
                          '91-100': 95
                         })

### Feature Engineering

In [8]:
df['mean_Admission_Deposit_per_illness']=df.groupby(['Severity of Illness'])['Admission_Deposit'].transform('mean')
df['mean_Admission_Deposit_per_admission']=df.groupby(['Type of Admission'])['Admission_Deposit'].transform('mean')
df['mean_Admission_Deposit_per_bed']=df.groupby(['Bed Grade'])['Admission_Deposit'].transform('mean')
df['mean_Admission_Deposit_per_department']=df.groupby(['Department'])['Admission_Deposit'].transform('mean')
df['mean_Admission_Deposit_per_hosp']=df.groupby(['Hospital_code'])['Admission_Deposit'].transform('mean')

In [9]:
df['sum_Admission_Deposit_per_illness']=df.groupby(['Severity of Illness'])['Admission_Deposit'].transform('sum')
df['sum_Admission_Deposit_per_admission']=df.groupby(['Type of Admission'])['Admission_Deposit'].transform('sum')
df['sum_Admission_Deposit_per_bed']=df.groupby(['Bed Grade'])['Admission_Deposit'].transform('sum')
df['sum_Admission_Deposit_per_department']=df.groupby(['Department'])['Admission_Deposit'].transform('sum')
df['sum_Admission_Deposit_per_hosp']=df.groupby(['Hospital_code'])['Admission_Deposit'].transform('sum')

In [10]:
df['max_Admission_Deposit_per_illness']=df.groupby(['Severity of Illness'])['Admission_Deposit'].transform('max')
df['max_Admission_Deposit_per_admission']=df.groupby(['Type of Admission'])['Admission_Deposit'].transform('max')
df['max_Admission_Deposit_per_bed']=df.groupby(['Bed Grade'])['Admission_Deposit'].transform('max')
df['max_Admission_Deposit_per_department']=df.groupby(['Department'])['Admission_Deposit'].transform('max')
df['max_Admission_Deposit_per_hosp']=df.groupby(['Hospital_code'])['Admission_Deposit'].transform('max')

In [11]:
df['min_Admission_Deposit_per_illness']=df.groupby(['Severity of Illness'])['Admission_Deposit'].transform('min')
df['min_Admission_Deposit_per_admission']=df.groupby(['Type of Admission'])['Admission_Deposit'].transform('min')
df['min_Admission_Deposit_per_bed']=df.groupby(['Bed Grade'])['Admission_Deposit'].transform('min')
df['min_Admission_Deposit_per_department']=df.groupby(['Department'])['Admission_Deposit'].transform('min')
df['min_Admission_Deposit_per_hosp']=df.groupby(['Hospital_code'])['Admission_Deposit'].transform('min')

In [12]:
df['mean_Admission_Deposit_per_patient_hosp']=df.groupby(['patientid','Hospital_code'])['Admission_Deposit'].transform('mean')
df['sum_Admission_Deposit_per_patient_hosp']=df.groupby(['patientid','Hospital_code'])['Admission_Deposit'].transform('sum')
df['max_Admission_Deposit_per_patient_hosp']=df.groupby(['patientid','Hospital_code'])['Admission_Deposit'].transform('max')
df['min_Admission_Deposit_per_patient_hosp']=df.groupby(['patientid','Hospital_code'])['Admission_Deposit'].transform('min')

In [13]:
df['illness_count']=df.groupby('Hospital_code')['Severity of Illness'].transform('count')
df['bed_count']=df.groupby('Hospital_code')['Bed Grade'].transform('count')
df['room_count']=df.groupby('Hospital_code')['Available Extra Rooms in Hospital'].transform('count')
df['Department_count']=df.groupby('Hospital_code')['Department'].transform('count')
df['ward_count']=df.groupby('Hospital_code')['Ward_Type'].transform('count')
df['TOA_count']=df.groupby('Hospital_code')['Type of Admission'].transform('count')

In [14]:
df['meanVisitors with Patient_per_patient']=df.groupby(['patientid'])['Visitors with Patient'].transform('mean')
df['meanVisitors with Patient_per_patient']=df.groupby(['patientid'])['Visitors with Patient'].transform('sum')
df['meanVisitors with Patient_per_patient']=df.groupby(['patientid'])['Visitors with Patient'].transform('max')
df['meanVisitors with Patient_per_patient']=df.groupby(['patientid'])['Visitors with Patient'].transform('min')

In [15]:
df['unique_hospital_visited']=df.groupby('patientid')['Hospital_code'].transform('nunique')
df['unique_hospitaltype_visited']=df.groupby('patientid')['Hospital_type_code'].transform('nunique')
df['count_visited_same_hospital']=df.groupby(['patientid','Hospital_code'])['Hospital_code'].transform('count')

df['mean_Admission_Deposit_per_patient']=df.groupby(['patientid'])['Admission_Deposit'].transform('mean')
df['sum_Admission_Deposit_per_patient']=df.groupby(['patientid'])['Admission_Deposit'].transform('sum')
df['max_Admission_Deposit_per_patient']=df.groupby(['patientid'])['Admission_Deposit'].transform('max')
df['min_Admission_Deposit_per_patient']=df.groupby(['patientid'])['Admission_Deposit'].transform('min')

df['mean_visitors_per_patient']=df.groupby(['patientid'])['Visitors with Patient'].transform('mean')
df['unique city']=df.groupby('patientid')['City_Code_Hospital'].transform('nunique')
df['unique illness type']=df.groupby('patientid')['Severity of Illness'].transform('nunique')
df['unique admission type']=df.groupby('patientid')['Type of Admission'].transform('nunique')

df['mean_Admission_Deposit_per_patient_in_same_hospital']=df.groupby(['patientid','Hospital_code'])['Admission_Deposit'].transform('mean')

In [16]:
df['total_no_of_patients_in_hospital']=df.groupby('Hospital_code')['patientid'].transform('count')
df['total_no_of_patients_in_hospital_from_same_city']=df.groupby(['Hospital_code','City_Code_Patient'])['patientid'].transform('count')
df['total_no_of_patients_in_patientcity']=df.groupby('City_Code_Patient')['patientid'].transform('count')
df['total_no_of_patients_in_hospitalcity']=df.groupby('City_Code_Hospital')['patientid'].transform('count')

In [17]:
df['number_of_time_patient_visited']=df.groupby('patientid')['patientid'].transform('count')
df['Total_Hospitals_in_city']=df.groupby('City_Code_Hospital')['Hospital_code'].transform('nunique')
df['Total_Hospitals_in_city_of_same_type']=df.groupby(['City_Code_Hospital','Hospital_type_code'])['Hospital_code'].transform('nunique')

In [18]:
df['total_no_of_patients_per_Department']=df.groupby('Department')['patientid'].transform('count')
df['total_no_of_patients_per_Severity of Illness']=df.groupby('Severity of Illness')['patientid'].transform('count')
df['Total_Hospitals_in_region']=df.groupby('Hospital_region_code')['patientid'].transform('count')
df['total_no_of_patients_in_hospitalward']=df.groupby(['Hospital_code','Ward_Type'])['patientid'].transform('count')
df['total_no_of_patients_in_hospitaldepartment']=df.groupby(['Hospital_code','Department'])['patientid'].transform('count')
df['total_no_of_patients_in_hospitalBedGrade']=df.groupby(['Hospital_code','Bed Grade'])['patientid'].transform('count')
df['total_no_of_patients_in_hospitalSeverityofIllness']=df.groupby(['Hospital_code','Type of Admission'])['patientid'].transform('count')
df['unique_patientcity_in_hospital']=df.groupby('Hospital_code')['City_Code_Patient'].transform('nunique')
df['Average_deposit']=df.groupby('Hospital_code')['Available Extra Rooms in Hospital'].transform('mean')
df['Total_Hospitals_in_patientcity']=df.groupby('City_Code_Patient')['Hospital_code'].transform('nunique')
df['Total_Hospitals_in_region']=df.groupby('Hospital_region_code')['Hospital_code'].transform('nunique')

### Rank features

In [19]:
df["RANK"] = df.groupby("patientid")['patientid'].rank(method="first", ascending=True)
df["RANK_avg"] = df.groupby("patientid")['patientid'].rank(method="average", ascending=True)
df["RANK_max"] = df.groupby("patientid")['patientid'].rank(method="max", ascending=True)
df["RANK_min"] = df.groupby("patientid")['patientid'].rank(method="min", ascending=True)
df["RANK_DIFF"] = df['RANK_max'] - df['RANK_min']
df["RANK_Type_Admission"] = df.groupby(['patientid','Type of Admission'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_City_Code"] = df.groupby(['patientid','City_Code_Hospital'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_hosp_Code"] = df.groupby(['patientid','Hospital_code'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_hosp_type_Code"] = df.groupby(['patientid','Hospital_type_code'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_rooms"] = df.groupby(['patientid','Available Extra Rooms in Hospital'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_Department"] = df.groupby(['patientid','Department'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_Ward_Type"] = df.groupby(['patientid','Ward_Type'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_Ward_Facility_Code"] = df.groupby(['patientid','Ward_Facility_Code'])['patientid'].rank(method='first',ascending=True)
df["RANK_Type_Hospital_region_code"] = df.groupby(['patientid','Hospital_region_code'])['patientid'].rank(method='first',ascending=True)
df['Rank_City_Code_Patient'] = df.groupby(['patientid','City_Code_Patient'])['patientid'].rank(method='first',ascending=True)
df['Rank_City_Code_Vist'] = df.groupby(['patientid','Visitors with Patient'])['patientid'].rank(method='first',ascending=True)
df['Rank_ill+admiss'] = df.groupby(['patientid','Type of Admission','Severity of Illness'])['patientid'].rank(method='first',ascending=True)
df['Rank_ill+admiss_max'] = df.groupby(['patientid','Type of Admission','Severity of Illness'])['patientid'].rank(method='max',ascending=True)
df['Rank_ill+admiss_min'] = df.groupby(['patientid','Type of Admission','Severity of Illness'])['patientid'].rank(method='min',ascending=True)
df['Rank_ill+admiss_avg'] = df.groupby(['patientid','Type of Admission','Severity of Illness'])['patientid'].rank(method='average',ascending=True)
df['Rank_ill+admiss_diff'] = df['Rank_ill+admiss_max'] - df['Rank_ill+admiss_min']
cols_rank = ['Type of Admission','City_Code_Hospital','Hospital_code','Hospital_type_code','Available Extra Rooms in Hospital','Department','Ward_Type','Ward_Facility_Code',
             'Hospital_region_code','City_Code_Patient','Visitors with Patient']
for i in cols_rank:
    df[f"RANK_{i}_min"] = df.groupby(['patientid',i])['patientid'].rank(method='min',ascending=True)
    df[f"RANK_{i}_max"] = df.groupby(['patientid',i])['patientid'].rank(method='max',ascending=True)
    df[f"RANK_{i}_avg"] = df.groupby(['patientid',i])['patientid'].rank(method='average',ascending=True)
    df[f'diff_{i}_max_min'] = df[f"RANK_{i}_max"] -  df[f"RANK_{i}_min"]

In [20]:
train=df.loc[df.train_or_test.isin(['train'])]
test=df.loc[df.train_or_test.isin(['test'])]
train.drop(columns={'train_or_test'},axis=1,inplace=True)
test.drop(columns={'train_or_test'},axis=1,inplace=True)

In [21]:
x=train.drop(columns={'case_id','Stay'},axis=1)
y=train.loc[:,['Stay']]
test=test.drop(columns={'case_id','Stay'},axis=1)

In [None]:
test.describe()

### StratifiedKFOLD

In [22]:
%%time
errcat = [] 
y_pred_tot_lgmcat = np.zeros((len(test), 11))


fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
i = 1

for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    n = CatBoostClassifier(n_estimators=1100,
                       random_state=2020,
                       eval_metric='Accuracy',
                       learning_rate=0.08,
                       depth=8,
                       bagging_temperature=0.3,
                       task_type='GPU'
                       #num_leaves=64
                       
                       )
    n.fit(x_train, y_train,eval_set=[(x_val, y_val)], early_stopping_rounds=100,verbose=200)
    pred_ycat = n.predict(x_val)
    print(i, " err_lgm: ", accuracy_score(y_val,pred_ycat))
    errcat.append(accuracy_score(y_val,pred_ycat))
    print("Accuracy:",metrics.explained_variance_score(y_val, pred_ycat))
    print("MAE: ", metrics.mean_absolute_error(y_val, pred_ycat))
    print("MSE: ", metrics.mean_squared_error(y_val, pred_ycat))
    print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_val, pred_ycat)))
    y_pred_tot_lgmcat+= n.predict(test)
    i = i + 1


0:	learn: 0.3775995	test: 0.3782664	best: 0.3782664 (0)	total: 471ms	remaining: 8m 37s
200:	learn: 0.4544975	test: 0.4318356	best: 0.4323136 (197)	total: 1m 3s	remaining: 4m 44s
400:	learn: 0.4753499	test: 0.4377310	best: 0.4377310 (382)	total: 2m 3s	remaining: 3m 34s
600:	learn: 0.4935679	test: 0.4398343	best: 0.4403760 (593)	total: 3m 2s	remaining: 2m 31s
800:	learn: 0.5126570	test: 0.4416507	best: 0.4416507 (799)	total: 4m 2s	remaining: 1m 30s
bestTest = 0.4424474187
bestIteration = 814
Shrink model to first 815 iterations.
1  err_lgm:  0.4424474187380497
Accuracy: 0.44762148000822044
MAE:  1.0096876991714467
MSE:  2.6577437858508604
RMSE:  1.6302588094688708
0:	learn: 0.3779819	test: 0.3749522	best: 0.3749522 (0)	total: 345ms	remaining: 6m 18s
200:	learn: 0.4540655	test: 0.4380178	best: 0.4386233 (191)	total: 1m 3s	remaining: 4m 44s
400:	learn: 0.4754278	test: 0.4425749	best: 0.4426068 (395)	total: 2m 3s	remaining: 3m 34s
600:	learn: 0.4955119	test: 0.4437540	best: 0.4443913 (571)	

In [23]:
y_pred_tot_lgmcat=y_pred_tot_lgmcat/10
sum(errcat)/10

0.4431552030337948

In [24]:
44.2324

44.2324

In [25]:
params = {}
params['learning_rate'] = 0.08
params['max_depth'] = 8
params['n_estimators'] = 1100
params['objective'] = 'multiclass'
params['boosting_type'] = 'gbdt'
params['subsample'] = 0.7
params['random_state'] = 2020
params['colsample_bytree']=0.7
params['min_data_in_leaf'] = 100
params['reg_alpha'] = 1.6
params['reg_lambda'] = 1.1
params['num_leaves'] = 64

In [26]:
%%time
err = [] 


fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2020)
i = 1

for train_index, test_index in fold.split(x, y):
    x_train, x_val = x.iloc[train_index], x.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = lgb.LGBMClassifier(**params)
    m.fit(x_train, y_train,eval_set=[(x_val, y_val)], eval_metric='multi_error', early_stopping_rounds=100,verbose=200)
    pred_y = m.predict(x_val)
    print(i, " err_lgm: ", accuracy_score(y_val,pred_y))
    err.append(accuracy_score(y_val,pred_y))
    print("Accuracy:",metrics.explained_variance_score(y_val, pred_y))
    print("MAE: ", metrics.mean_absolute_error(y_val, pred_y))
    print("MSE: ", metrics.mean_squared_error(y_val, pred_y))
    print("RMSE: ", np.sqrt(metrics.mean_squared_error(y_val, pred_y)))
    i = i + 1
sum(err)/10

Training until validation scores don't improve for 100 rounds
[200]	valid_0's multi_error: 0.555991	valid_0's multi_logloss: 1.44701
Early stopping, best iteration is:
[240]	valid_0's multi_error: 0.554621	valid_0's multi_logloss: 1.44535
1  err_lgm:  0.44537922243467176
Accuracy: 0.4505951214202816
MAE:  1.004493307839388
MSE:  2.6334289356277885
RMSE:  1.6227843158065671
Training until validation scores don't improve for 100 rounds
[200]	valid_0's multi_error: 0.555672	valid_0's multi_logloss: 1.44712
[400]	valid_0's multi_error: 0.552772	valid_0's multi_logloss: 1.44348
Early stopping, best iteration is:
[353]	valid_0's multi_error: 0.554525	valid_0's multi_logloss: 1.44312
2  err_lgm:  0.4454748247291268
Accuracy: 0.44262744451422675
MAE:  1.0081580624601658
MSE:  2.6709369024856597
RMSE:  1.6343001261964278
Training until validation scores don't improve for 100 rounds
[200]	valid_0's multi_error: 0.558349	valid_0's multi_logloss: 1.45001
[400]	valid_0's multi_error: 0.554844	valid

0.44671488592223235

In [27]:
sum(err)/10

0.44671488592223235

In [28]:
44.5854

44.5854