In [1]:
import pandas as pd
import numpy as np
import io
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


In [3]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
train['source'] = 'train'
test['source'] = 'test'

In [5]:
df = pd.concat([train, test])

In [6]:
df['Stay'] = df['Stay'].replace({'0-10' : 0,
                                     '11-20' : 1,
                                     '21-30': 2,
                                     '31-40': 3,
                                     '41-50': 4,
                                     '51-60': 5,
                                     '61-70': 6, 
                                    '71-80': 7,
                                      '81-90': 8,
                                      '91-100': 9,
                                      'More than 100 Days' : 10
                                     })

### df.info()

In [7]:
df['Hospital_type_code'].value_counts()

a    204730
b     98884
c     66147
e     35428
d     29048
f     15252
g      6006
Name: Hospital_type_code, dtype: int64

In [8]:
df['Hospital_type_code'] = df['Hospital_type_code'].replace({'a' : 0,
                                     'b' : 1,
                                     'c': 2,
                                     'd': 3,
                                     'e': 4,
                                     'f': 5,
                                     'g': 6
                                     })

In [9]:
df['Hospital_region_code'].value_counts()

X    190849
Y    174707
Z     89939
Name: Hospital_region_code, dtype: int64

In [10]:
df['Hospital_region_code'] = df['Hospital_region_code'].replace({'X' : 0,
                                     'Y' : 1,
                                     'Z': 2})

In [11]:
df['Department'].value_counts()

gynecology            356688
anesthesia             42358
radiotherapy           41033
TB & Chest disease     13751
surgery                 1665
Name: Department, dtype: int64

In [12]:
dept = pd.get_dummies(df['Department'])

In [13]:
df = pd.concat([df, dept], axis = 1)

In [14]:
df.shape

(455495, 24)

In [15]:
df['Ward_Type'].value_counts()

R    182939
Q    152046
S    111166
P      7199
T      2133
U        12
Name: Ward_Type, dtype: int64

In [16]:
df['Ward_Type'] = df['Ward_Type'].replace({'R' : 0,
                                    'Q' : 1,
                                    'S' : 2,
                                    'P' : 3,
                                    'T' : 4,
                                     'U': 5})

In [17]:
del df['Department']

In [18]:
df['Ward_Facility_Code'].value_counts()

F    161470
E     79058
D     74312
C     50279
B     50116
A     40260
Name: Ward_Facility_Code, dtype: int64

In [19]:
df['Bed Grade'].value_counts()

2.0    176451
3.0    158942
4.0     82387
1.0     37567
Name: Bed Grade, dtype: int64

In [20]:
df['Ward_Facility_Code'] = df['Ward_Facility_Code'].replace({'A' : 0,
                                    'B' : 1,
                                    'C' : 2,
                                    'D' : 3,
                                    'E' : 4,
                                     'F': 5})

In [21]:
df['Bed Grade'] = df['Bed Grade'].transform(lambda x: x.fillna('2.0'))

In [22]:
df['Type of Admission'].value_counts()  

Trauma       217672
Emergency    168363
Urgent        69460
Name: Type of Admission, dtype: int64

In [23]:
adtype = pd.get_dummies(df['Type of Admission'])

In [24]:
df = pd.concat([df, adtype], axis = 1)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455495 entries, 0 to 137056
Data columns (total 26 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            455495 non-null  int64  
 1   Hospital_code                      455495 non-null  int64  
 2   Hospital_type_code                 455495 non-null  int64  
 3   City_Code_Hospital                 455495 non-null  int64  
 4   Hospital_region_code               455495 non-null  int64  
 5   Available Extra Rooms in Hospital  455495 non-null  int64  
 6   Ward_Type                          455495 non-null  int64  
 7   Ward_Facility_Code                 455495 non-null  int64  
 8   Bed Grade                          455495 non-null  object 
 9   patientid                          455495 non-null  int64  
 10  City_Code_Patient                  448806 non-null  float64
 11  Type of Admission                  4554

In [26]:
del df['Type of Admission'] 

In [27]:
df['Severity of Illness'].value_counts()

Moderate    251565
Minor       122735
Extreme      81195
Name: Severity of Illness, dtype: int64

In [28]:
sev = pd.get_dummies(df['Severity of Illness'])

In [29]:
df = pd.concat([df, sev], axis = 1)

In [30]:
df.shape

(455495, 28)

In [31]:
del df['Severity of Illness'] 

In [32]:
df['Age'].value_counts()

41-50     91495
31-40     90420
51-60     69506
21-30     58560
71-80     50737
61-70     48619
11-20     23871
81-90     11240
0-10       9140
91-100     1907
Name: Age, dtype: int64

In [33]:
df['Age'] = df['Age'].replace({'0-10' : 0,
                                     '11-20' : 1,
                                     '21-30': 2,
                                     '31-40': 3,
                                     '41-50': 4,
                                     '51-60': 5,
                                     '61-70': 6, 
                                    '71-80': 7,
                                      '81-90': 8,
                                      '91-100': 9,
                                     })

In [34]:
df['City_Code_Patient'] = df.groupby(['Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Hospital_code']).City_Code_Patient.transform(lambda x: x.fillna(x.median()))

In [35]:
Train = df[df['source'] == 'train']

In [36]:
Test = df[df['source'] == 'test']

In [37]:
Train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Age,Admission_Deposit,Stay,source,TB & Chest disease,anesthesia,gynecology,radiotherapy,surgery,Emergency,Trauma,Urgent,Extreme,Minor,Moderate
0,1,8,2,3,2,3,0,5,2.0,31397,7.0,2,5,4911.0,0.0,train,0,0,0,1,0,1,0,0,1,0,0
1,2,2,2,5,2,2,2,5,2.0,31397,7.0,2,5,5954.0,4.0,train,0,0,0,1,0,0,1,0,1,0,0
2,3,10,4,1,0,2,2,4,2.0,31397,7.0,2,5,4745.0,3.0,train,0,1,0,0,0,0,1,0,1,0,0
3,4,26,1,2,1,2,0,3,2.0,31397,7.0,2,5,7272.0,4.0,train,0,0,0,1,0,0,1,0,1,0,0
4,5,26,1,2,1,2,2,3,2.0,31397,7.0,2,5,5558.0,4.0,train,0,0,0,1,0,0,1,0,1,0,0


In [38]:
Test.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Visitors with Patient,Age,Admission_Deposit,Stay,source,TB & Chest disease,anesthesia,gynecology,radiotherapy,surgery,Emergency,Trauma,Urgent,Extreme,Minor,Moderate
0,318439,21,2,3,2,3,2,0,2.0,17006,2.0,2,7,3095.0,,test,0,0,1,0,0,1,0,0,0,0,1
1,318440,29,0,4,0,2,2,5,2.0,17006,2.0,4,7,4018.0,,test,0,0,1,0,0,0,1,0,0,0,1
2,318441,26,1,2,1,3,1,3,4.0,17006,2.0,3,7,4492.0,,test,0,0,1,0,0,1,0,0,0,0,1
3,318442,6,0,6,0,3,1,5,2.0,17006,2.0,3,7,4173.0,,test,0,0,1,0,0,0,1,0,0,0,1
4,318443,28,1,11,0,2,0,5,2.0,17006,2.0,4,7,4161.0,,test,0,0,1,0,0,0,1,0,0,0,1


In [39]:
Train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318438 entries, 0 to 318437
Data columns (total 27 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  int64  
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  int64  
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Ward_Type                          318438 non-null  int64  
 7   Ward_Facility_Code                 318438 non-null  int64  
 8   Bed Grade                          318438 non-null  object 
 9   patientid                          318438 non-null  int64  
 10  City_Code_Patient                  318438 non-null  float64
 11  Visitors with Patient              3184

In [40]:
del Train['source']
del Test['source']
del Test['Stay']
del Train['case_id']
del Test['case_id']

In [41]:
yTrain = Train.pop('Stay')

In [42]:
sc = StandardScaler()
sTrain = sc.fit_transform(Train)
sTest = sc.fit_transform(Test)

In [47]:
model_params = {
    'LGBM': {
        'model': LGBMClassifier(),
        'params' : {
            'n_estimators': [10, 100, 200, 500],
            'num_leaves': [10, 50, 100, 200, 500],
            'max_depth': [5, 10, 15, 20, 25]
            
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
           'n_estimators': [50, 100, 150, 500],
            'min_samples_leaf': [1, 2, 3, 4],
            'max_depth': [5, 10, 15, 20, 25]
        }
    },
     'cat': {
        'model': CatBoostClassifier(eval_metric='Accuracy'),
        'params' : {
            'n_estimators': [10, 100, 200, 500],
            'max_depth': [5, 10, 15, 20, 25] 
        }
    },
       'xgb': {
        'model': XGBClassifier(),
        'params' : {
            'n_estimators': [10, 100, 200, 500],
            'max_depth': [5, 10, 15, 20, 25],
            'min_child_weight' : [ 1, 3, 5, 7 ],
            'gamma'            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
            'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
        }
    },
}

In [48]:
scores = []

for model_name, mp in model_params.items():
    clf = RandomizedSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False, verbose = 1)
    clf.fit(sTrain, yTrain)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 48.6min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 62.4min finished


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Learning rate set to 0.40516
0:	learn: 0.4283393	total: 2m 35s	remaining: 8h 36m 21s
1:	learn: 0.4425131	total: 5m 9s	remaining: 8h 29m 57s
2:	learn: 0.4552315	total: 7m 56s	remaining: 8h 41m 30s
3:	learn: 0.4632770	total: 10m 48s	remaining: 8h 49m 46s
4:	learn: 0.4665461	total: 13m 37s	remaining: 8h 51m 24s
5:	learn: 0.4706301	total: 15m 56s	remaining: 8h 35m 29s
6:	learn: 0.4743090	total: 18m 18s	remaining: 8h 24m 54s
7:	learn: 0.4837582	total: 21m 8s	remaining: 8h 27m 14s
8:	learn: 0.4940130	total: 23m 39s	remaining: 8h 21m 57s
9:	learn: 0.5022940	total: 26m 18s	remaining: 8h 19m 47s
10:	learn: 0.5065099	total: 28m 57s	remaining: 8h 17m 28s


KeyboardInterrupt: 

In [46]:
scores 

[{'model': 'LGBM',
  'best_score': 0.4196013038644885,
  'best_params': {'num_leaves': 100, 'n_estimators': 200, 'max_depth': 5}}]

In [34]:
xgb = XGBClassifier(max_depth=6, n_estimators=600)

In [36]:
xgb.fit(df1,yTrain)

NameError: name 'yTrain' is not defined

In [77]:
xpred= xgb.predict(sTest)
acc = accuracy_score(yTrain,xgb.predict(sTrain))*100
acc

58.611409442340424

In [78]:
xpred = pd.DataFrame(xpred, columns =['Stay'])

In [79]:
xpred['Stay'] = xpred['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })

In [53]:
sample = pd.read_csv('sample.csv')

In [81]:
sample.head()

Unnamed: 0,case_id,Stay
0,318439,0-10
1,318440,0-10
2,318441,0-10
3,318442,0-10
4,318443,0-10


In [82]:
sample['Stay'] = xpred

In [83]:
sample.to_csv('out22.csv')

In [49]:
catboost = CatBoostClassifier(eval_metric='Accuracy', max_depth=5, n_estimators=500, verbose=10 )
catboost.fit( sTrain, yTrain )   
 
 

Learning rate set to 0.190699
0:	learn: 0.3438220	total: 440ms	remaining: 3m 39s
10:	learn: 0.3933858	total: 4.56s	remaining: 3m 22s
20:	learn: 0.4045466	total: 8.26s	remaining: 3m 8s
30:	learn: 0.4093450	total: 12s	remaining: 3m 1s
40:	learn: 0.4134525	total: 15.9s	remaining: 2m 57s
50:	learn: 0.4160150	total: 19.7s	remaining: 2m 53s
60:	learn: 0.4177862	total: 23.5s	remaining: 2m 48s
70:	learn: 0.4196296	total: 27.4s	remaining: 2m 45s
80:	learn: 0.4207412	total: 31.1s	remaining: 2m 40s
90:	learn: 0.4218843	total: 34.7s	remaining: 2m 36s
100:	learn: 0.4228327	total: 38.6s	remaining: 2m 32s
110:	learn: 0.4242427	total: 42.3s	remaining: 2m 28s
120:	learn: 0.4251283	total: 46.1s	remaining: 2m 24s
130:	learn: 0.4257658	total: 50s	remaining: 2m 20s
140:	learn: 0.4263624	total: 53.8s	remaining: 2m 16s
150:	learn: 0.4269842	total: 57.7s	remaining: 2m 13s
160:	learn: 0.4276562	total: 1m 1s	remaining: 2m 9s
170:	learn: 0.4282089	total: 1m 5s	remaining: 2m 6s
180:	learn: 0.4289532	total: 1m 9s	

<catboost.core.CatBoostClassifier at 0x29109ab4910>

In [50]:
pred = catboost.predict(sTest)

In [51]:
tpred = catboost.predict(sTrain)
print(accuracy_score(yTrain,tpred)*100)
pred1 = pd.DataFrame(pred, columns =['Stay'])

44.13198173584811


In [54]:
tpred = catboost.predict(sTrain)
print(accuracy_score(yTrain,tpred)*100)
pred1 = pd.DataFrame(pred, columns =['Stay'])
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })
sample['Stay'] = pred1
sample.to_csv('out33.csv')

44.13198173584811


In [278]:
pred1 = pd.DataFrame(pred, columns =['Stay'])

In [297]:
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })

In [298]:
sample['Stay'] = pred1

In [299]:
sample.to_csv('out6.csv')

In [284]:
select_top = SelectKBest(f_classif, k = 10)
x_train_new = select_top.fit_transform(Train, yTrain)
print('Top train features', Train.columns.values[select_top.get_support()])

Top train features ['Available Extra Rooms in Hospital' 'Ward_Type' 'Bed Grade'
 'Visitors with Patient' 'Age' 'Admission_Deposit' 'Emergency' 'Trauma'
 'Extreme' 'Minor']


Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'patientid',
       'City_Code_Patient', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'TB & Chest disease', 'anesthesia', 'gynecology',
       'radiotherapy', 'surgery', 'Emergency', 'Trauma', 'Urgent', 'Extreme',
       'Minor', 'Moderate'],
      dtype='object')

In [46]:
dTrain = Train
dTest = Test

In [47]:
del dTrain['case_id']
del dTrain['Hospital_code']
del dTrain['City_Code_Hospital']
del dTrain['Hospital_region_code']
del dTrain['patientid']
del dTrain['City_Code_Patient']
del dTest['case_id']
del dTest['Hospital_code']
del dTest['City_Code_Hospital']
del dTest['Hospital_region_code']
del dTest['patientid']
del dTest['City_Code_Patient']

In [48]:
sc = RobustScaler()
sdTrain = sc.fit_transform(dTrain)
sdTest = sc.fit_transform(dTest)

In [294]:
catboost = CatBoostClassifier(eval_metric='Accuracy', max_depth=4, n_estimators=1200, verbose=100)
catboost.fit( sdTrain, yTrain )  

Learning rate set to 0.091006
0:	learn: 0.3590526	total: 445ms	remaining: 8m 53s
100:	learn: 0.4047130	total: 33.9s	remaining: 6m 8s
200:	learn: 0.4098317	total: 1m 4s	remaining: 5m 22s
300:	learn: 0.4129784	total: 1m 35s	remaining: 4m 45s
400:	learn: 0.4147181	total: 2m 6s	remaining: 4m 12s
500:	learn: 0.4163793	total: 2m 37s	remaining: 3m 39s
600:	learn: 0.4175915	total: 3m 9s	remaining: 3m 8s
700:	learn: 0.4187032	total: 3m 40s	remaining: 2m 36s
800:	learn: 0.4193878	total: 4m 11s	remaining: 2m 5s
900:	learn: 0.4204900	total: 4m 43s	remaining: 1m 34s
1000:	learn: 0.4213316	total: 5m 14s	remaining: 1m 2s
1100:	learn: 0.4221010	total: 5m 45s	remaining: 31.1s
1199:	learn: 0.4229018	total: 6m 16s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2552da43c10>

In [55]:
lg = LGBMClassifier(n_estimators=500,num_leaves=10,max_depth=25)
lg.fit(sTrain,yTrain)
lgp = lg.predict(sTrain)
ac = 100 * accuracy_score(yTrain,lgp)
print('Accuracy Score ', ac)

Accuracy Score  44.882206269352274


In [295]:
pred = catboost.predict(sdTest)

In [296]:
tpred = catboost.predict(sdTrain)
print(accuracy_score(yTrain,tpred)*100)
pred1 = pd.DataFrame(pred, columns =['Stay'])

42.29017893593101


In [311]:
lg = LGBMClassifier(n_estimators=480,num_leaves=200,max_depth=20)

lg.fit(sTrain,yTrain)
lgp = lg.predict(sTrain)

ac = 100 * accuracy_score(yTrain,lgp)
print('Accuracy Score ', ac)

Accuracy Score  81.65137326575346


In [56]:
pred = lg.predict(sTest)

In [57]:
pred1 = pd.DataFrame(pred, columns =['Stay'])
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })
sample['Stay'] = pred1
sample.to_csv('out34.csv')

In [59]:
rf = RandomForestClassifier(n_estimators=500, min_samples_leaf= 4, max_depth= 15)



In [60]:
rf.fit(sTrain,yTrain)
rfp = rf.predict(sTrain)

ac = 100 * accuracy_score(yTrain,rfp)
print('Accuracy Score ', ac)


Accuracy Score  51.54912416231732


In [61]:
pred = rf.predict(sTest)
pred1 = pd.DataFrame(pred, columns =['Stay'])
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })
sample['Stay'] = pred1
sample.to_csv('out35.csv')

In [62]:
xgb = XGBClassifier(n_estimators= 200, min_child_weight= 1, max_depth= 5, gamma= 0.0, colsample_bytree= 0.5)

xgb.fit(sTrain,yTrain)
xgbY = xgb.predict(sTrain)

ac = 100 * accuracy_score(yTrain,xgbY)
print('Accuracy Score ', ac)

Accuracy Score  45.6311746713646


In [328]:
rf = LogisticRegression(max_iter=500) 

rf.fit(sdTrain,yTrain)
rfp = rf.predict(sdTrain)

ac = 100 * accuracy_score(yTrain,rfp)
print('Accuracy Score ', ac)

Accuracy Score  36.651718701913715


In [63]:
pred = xgb.predict(sTest)
pred1 = pd.DataFrame(pred, columns =['Stay'])
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })
sample['Stay'] = pred1
sample.to_csv('out36.csv')

In [334]:
rf = DecisionTreeClassifier() 

rf.fit(sTrain,yTrain)
rfp = rf.predict(sTrain)

ac = 100 * accuracy_score(yTrain,rfp)
print('Accuracy Score ', ac)

Accuracy Score  100.0


In [335]:
pred = rf.predict(sTest)
pred1 = pd.DataFrame(pred, columns =['Stay'])
pred1['Stay'] = pred1['Stay'].replace({0.0 : '0-10',
                                     1.0 : '11-20',
                                     2.0 :'21-30',
                                     3.0 :'31-40',
                                     4.0 :'41-50',
                                     5.0 :'51-60',
                                     6.0 :'61-70', 
                                     7.0 :'71-80',
                                     8.0 : '81-90',
                                     9.0 : '91-100',
                                     10.0 : 'More than 100 Days' 
                                     })
sample['Stay'] = pred1
sample.to_csv('out12.csv')

In [None]:
svm = SVC(kernel='rbf', C=1, gamma='auto')
svm.fit(sTrain, yTrain)
svc = svm.predict(sTrain)

sac = 100 * accuracy_score(yTrain,svc)
print('Accuracy Score ', sac)

In [38]:
from autoviml.Auto_ViML import Auto_ViML
target = 'Stay'

Imported Auto_ViML version: 0.1.682. Call using:
             m, feats, trainm, testm = Auto_ViML(train, target, test,
                            sample_submission='',
                            scoring_parameter='', KMeans_Featurizer=False,
                            hyper_param='RS',feature_reduction=True,
                             Boosting_Flag='CatBoost', Binning_Flag=False,
                            Add_Poly=0, Stacking_Flag=False,Imbalanced_Flag=False,
                            verbose=1)
            

Imported Auto_NLP version: 0.0.46.. Call using:
     train_nlp, test_nlp, nlp_pipeline, predictions = Auto_NLP(
                nlp_column, train, test, target, score_type='balanced_accuracy',
                modeltype='Classification',top_num_features=200, verbose=0,
                build_model=True)


In [39]:
model, features, trainm, testm = Auto_ViML(df1, target, df2, sample_submission='',
                                           scoring_parameter = 'balanced_Accuracy',
                                          hyper_param = 'RS',
                                          feature_reduction=True,
                                          Boosting_Flag= True,
                                          Binning_Flag=True,
                                          Add_Poly=0,
                                          Stacking_Flag=True,
                                          Imbalanced_Flag=True,
                                          verbose=1)

AttributeError: 'DataFrameLocal' object has no attribute 'index'

In [46]:
model

CalibratedClassifierCV(base_estimator=OneVsRestClassifier(estimator=XGBClassifier(base_score=None,
                                                                                  booster='gbtree',
                                                                                  colsample_bylevel=None,
                                                                                  colsample_bynode=None,
                                                                                  colsample_bytree=None,
                                                                                  gamma=None,
                                                                                  gpu_id=None,
                                                                                  importance_type='gain',
                                                                                  interaction_constraints=None,
                                                                              

In [47]:
features

['Visitors with Patient',
 'Ward_Type',
 'Type of Admission',
 'Available Extra Rooms in Hospital',
 'Severity of Illness',
 'Hospital_type_code',
 'Hospital_region_code',
 'City_Code_Hospital',
 'Ward_Facility_Code',
 'Hospital_code',
 'Department',
 'Age',
 'patientid',
 'City_Code_Patient_Missing_Flag',
 'Bed Grade_Missing_Flag',
 'Bed Grade_bin',
 'Linear Discriminant_0',
 'Linear Discriminant_1']

In [48]:
testm.head()

Unnamed: 0,case_id,Visitors with Patient,Ward_Type,Type of Admission,Available Extra Rooms in Hospital,Severity of Illness,Hospital_type_code,Hospital_region_code,City_Code_Hospital,Ward_Facility_Code,Hospital_code,Department,Age,patientid,City_Code_Patient_Missing_Flag,Bed Grade_Missing_Flag,Bed Grade_bin,Linear Discriminant_0,Linear Discriminant_1,Stay_proba_21-30,Stay_proba_11-20,Stay_proba_31-40,Stay_proba_51-60,Stay_proba_0-10,Stay_proba_41-50,Stay_proba_71-80,Stay_proba_More than 100 Days,Stay_proba_81-90,Stay_proba_91-100,Stay_proba_61-70,Stay_predictions,Stay_Stacked_Linear Discriminant_predictions
0,318439,0.06,0.6,0.0,0.12,1.0,0.33,1.0,0.17,0.0,0.65,0.5,0.78,0.13,0.0,0.0,0.33,0.98,0.02,0.33,0.26,0.13,0.06,0.14,0.04,0.01,0.01,0.01,0.01,0.01,21-30,21-30
1,318440,0.12,0.6,0.5,0.08,1.0,0.0,0.0,0.25,1.0,0.9,0.5,0.78,0.13,0.0,0.0,0.33,0.98,0.02,0.01,0.04,0.16,0.49,0.0,0.02,0.17,0.02,0.0,0.07,0.01,51-60,21-30
2,318441,0.09,0.2,0.0,0.12,1.0,0.17,0.5,0.08,0.6,0.81,0.5,0.78,0.13,0.0,0.0,1.0,0.98,0.02,0.34,0.21,0.13,0.04,0.15,0.08,0.01,0.01,0.02,0.0,0.02,21-30,21-30
3,318442,0.09,0.2,0.5,0.12,1.0,0.0,0.0,0.42,1.0,0.16,0.5,0.78,0.13,0.0,0.0,0.33,0.99,0.01,0.52,0.19,0.16,0.04,0.03,0.05,0.0,0.0,0.0,0.0,0.01,21-30,21-30
4,318443,0.12,0.4,0.5,0.08,1.0,0.17,0.0,0.83,1.0,0.87,0.5,0.78,0.13,0.0,0.0,0.33,0.98,0.02,0.02,0.05,0.16,0.55,0.0,0.02,0.15,0.02,0.0,0.03,0.01,51-60,21-30


In [49]:
trainm.head()

Unnamed: 0,Visitors with Patient,Ward_Type,Type of Admission,Available Extra Rooms in Hospital,Severity of Illness,Hospital_type_code,Hospital_region_code,City_Code_Hospital,Ward_Facility_Code,Hospital_code,Department,Age,patientid,City_Code_Patient_Missing_Flag,Bed Grade_Missing_Flag,Stay,Bed Grade_bin,Linear Discriminant_0,Linear Discriminant_1
0,2,2,0,3,0,2,2,3,5,8,3,5,31397,0,0,4,1,0.99,0.01
1,2,3,1,2,0,2,2,5,5,2,3,5,31397,0,0,5,1,0.99,0.01
2,2,3,1,2,0,4,0,1,4,10,1,5,31397,0,0,2,1,0.99,0.01
3,2,2,1,2,0,1,1,2,3,26,3,5,31397,0,0,5,1,0.99,0.01
4,2,3,1,2,0,1,1,2,3,26,3,5,31397,0,0,5,1,0.99,0.01
