In [1]:
# Importing the essential libraries to get started

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Reading the CSV file and importing the data

In [2]:
# Load the training data; the relative path is resolved against the notebook's
# working directory.
TRAIN_CSV = 'train.csv'
df_train = pd.read_csv(TRAIN_CSV)

In [3]:
# Preview the first rows to sanity-check that the file was parsed as expected.
df_train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


## Checking for missing values in the data

In [4]:
# Fraction of missing entries per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
df_train.isna().mean()

case_id                              0.000000
Hospital_code                        0.000000
Hospital_type_code                   0.000000
City_Code_Hospital                   0.000000
Hospital_region_code                 0.000000
Available Extra Rooms in Hospital    0.000000
Department                           0.000000
Ward_Type                            0.000000
Ward_Facility_Code                   0.000000
Bed Grade                            0.000355
patientid                            0.000000
City_Code_Patient                    0.014232
Type of Admission                    0.000000
Severity of Illness                  0.000000
Visitors with Patient                0.000000
Age                                  0.000000
Admission_Deposit                    0.000000
Stay                                 0.000000
dtype: float64

In [5]:
# Only a tiny fraction of rows are missing 'Bed Grade' or 'City_Code_Patient'.
# Impute 'Bed Grade' first, using the most frequent grade among rows that share
# the same Department, Hospital_code and Ward_Type.

def _fill_with_group_mode(values):
    """Return `values` with NaNs replaced by its most frequent non-null value.

    A group that has no observed value at all is returned unchanged (its NaNs
    stay in place) instead of raising IndexError on an empty `value_counts()`.
    """
    counts = values.value_counts()
    if counts.empty:
        return values
    return values.fillna(counts.index[0])

# transform() returns a result aligned to df_train's own index, so the column
# assignment is safe. groupby(...).apply(...) can prepend the group keys to the
# index (pandas >= 2.0 defaults to group_keys=True), which breaks the assignment.
df_train['Bed Grade'] = (
    df_train
    .groupby(['Department', 'Hospital_code', 'Ward_Type'])['Bed Grade']
    .transform(_fill_with_group_mode)
)

In [6]:
# The missing values in City_Code_Patient will be filled with the column's mode,
# i.e. the first label of the frequency table (value_counts sorts by count,
# highest first).

city_code_counts = df_train['City_Code_Patient'].value_counts()
city_code_counts.index[0]

8.0

In [7]:
# Replace missing patient city codes with the column's most frequent value
# (8.0 for this data set), computed here rather than hard-coded.
null_idx = df_train['City_Code_Patient'].isnull()
city_code_mode = df_train['City_Code_Patient'].value_counts().index[0]
# One .loc assignment instead of chained indexing (df[col][mask] = ...), which
# may write to a temporary copy and never writes through under Copy-on-Write.
df_train.loc[null_idx, 'City_Code_Patient'] = city_code_mode

In [8]:
# Verify the imputation: every column should now report the full row count as
# non-null in the summary below.

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318438 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient                  3184

## Exploring the data

In [10]:
# Report the number of distinct values for each column of interest.
# The printed label is the column name itself, so one loop covers all of them.
columns_to_count = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Available Extra Rooms in Hospital',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'patientid',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
    'Visitors with Patient',
    'Age',
]
for column in columns_to_count:
    print(f'Number of unique {column} = ', df_train[column].nunique())

Number of unique Hospital_code =  32
Number of unique Hospital_type_code =  7
Number of unique City_Code_Hospital =  11
Number of unique Hospital_region_code =  3
Number of unique Available Extra Rooms in Hospital =  18
Number of unique Ward_Type =  6
Number of unique Ward_Facility_Code =  6
Number of unique Bed Grade =  4
Number of unique patientid =  92017
Number of unique City_Code_Patient =  37
Number of unique Type of Admission =  3
Number of unique Severity of Illness =  3
Number of unique Visitors with Patient =  28
Number of unique Age =  10


In [11]:
# List the column labels for reference; several contain spaces, so they need
# bracket access (df_train['Bed Grade']) rather than attribute access.
df_train.columns

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')

In [16]:
# Inspect the dtype of each column; object columns are collected as
# categorical variables in the next cell.
df_train.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [17]:
# Names of all object-dtype columns, treated as the categorical variables.
object_columns = df_train.select_dtypes(include='object').columns
categorical_vars = object_columns.tolist()
categorical_vars

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Stay']

In [None]:
# Categorical columns whose value counts are inspected in the cells below:
# Hospital_region_code
# Department
# Type of Admission
# Severity of Illness

In [5]:
# Frequency of each patient city code, most common first.
# NOTE(review): the saved output totals 313,906 (318,438 rows minus ~4,532 nulls),
# so it appears to predate the City_Code_Patient fill — re-run with Restart & Run All.
df_train['City_Code_Patient'].value_counts()

8.0     124011
2.0      38869
1.0      26377
7.0      23807
5.0      20079
4.0      15380
9.0      11795
15.0      8950
10.0      8174
6.0       6005
12.0      5647
3.0       3772
23.0      3698
14.0      2927
16.0      2254
13.0      1625
21.0      1602
20.0      1409
18.0      1404
19.0      1028
26.0      1023
25.0       798
27.0       771
11.0       658
28.0       521
22.0       405
24.0       360
30.0       133
29.0        98
33.0        78
31.0        59
37.0        57
32.0        52
34.0        46
35.0        16
36.0        12
38.0         6
Name: City_Code_Patient, dtype: int64

In [20]:
# Frequency of each ward facility code, most common first.
# NOTE(review): the saved counts total 313,793, not the 318,438 rows reported by
# df_train.info() — presumably produced from a different kernel state; re-run to confirm.
df_train['Ward_Facility_Code'].value_counts()

F    111006
E     54181
D     51178
C     35144
B     34727
A     27557
Name: Ward_Facility_Code, dtype: int64

In [18]:
# Frequency of each hospital region code, most common first.
# NOTE(review): the saved counts total 313,793, not the 318,438 rows reported by
# df_train.info() — presumably produced from a different kernel state; re-run to confirm.
df_train['Hospital_region_code'].value_counts()

X    130833
Y    121049
Z     61911
Name: Hospital_region_code, dtype: int64

In [6]:
# Frequency of each department, most common first. In the saved output
# gynecology accounts for 249,486 of 318,438 rows, so the column is heavily skewed.
df_train['Department'].value_counts()

gynecology            249486
anesthesia             29649
radiotherapy           28516
TB & Chest disease      9586
surgery                 1201
Name: Department, dtype: int64

In [7]:
# Frequency of each admission type, most common first.
df_train['Type of Admission'].value_counts()

Trauma       152261
Emergency    117676
Urgent        48501
Name: Type of Admission, dtype: int64

In [8]:
# Frequency of each illness severity level, most common first.
df_train['Severity of Illness'].value_counts()

Moderate    175843
Minor        85872
Extreme      56723
Name: Severity of Illness, dtype: int64

In [25]:
# Frequency of each hospital city code, most common first.
# NOTE(review): the saved counts total 313,793 rather than 318,438 rows, and three
# of them (54181, 51178, 35144) equal Ward_Facility_Code counts exactly — possibly
# overlapping features; verify with a crosstab after a clean re-run.
df_train['City_Code_Hospital'].value_counts()

1     54181
2     51178
6     46128
7     35144
3     31168
5     30743
9     25947
11    16947
4     13577
10     5188
13     3592
Name: City_Code_Hospital, dtype: int64