In [88]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [89]:
data = pd.read_csv('D:/Music/CSV/Healthcare_Investments_and_Hospital_Stay (1).csv')
data

Unnamed: 0,Location,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
0,AUS,1992,6.6,1.43,16.71,1.43
1,AUS,1994,6.4,2.36,18.48,2.36
2,AUS,1995,6.5,2.89,20.55,2.89
3,AUS,1996,6.4,2.96,21.95,2.96
4,AUS,1997,6.2,3.53,23.34,3.53
...,...,...,...,...,...,...
513,LTU,2014,6.8,10.57,22.17,10.57
514,LTU,2015,6.6,11.02,21.00,11.02
515,LTU,2016,6.6,12.20,23.01,12.20
516,LTU,2017,6.5,12.37,23.33,12.37


In [90]:
data.isna().sum()

Location         0
Time             0
Hospital_Stay    0
MRI_Units        0
CT_Scanners      0
Hospital_Beds    0
dtype: int64

In [91]:
data.describe()

Unnamed: 0,Time,Hospital_Stay,MRI_Units,CT_Scanners,Hospital_Beds
count,518.0,518.0,518.0,518.0,518.0
mean,2007.967181,7.140154,10.565502,19.646718,10.565502
std,6.94416,2.566864,8.68557,14.352069,8.68557
min,1990.0,3.4,0.1,1.48,0.1
25%,2003.25,5.8,4.0725,10.3325,4.0725
50%,2009.0,6.65,8.765,15.375,8.765
75%,2014.0,7.5,13.8775,26.5925,13.8775
max,2018.0,32.7,55.21,111.49,55.21


In [92]:
data['Location'].value_counts()

HUN    29
FIN    28
RUS    25
AUS    23
AUT    23
ITA    22
FRA    21
CAN    20
LTU    19
ISR    19
CZE    19
KOR    18
DEU    18
NLD    18
LUX    17
TUR    17
LVA    16
SVK    16
BEL    16
POL    14
EST    14
IRL    13
USA    13
GBR    12
SVN    12
ISL    12
NZL    11
GRC    10
ESP     9
JPN     7
DNK     4
PRT     3
Name: Location, dtype: int64

In [93]:
def preprocess_inputs(df):
    df = df.copy()
    
    #one hot encoding 'Location'
    dummy = pd.get_dummies(df['Location'])
    df = pd.concat([df, dummy], axis = 1)
    df.drop('Location', axis = 1, inplace = True)
    
    #spliting data
    X = df.drop('Hospital_Stay', axis = 1).copy()
    Y = df['Hospital_Stay'].copy()
    
    #train & test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.7, random_state = 1)
    
    #scaling data
    scaler = StandardScaler()
    scaler.fit(X)
    
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
    
    return X_train, Y_train, X_test, Y_test

In [94]:
X_train, Y_train, X_test, Y_test = preprocess_inputs(data)
X_train

Unnamed: 0,Time,MRI_Units,CT_Scanners,Hospital_Beds,AUS,AUT,BEL,CAN,CZE,DEU,...,LVA,NLD,NZL,POL,PRT,RUS,SVK,SVN,TUR,USA
0,-1.148430,-1.014788,-0.974091,-1.014788,-0.215557,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
1,-1.725011,-0.348673,0.321050,-0.348673,-0.215557,4.639153,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
2,-0.283560,-0.654072,2.585628,-0.654072,4.639153,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
3,1.446182,0.123830,0.295942,0.123830,-0.215557,-0.215557,5.601339,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
4,1.446182,0.764591,-0.036735,0.764591,-0.215557,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357,1.013746,1.768373,0.131347,1.768373,-0.215557,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
358,0.148876,-0.476595,-0.597475,-0.476595,-0.215557,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
359,-0.427705,-0.556114,-0.563300,-0.556114,-0.215557,-0.215557,-0.178529,4.989990,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445
360,0.004731,1.094191,0.789030,1.094191,-0.215557,-0.215557,-0.178529,-0.200401,-0.195131,-0.189737,...,-0.178529,-0.189737,-0.147296,-0.166667,-0.076323,-0.225189,-0.178529,-0.153998,-0.184207,-0.160445


In [95]:
Y

0      6.6
1      6.4
2      6.5
3      6.4
4      6.2
      ... 
513    6.8
514    6.6
515    6.6
516    6.5
517    6.5
Name: Hospital_Stay, Length: 518, dtype: float64

In [96]:
print(X.shape, X_train.shape, X_test.shape)

(518, 36) (362, 36) (156, 36)


In [97]:
models = {
    "Linear Regression": LinearRegression(),
    "K-Nearest Neighbors": KNeighborsRegressor(),
    "Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "Support Vector Machine (RBF Kernel)": SVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "XGBoost": XGBRegressor(),
    "LightGBM": LGBMRegressor()
}

for name, model in models.items():
    model.fit(X_train, Y_train)
    print(name + " trained!")

Linear Regression trained!
K-Nearest Neighbors trained!
Neural Network trained!
Support Vector Machine (Linear Kernel) trained!
Support Vector Machine (RBF Kernel) trained!
Decision Tree trained!
Random Forest trained!
Gradient Boosting trained!
XGBoost trained!
LightGBM trained!


In [98]:
for name, model in models.items():
    pred = model.score(X_train, Y_train)
    print(name + ' R2 SCore:  {:.2f}%'.format(pred * 100))

Linear Regression R2 SCore:  88.12%
K-Nearest Neighbors R2 SCore:  92.01%
Neural Network R2 SCore:  90.34%
Support Vector Machine (Linear Kernel) R2 SCore:  84.28%
Support Vector Machine (RBF Kernel) R2 SCore:  66.36%
Decision Tree R2 SCore:  100.00%
Random Forest R2 SCore:  96.59%
Gradient Boosting R2 SCore:  97.18%
XGBoost R2 SCore:  99.98%
LightGBM R2 SCore:  64.73%
