# Medical Cost Dataset

__Developer:__ Sima Shirzadi

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from insurance.csv (path is relative to the working directory).
df = pd.read_csv('insurance.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Encode smoker as 1 for 'yes' and 0 for anything else.
# The builtin int replaces np.int, which was only an alias for it and was
# removed in NumPy 1.24 (AttributeError on current NumPy).
df['smoker'] = (df['smoker'] == 'yes').astype(int)

In [None]:
# Encode sex as 1 for 'male' and 0 for anything else.
# The builtin int replaces np.int, which was only an alias for it and was
# removed in NumPy 1.24 (AttributeError on current NumPy).
df['sex'] = (df['sex'] == 'male').astype(int)

In [None]:
# Check the frame after smoker and sex were converted to 0/1 flags.
df.head()

In [None]:
# List the distinct values present in the region column.
df['region'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each region label to an integer code; `le` keeps the label <-> code
# mapping (le.classes_) for later reference.
le = LabelEncoder()
df['region'] = le.fit_transform(df['region'])

In [None]:
# One-hot encode the region codes into one indicator column per code.
# dtype=int keeps the indicators numeric on every pandas version: pandas >= 2.0
# defaults to bool, which describe() leaves out of its numeric summary.
dummy = pd.get_dummies(df['region'], dtype=int)
# Append the indicator columns and drop the original region column.
df = pd.concat([df, dummy], axis=1)
df.drop(columns='region', inplace=True)

In [None]:
# Give the integer-named indicator columns string names so that every
# column label in the frame is a string.
df.rename(
    columns={0: 'zero', 1: 'one', 2: 'two', 3: 'three'},
    inplace=True,
)

In [None]:
# Show the column labels after encoding (DataFrame.keys() is an alias for .columns).
df.columns

In [None]:
# Preview the frame after region was one-hot encoded and its columns renamed.
df.head()

In [None]:
# Summary statistics of the numeric columns after encoding.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
sns.heatmap(data=df.corr(), annot=True)
plt.show()

In [None]:
# Pairwise scatter plots of every column, with histograms on the diagonal.
# scatter_matrix creates its own figure, so the size has to be passed to it
# directly; a preceding plt.figure(figsize=...) only produces an extra, empty
# figure and leaves the matrix at the default size.
pd.plotting.scatter_matrix(df, figsize=(20, 20))
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('charges', axis=1),
    df['charges'],
    train_size=0.8,
    random_state=0,
)

In [None]:
# Display the training features (every column except charges).
X_train

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features using statistics learned from the training split
# only, so nothing about the test split leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# transform() returns a bare ndarray. Re-wrap the result in a DataFrame so the
# column names survive: estimators fitted on it then expose feature_names_in_,
# which later cells of this notebook read.
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardized features; fit() returns the
# estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
# R^2 on the held-out split.
reg.score(X=X_test, y=y_test)

In [None]:
# Fitted linear coefficients, one per feature. The features were standardized
# before fitting, so each value is the change in charges per one standard
# deviation of that feature.
reg.coef_

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the held-out split.
preds = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out split.
preds = reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap and feature sampling so the scores are
# repeatable, matching the seeded GradientBoostingRegressor in this notebook.
# n_jobs=-1 builds the 5000 trees on all available cores; it affects only the
# run time, not the fitted model.
regr = RandomForestRegressor(n_estimators=5000, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)

# R^2 on the held-out split.
regr.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the random forest on the held-out split.
preds = regr.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the random forest on the held-out split.
preds = regr.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
# Relative importance of each feature in the fitted random forest, in the
# same order as the training columns.
regr.feature_importances_

In [None]:
# Column names seen during fit, to pair with feature_importances_.
# NOTE: scikit-learn only sets feature_names_in_ when the estimator was fitted
# on input that carries string column names (e.g. a DataFrame); after fitting
# on a plain NumPy array this lookup raises AttributeError.
regr.feature_names_in_

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyper-parameters and a fixed seed;
# fit() returns the estimator itself, so construction and fitting are chained.
gb_reg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
# R^2 on the held-out split.
gb_reg.score(X=X_test, y=y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the gradient-boosting model on the held-out split.
preds = gb_reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the gradient-boosting model on the held-out split.
preds = gb_reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
# Relative importance of each feature in the fitted gradient-boosting model,
# in the same order as the training columns.
gb_reg.feature_importances_

In [None]:
# Column names seen during fit, to pair with feature_importances_.
# NOTE: scikit-learn only sets feature_names_in_ when the estimator was fitted
# on input that carries string column names (e.g. a DataFrame); after fitting
# on a plain NumPy array this lookup raises AttributeError.
gb_reg.feature_names_in_

In [None]:
# Simple filter-style feature selection: keep the columns whose absolute
# correlation with charges exceeds 0.05.
# NOTE: the correlation is computed on the full dataset, i.e. before the
# train/test split that follows.
corr = df.corr()

cor_target = corr["charges"].abs()

relevant_features = cor_target[cor_target > 0.05]

# Series.iteritems() was removed in pandas 2.0; the index already holds the
# column names. charges correlates perfectly with itself, so exclude it here.
names = [name for name in relevant_features.index if name != 'charges']

# Trailing expression so the notebook actually displays the selected features.
names

In [None]:
# Feature matrix restricted to the selected columns, and the charges target.
X, y = df[names], df['charges']


In [None]:
# Display the selected feature columns.
X

In [None]:
# Display the target column (charges).
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the selected features; fit() returns the
# estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
# R^2 on the held-out split.
reg.score(X=X_test, y=y_test)

In [None]:
# Fitted linear coefficients, one per selected feature, in training-column order.
reg.coef_

In [None]:
# Column names seen during fit, in the same order as reg.coef_.
reg.feature_names_in_

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the held-out split.
preds = reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out split.
preds = reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap and feature sampling so the scores are
# repeatable, matching the seeded GradientBoostingRegressor in this notebook.
# n_jobs=-1 builds the 5000 trees on all available cores; it affects only the
# run time, not the fitted model.
regr = RandomForestRegressor(n_estimators=5000, random_state=0, n_jobs=-1)
regr.fit(X_train, y_train)

# R^2 on the held-out split.
regr.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the random forest on the held-out split.
preds = regr.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the random forest on the held-out split.
preds = regr.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
# Relative importance of each selected feature in the fitted random forest,
# in the same order as the training columns.
regr.feature_importances_

In [None]:
# Column names seen during fit, to pair with regr.feature_importances_.
regr.feature_names_in_

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyper-parameters and a fixed seed;
# fit() returns the estimator itself, so construction and fitting are chained.
gb_reg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
# R^2 on the held-out split.
gb_reg.score(X=X_test, y=y_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the gradient-boosting model on the held-out split.
preds = gb_reg.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=preds)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the gradient-boosting model on the held-out split.
preds = gb_reg.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

In [None]:
# Relative importance of each selected feature in the fitted gradient-boosting
# model, in the same order as the training columns.
gb_reg.feature_importances_

In [None]:
# Column names seen during fit, to pair with gb_reg.feature_importances_.
gb_reg.feature_names_in_