# 載入資料集 (第1小題)

In [1]:
import pandas as pd
# Read the insurance dataset from a CSV file given by relative path.
insurance = pd.read_csv('insurance.csv')
# Bare last expression: render the loaded frame as the cell output.
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## 檢查欄位 
無缺失值~

In [2]:
# Print each column's dtype and non-null count to check for missing values.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 資料前處理 (第2小題)
先看nominal欄位的值有哪些

再將 categorical 的資料 (smoker、sex、region) 以 one-hot encoding (pd.get_dummies) 轉成 0/1 的數值欄位 (numerical values)

另外將 bmi、age、charges 的值以 MinMaxScaler 縮放到 0~1 之間 (min-max 正規化；程式中此步驟先於 one-hot encoding 執行)

In [3]:
# Print how many rows fall into each category of every nominal column,
# framing each table with a line of asterisks.
separator = '*' * 30
print(separator)
for column in ('sex', 'smoker', 'region'):
    print(insurance.groupby([column]).size())
    print(separator)

******************************
sex
female    662
male      676
dtype: int64
******************************
smoker
no     1064
yes     274
dtype: int64
******************************
region
northeast    324
northwest    325
southeast    364
southwest    325
dtype: int64
******************************


### 數值欄位縮放 (MinMaxScaler) 與 categorical feature to numeric value (pd.get_dummies)

In [4]:
# Min-max scale the bmi, age and charges columns.
from sklearn import preprocessing

# One scaler object is re-fitted for every column, so after this cell it only
# retains the parameters of the last column fitted (charges).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed later in the notebook, so test-set statistics leak into the
# preprocessing.
minmax = preprocessing.MinMaxScaler()

# Scale each column on its own and write the result straight back.
data1_minmax = minmax.fit_transform(insurance[['bmi']])
insurance['bmi'] = data1_minmax

data2_minmax = minmax.fit_transform(insurance[['age']])
insurance['age'] = data2_minmax

data3_minmax = minmax.fit_transform(insurance[['charges']])
insurance['charges'] = data3_minmax

In [5]:
# One-hot encode the nominal columns into indicator columns.
# NOTE: this cell rebinds `insurance` and drops the source columns, so running
# it a second time without re-running the earlier cells fails on the column
# lookup below.
nominal_columns = ['sex', 'smoker', 'region']
dummyColumn = pd.get_dummies(insurance[nominal_columns])
dff = pd.DataFrame(dummyColumn)

# Append the indicator columns, show the intermediate schema (original nominal
# columns still present), then remove the original nominal columns.
insurance = pd.concat([insurance, dff], axis=1)
insurance.info()
insurance = insurance.drop(columns=nominal_columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   float64
 1   sex               1338 non-null   object 
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   object 
 5   region            1338 non-null   object 
 6   charges           1338 non-null   float64
 7   sex_female        1338 non-null   uint8  
 8   sex_male          1338 non-null   uint8  
 9   smoker_no         1338 non-null   uint8  
 10  smoker_yes        1338 non-null   uint8  
 11  region_northeast  1338 non-null   uint8  
 12  region_northwest  1338 non-null   uint8  
 13  region_southeast  1338 non-null   uint8  
 14  region_southwest  1338 non-null   uint8  
dtypes: float64(3), int64(1), object(3), uint8(8)
memory usage: 83.8+ KB


In [6]:
# The frame after all transformations (scaling and one-hot encoding) are done.
insurance

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.021739,0.321227,0,0.251611,1,0,0,1,0,0,0,1
1,0.000000,0.479150,1,0.009636,0,1,1,0,0,0,1,0
2,0.217391,0.458434,3,0.053115,0,1,1,0,0,0,1,0
3,0.326087,0.181464,0,0.333010,0,1,1,0,0,1,0,0
4,0.304348,0.347592,0,0.043816,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,0.695652,0.403820,3,0.151299,0,1,1,0,0,1,0,0
1334,0.000000,0.429379,0,0.017305,1,0,1,0,1,0,0,0
1335,0.000000,0.562012,0,0.008108,1,0,1,0,0,0,1,0
1336,0.065217,0.264730,0,0.014144,1,0,1,0,0,0,0,1


## 切割測試資料 (第3小題)

In [7]:
from sklearn.model_selection import train_test_split

# Target is the (scaled) charges column; every other column is a feature.
y = insurance['charges']
X = insurance.drop(columns=['charges'])

# Hold out 33% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=12,
)

## 建立、訓練模型 (第4小題)

### LinearRegression

In [8]:
# LinearRegression
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit the model on the training split.
model1 = LinearRegression()
model1.fit(X_train, y_train)

# Predict on the test split only.
y_test_pred1 = model1.predict(X_test)

# Compute every test metric once and reuse it for printing and the summary.
lr_mae = mean_absolute_error(y_test, y_test_pred1)
lr_mse = mean_squared_error(y_test, y_test_pred1)
lr_rmse = sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_test_pred1)

# Report the model's performance on the test data.
print("在測試資料上的預測評估:")
print("mean_absolute_error:", lr_mae)
print("mean_squared_error:", lr_mse)
print("rmse:", lr_rmse)
print("r2 score:", lr_r2)

# NOTE: despite its name, LogisticModel holds the LinearRegression metrics.
# The name and the 'RSME' label are kept because later cells reference this
# variable and concatenate it with tables that use the same index labels.
LogisticModel = pd.DataFrame(
    {'LinearRegression': [lr_r2, lr_rmse, lr_mae]},
    index=['R2', 'RSME', 'MAE'],
)

在測試資料上的預測評估:
mean_absolute_error: 0.06793259081863107
mean_squared_error: 0.010285685251808372
rmse: 0.10141836742823447
r2 score: 0.6998001477843854


### SVM (SVR)

In [9]:
from sklearn.svm import SVR  # support vector regression from the SVM module

# Build and fit an RBF-kernel SVR on the training split.
svr_rbf = SVR(C=1e3, kernel='rbf', gamma='auto')
svr_rbf.fit(X_train, y_train)

# Predict on the test split only.
svr_predict = svr_rbf.predict(X_test)

# Compute every test metric once and reuse it for printing and the summary.
svr_mae = mean_absolute_error(y_test, svr_predict)
svr_mse = mean_squared_error(y_test, svr_predict)
svr_rmse = sqrt(svr_mse)
svr_r2 = r2_score(y_test, svr_predict)

# Report the model's performance on the test data.
print("在測試資料上的預測評估:")
print("mean_absolute_error:", svr_mae)
print("mean_squared_error:", svr_mse)
print("rmse:", svr_rmse)
print("r2 score:", svr_r2)

# Summary column for the final comparison table; the 'RSME' label matches the
# index labels used by the other models' tables.
SVRModel = pd.DataFrame(
    {'SVM (SVR)': [svr_r2, svr_rmse, svr_mae]},
    index=['R2', 'RSME', 'MAE'],
)

在測試資料上的預測評估:
mean_absolute_error: 0.08814743159854656
mean_squared_error: 0.010471459779727106
rmse: 0.10233015088294899
r2 score: 0.6943780991350914


### Decision Tree (Regression)

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree. random_state is pinned (same seed as the
# train/test split) because scikit-learn permutes the candidate features at
# every split; with complementary indicator columns several splits tie, and
# without a fixed seed the tie could be broken differently from run to run.
model3 = DecisionTreeRegressor(max_depth=5, random_state=12)
# Fit the model on the training split.
model3.fit(X_train, y_train)
# Predict on the test split only.
y_test_pred3 = model3.predict(X_test)

# Compute every test metric once and reuse it for printing and the summary.
tree_mae = mean_absolute_error(y_test, y_test_pred3)
tree_mse = mean_squared_error(y_test, y_test_pred3)
tree_rmse = sqrt(tree_mse)
tree_r2 = r2_score(y_test, y_test_pred3)

# Report the model's performance on the test data.
print("在測試資料上的預測評估:")
print("mean_absolute_error:", tree_mae)
print("mean_squared_error:", tree_mse)
print("rmse:", tree_rmse)
print("r2 score:", tree_r2)

# Summary column for the final comparison table; the 'RSME' label matches the
# index labels used by the other models' tables.
Tree = pd.DataFrame(
    {'Decision Tree': [tree_r2, tree_rmse, tree_mae]},
    index=['R2', 'RSME', 'MAE'],
)

在測試資料上的預測評估:
mean_absolute_error: 0.04486304040983644
mean_squared_error: 0.0064929645517793864
rmse: 0.08057893367238975
r2 score: 0.810495173518682


## 評估結果 (第5小題)

In [11]:
# Display the decision tree's test metrics (R2, RMSE, MAE).
Tree

Unnamed: 0,Decision Tree
R2,0.810495
RSME,0.080579
MAE,0.044863


In [12]:
# Display the SVR model's test metrics (R2, RMSE, MAE).
SVRModel

Unnamed: 0,SVM (SVR)
R2,0.694378
RSME,0.10233
MAE,0.088147


In [13]:
# Display the LinearRegression test metrics; the variable is named
# LogisticModel but it holds linear (not logistic) regression results.
LogisticModel

Unnamed: 0,LinearRegression
R2,0.6998
RSME,0.101418
MAE,0.067933


In [14]:
# Put the three per-model metric tables side by side, one column per model,
# aligned on their shared metric index.
metric_tables = [LogisticModel, SVRModel, Tree]
final = pd.concat(metric_tables, axis='columns')
final

Unnamed: 0,LinearRegression,SVM (SVR),Decision Tree
R2,0.6998,0.694378,0.810495
RSME,0.101418,0.10233,0.080579
MAE,0.067933,0.088147,0.044863
