# 載入資料集 (第1小題)

In [1]:
import pandas as pd
insurance = pd.read_csv('insurance.csv')
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## 檢查欄位 
無缺失值~

In [2]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 資料前處理 (第2小題)
先看nominal欄位的值有哪些

再將categorical的資料如: smoker、sex、region重新編碼成0、1、2...(numerical values)

最後是 bmi 值的標準化

In [3]:
print('*'*30)
print(insurance.groupby(['sex']).size())
print('*'*30)
print(insurance.groupby(['smoker']).size())
print('*'*30)
print(insurance.groupby(['region']).size())
print('*'*30)

******************************
sex
female    662
male      676
dtype: int64
******************************
smoker
no     1064
yes     274
dtype: int64
******************************
region
northeast    324
northwest    325
southeast    364
southwest    325
dtype: int64
******************************


categorical feature to numeric value

sex : male = 1 female=0

smoker : yes = 1 no = 0

region: northeast = 0 northwest = 1 southeast = 2 sourthwest = 3

In [4]:
# Transoform Function
def sexChange(x):
    if x == 'male':
        return 1
    else:
        return 0

def smokerChange(x):
    if x == 'yes':
        return 1
    else:
        return 0

def regionChange(x):
    if x == 'northeast':
        return 0
    elif x == 'northwest':
        return 1
    elif x == 'southeast':
        return 2
    elif x == 'southwest':
        return 3


insurance['sex'] = insurance['sex'].apply(sexChange)
insurance['smoker'] = insurance['smoker'].apply(smokerChange)
insurance['region'] = insurance['region'].apply(regionChange)

In [5]:
# bmi 的資料標準化
from sklearn import preprocessing
#建立MinMaxScaler物件
minmax = preprocessing.MinMaxScaler()
# 資料標準化
data_minmax = minmax.fit_transform(insurance[['bmi']])
insurance['bmi'] = data_minmax

In [6]:
# 資料轉換完畢後
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,0.321227,0,1,3,16884.92400
1,18,1,0.479150,1,0,2,1725.55230
2,28,1,0.458434,3,0,2,4449.46200
3,33,1,0.181464,0,0,1,21984.47061
4,32,1,0.347592,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,0.403820,3,0,1,10600.54830
1334,18,0,0.429379,0,0,0,2205.98080
1335,18,0,0.562012,0,0,2,1629.83350
1336,21,0,0.264730,0,0,3,2007.94500


## 切割測試資料 (第3小題)

In [7]:
from sklearn.model_selection import train_test_split
X = insurance[['age','sex','bmi','children','smoker','region']]
y = insurance['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=12)

## 建立、訓練模型、用三種指標評估模型 (第4、5小題)

### LinearRegression

In [8]:
# LinearRegression 
from sklearn.linear_model import LinearRegression 
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
model1 = LinearRegression()
#模型訓練
model1.fit(X_train, y_train)
#分別在訓練資料和測試資料上做預測
y_train_pred1 = model1.predict(X_train)
y_test_pred1 = model1.predict(X_test)

#評估模型在訓練資料及測試資料上的表現
print("在訓練資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_train,y_train_pred1))
print("mean_squared_error:",mean_squared_error(y_train,y_train_pred1))
print("rmse:",sqrt(mean_squared_error(y_train,y_train_pred1)))
print("r2 score:",r2_score(y_train,y_train_pred1))
print('*'*35)
print("在測試資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_test,y_test_pred1))
print("mean_squared_error:",mean_squared_error(y_test,y_test_pred1))
print("rmse:",sqrt(mean_squared_error(y_test,y_test_pred1)))
print("r2 score:",r2_score(y_test,y_test_pred1))

在訓練資料上的預測評估:
mean_absolute_error: 4156.668241912032
mean_squared_error: 34822155.39877064
rmse: 5901.030028628107
r2 score: 0.7713597869318137
***********************************
在測試資料上的預測評估:
mean_absolute_error: 4247.025567444527
mean_squared_error: 40297716.509451024
rmse: 6348.048244102358
r2 score: 0.7003353070884335


### SVM (SVC)

In [9]:
from sklearn.svm import SVC
model2 = SVC(kernel='linear',probability=True)
#模型訓練
model2.fit(X_train.astype('int'), y_train.astype('int'))
#分別在訓練資料和測試資料上做預測
y_train_pred2 = model2.predict(X_train)
y_test_pred2 = model2.predict(X_test)

#評估模型在訓練資料及測試資料上的表現
print("在訓練資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_train,y_train_pred2))
print("mean_squared_error:",mean_squared_error(y_train,y_train_pred2))
print("rmse:",sqrt(mean_squared_error(y_train,y_train_pred2)))
print("r2 score:",r2_score(y_train,y_train_pred2))
print('*'*35)
print("在測試資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_test,y_test_pred2))
print("mean_squared_error:",mean_squared_error(y_test,y_test_pred2))
print("rmse:",sqrt(mean_squared_error(y_test,y_test_pred2)))
print("r2 score:",r2_score(y_test,y_test_pred2))

在訓練資料上的預測評估:
mean_absolute_error: 978.1157828560268
mean_squared_error: 16911926.56620546
rmse: 4112.411283688131
r2 score: 0.8889572902880318
***********************************
在測試資料上的預測評估:
mean_absolute_error: 5933.818158190045
mean_squared_error: 125761807.8871615
rmse: 11214.357221310613
r2 score: 0.06480126406986175


In [10]:
### SVM (SVR)



### Decision Tree (Regression)

In [11]:
from sklearn.tree import DecisionTreeRegressor

model3 = DecisionTreeRegressor(max_depth=5)
#模型訓練
model3.fit(X_train, y_train)
#分別在訓練資料和測試資料上做預測
y_train_pred3 = model3.predict(X_train)
y_test_pred3 = model3.predict(X_test)

#評估模型在訓練資料及測試資料上的表現
print("在訓練資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_train,y_train_pred3))
print("mean_squared_error:",mean_squared_error(y_train,y_train_pred3))
print("rmse:",sqrt(mean_squared_error(y_train,y_train_pred3)))
print("r2 score:",r2_score(y_train,y_train_pred3))
print('*'*35)
print("在測試資料上的預測評估:")
print("mean_absolute_error:",mean_absolute_error(y_test,y_test_pred3))
print("mean_squared_error:",mean_squared_error(y_test,y_test_pred3))
print("rmse:",sqrt(mean_squared_error(y_test,y_test_pred3)))
print("r2 score:",r2_score(y_test,y_test_pred3))

在訓練資料上的預測評估:
mean_absolute_error: 2234.693334810761
mean_squared_error: 15537615.488380183
rmse: 3941.778214001922
r2 score: 0.8979809355523061
***********************************
在測試資料上的預測評估:
mean_absolute_error: 2892.796367154971
mean_squared_error: 26000704.74162484
rmse: 5099.088618726374
r2 score: 0.8066517441489277


## 評估結果

### 3種模型在使用訓練資料進行預測時，表現都會比較好。