In [1]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style

In [2]:
# Attach Google Drive to the Colab runtime so the dataset stored there can be
# read like a local file. This cell only works inside a Colab session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# Load the insurance dataset from the mounted Drive folder. `df_org` is the
# raw frame as read from disk.
INSURANCE_CSV = "/content/drive/MyDrive/GOAI/Project/insurance/insurance.csv"
df_org = pd.read_csv(INSURANCE_CSV)
df_org  # bare last expression -> rich display of the frame

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


- age : 나이 (18 ~ 64)
- sex : 성별 ('female', 'male')
- bmi : BMI 지수 (15.96 ~ 53.13)
- children : 부양자녀수 (0, 1, 3, 2, 5, 4)
- smoker : 흡연여부 ('yes', 'no')
- region : 사는 지역 ('southwest', 'southeast', 'northwest', 'northeast')
- charges : 건강 보험에서 청구하는 개인의료비용 (1121.8739 ~ 63770.42801)

# 딥러닝 (원핫인코딩)

In [4]:
# One-hot encode the categorical columns of the raw frame; numeric columns
# pass through unchanged.
# dtype=int pins the indicator columns to 0/1 integers. Since pandas 2.0 the
# default indicator dtype is bool, which would display True/False here instead
# of the 0/1 values this notebook shows.
df = pd.get_dummies(df_org, dtype=int)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features / target: 'charges' is the regression target, every other column
# is an input feature.
X = df.drop(columns='charges')
y = df['charges']
print(X.shape, y.shape)

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Standardisation (optional): the scaler learns mean/std from the training
# rows only and the same statistics are then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# StandardScaler
# - rescales every feature to mean 0 and standard deviation 1
# - centres the data on the mean and scales by each feature's spread
# - it is a linear rescaling: the shape of a feature's distribution is kept
# - a common choice for models that work with means and standard deviations

# MinMaxScaler
# - rescales every feature so its minimum is 0 and its maximum is 1
# - squeezes the data into a fixed range
# - a common choice when features are not normally distributed, or when the
#   model cares about the minimum and maximum of the data

(1338, 11) (1338,)
(1070, 11) (1070,)
(268, 11) (268,)
(1070, 11) (1070,)
(268, 11) (268,)


In [6]:
# Keras: a good fit for early prototyping, fast development and simple models.
# PyTorch: a good fit for research-style work where complex models are built
# and many experiments are run.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import set_random_seed
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Seed Python's `random`, NumPy and TensorFlow before any layer is created.
# Without this the initial weights (and the batch shuffling during training)
# differ on every run, so the reported metrics cannot be reproduced.
set_random_seed(42)

# Model definition: a fully connected network for regression.
model = Sequential()  # empty Sequential container; layers are appended below
# The input width is taken from the training matrix (one unit per feature).
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(100, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))  # regression -> a single linear output unit

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1536      
                                                                 
 dense_1 (Dense)             (None, 100)               12900     
                                                                 
 dense_2 (Dense)             (None, 80)                8080      
                                                                 
 dense_3 (Dense)             (None, 64)                5184      
                                                                 
 dense_4 (Dense)             (None, 1)                 65        
                                                                 
Total params: 27765 (108.46 KB)
Trainable params: 27765 (108.46 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Training configuration: mean-squared-error loss, Adam optimiser.
loss = 'mse'
optimizer = 'adam'
model.compile(optimizer=optimizer, loss=loss)

# Train for 100 epochs with mini-batches of 32 rows.
# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation loss is measured on the same rows used for the final
# scores below.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=100,
)

# Predictions on both splits; only the test predictions are scored here.
model_training_predictions = model.predict(X_train)
model_test_predictions = model.predict(X_test)

# Test-set metrics, each rounded to two decimals for display.
test_r2 = r2_score(y_test, model_test_predictions)
test_mse = mean_squared_error(y_test, model_test_predictions)
test_mape = mean_absolute_percentage_error(y_test, model_test_predictions)

print(f"R2 Score: {round(test_r2, 2)}")
print(f"MSE: {round(test_mse, 2)}")
print(f"MAPE: {round(test_mape, 2)}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# 딥러닝 (label encoder)

- smoker : no=0, yes=1
- sex : female=0, male=1
- region : 'northeast'=0, 'northwest'=1, 'southeast'=2, 'southwest'=3

In [8]:
# Integer codes produced below (LabelEncoder numbers the sorted class labels):
#   sex:    female=0, male=1
#   smoker: no=0, yes=1
#   region: northeast=0, northwest=1, southeast=2, southwest=3

from sklearn.preprocessing import LabelEncoder

# Encode into a copy so the raw frame stays untouched.
df = df_org.copy()

# One fitted encoder per encoded column, keyed by column name, so every
# mapping can be inspected (`.classes_`) or undone (`.inverse_transform`).
# A single shared encoder would only remember the last column it was fit on.
label_encoders = {}

le = LabelEncoder()  # stays bound even if no column needs encoding
for column in df_org.columns:
    # is_string_dtype covers both classic object-dtype string columns and
    # pandas' dedicated string dtypes; a bare `dtype == object` check would
    # miss the latter.
    if pd.api.types.is_string_dtype(df_org[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df_org[column])
        label_encoders[column] = le

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features / target from the label-encoded frame: 'charges' is the regression
# target, every other column is an input feature.
X = df.drop('charges',axis = 1)
y = df['charges']


# 80/20 train/test split with a fixed seed so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import set_random_seed
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

# Seed Python's `random`, NumPy and TensorFlow before the network is built.
# Without a seed the initial weights and batch order change on every run, so
# a difference in scores between encodings could come from the random
# initialisation rather than from the encoding itself.
set_random_seed(42)

# Fully connected regression network; the input width follows the number of
# feature columns in the training matrix.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(100, activation='relu'))
model.add(Dense(80, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))  # regression -> a single linear output unit

# Mean-squared-error loss, Adam optimiser.
loss = 'mse'
optimizer = 'adam'
model.compile(loss=loss, optimizer=optimizer)

# NOTE(review): the test split is also passed as validation data, so the
# per-epoch validation loss is measured on the same rows used for the final
# scores below.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test))

# Predictions on both splits; only the test predictions are scored here.
model_training_predictions = model.predict(X_train)
model_test_predictions = model.predict(X_test)

# Test-set metrics, each rounded to two decimals for display.
print(f"R2 Score: {round(r2_score(y_test, model_test_predictions), 2)}")
print(f"MSE: {round(mean_squared_error(y_test, model_test_predictions), 2)}")
print(f"MAPE: {round(mean_absolute_percentage_error(y_test, model_test_predictions), 2)}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78