# 뉴럴 네트워크 선형 회귀 (자동차 연비 데이터셋)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras

## 1. 데이터 준비

In [None]:
# !wget https://raw.githubusercontent.com/devdio/datasets/refs/heads/main/auto-mpg.csv

--2025-01-02 05:46:09--  https://raw.githubusercontent.com/devdio/datasets/refs/heads/main/auto-mpg.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18131 (18K) [text/plain]
Saving to: ‘auto-mpg.csv.1’


2025-01-02 05:46:09 (55.7 MB/s) - ‘auto-mpg.csv.1’ saved [18131/18131]



In [None]:
# Load the auto-mpg CSV from the working directory and preview the first rows.
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


## 2. 정보 확인

In [None]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [None]:
# List the distinct horsepower values to see why the column was read as object.
df['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [None]:
# '?' is the placeholder used for missing horsepower; map it to NaN so pandas
# treats it as a missing value.
df['horsepower'] = df['horsepower'].replace({'?': np.nan})
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
mpg,0
cylinders,0
displacement,0
horsepower,6
weight,0
acceleration,0
model year,0
origin,0
car name,0


In [None]:
df['horsepower'] = df['horsepower'].astype('float64') # string -> float64

In [None]:
# Re-check dtypes and non-null counts after the horsepower conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 28.1+ KB


In [None]:
# Discard every row that has at least one missing value, then report the new size.
df = df.dropna(axis=0, how='any')
df.shape

(392, 9)

## 3. 데이터 분리

In [None]:
df.head() # y: mpg; car name is not needed, so it gets dropped

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [None]:
# Target is mpg; the features are everything except the target, the car name
# and the model year.
y = df['mpg']
x = df.drop(columns=['mpg', 'car name', 'model year'])

In [None]:
# Preview the feature frame.
x.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,origin
0,8,307.0,130.0,3504,12.0,1
1,8,350.0,165.0,3693,11.5,1
2,8,318.0,150.0,3436,11.0,1
3,8,304.0,150.0,3433,12.0,1
4,8,302.0,140.0,3449,10.5,1


In [None]:
# Preview the target series.
y.head()

Unnamed: 0,mpg
0,18.0
1,15.0
2,18.0
3,16.0
4,17.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four pieces, in the same order as they were unpacked.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((313, 6), (79, 6), (313,), (79,))

## 4. 데이터 전처리

### 스케일링

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the test features.
ss = StandardScaler()
x_train_s = ss.fit(x_train).transform(x_train)
x_test_s = ss.transform(x_test)

In [None]:
# Convert the target Series to NumPy arrays (the scaled features are already arrays).
y_train_n, y_test_n = y_train.to_numpy(), y_test.to_numpy()

In [None]:
# Confirm the training inputs and targets are both NumPy arrays.
type(x_train_s), type(y_train_n)

(numpy.ndarray, numpy.ndarray)

In [None]:
# Confirm the test inputs and targets are both NumPy arrays.
type(x_test_s), type(y_test_n)

(numpy.ndarray, numpy.ndarray)

## 5. 모델 만들기

In [None]:
x_train_s.shape # 313 samples, 6 features

(313, 6)

In [None]:
from keras import layers
import tensorflow as tf

# Small fully connected regressor: 6 input features -> 3 -> 2 -> 1 output.
model = keras.Sequential()
model.add(keras.layers.Input(shape=(6,)))  # shape = number of input features
model.add(keras.layers.Dense(units=3, activation='relu'))
model.add(keras.layers.Dense(units=2, activation='relu'))
# Output layer: regression uses no activation function on the final layer.
model.add(keras.layers.Dense(units=1))

model.summary()

In [None]:
# 모델 컴파일: 손실함수, 옵티마이저, 메트릭스
# Compile settings: Adam optimizer, with MSE as both the training loss and the
# reported metric.
model.compile(optimizer='adam', loss='mse', metrics=['mse'])

In [None]:
# Inputs and targets must have the same number of rows before fitting.
x_train_s.shape, y_train_n.shape

((313, 6), (313,))

In [None]:
# 학습
# Training.
# validation_split=0.2 holds out about 20% of the 313 rows for validation, so
# only about 250 rows are used for gradient updates. With a batch size of 32
# that is 8 batches per epoch (the progress bar shows 8/8), not ~10.
EPOCHS = 300
BATCHSIZE = 32
history = model.fit(x_train_s, y_train_n, epochs=EPOCHS, batch_size=BATCHSIZE, validation_split=0.2)

Epoch 1/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - loss: 581.8312 - mse: 581.8312 - val_loss: 684.0678 - val_mse: 684.0678
Epoch 2/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 567.7597 - mse: 567.7597 - val_loss: 682.7655 - val_mse: 682.7655
Epoch 3/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 604.1506 - mse: 604.1506 - val_loss: 681.4658 - val_mse: 681.4658
Epoch 4/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 598.1699 - mse: 598.1699 - val_loss: 680.1609 - val_mse: 680.1609
Epoch 5/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 603.9832 - mse: 603.9832 - val_loss: 678.8249 - val_mse: 678.8249
Epoch 6/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 624.9147 - mse: 624.9147 - val_loss: 677.3954 - val_mse: 677.3954
Epoch 7/300
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

## 6. 학습 곡선

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
hist = history.history   # dict mapping metric name -> list of per-epoch values
epochs = history.epoch   # epoch indices live on the History object, not on the dict

# One labelled line per curve so that plt.legend() has entries to show.
plt.plot(epochs, hist['loss'], label='train loss')
plt.plot(epochs, hist['val_loss'], label='validation loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

AttributeError: 'dict' object has no attribute 'history'

## 7. 테스트

In [None]:
# Predict on the scaled test features and show the first ten predictions.
y_pred = model.predict(x_test_s)
y_pred[:10]

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


array([[31.536757],
       [17.906155],
       [36.39005 ],
       [32.261173],
       [26.415829],
       [31.712997],
       [16.182041],
       [30.745953],
       [17.874447],
       [37.728046]], dtype=float32)

In [None]:
# Collapse the (n, 1) prediction array into a 1-D vector.
y_pred = y_pred.reshape(-1)
y_pred[:10]

array([31.536757, 17.906155, 36.39005 , 32.261173, 26.415829, 31.712997,
       16.182041, 30.745953, 17.874447, 37.728046], dtype=float32)

In [None]:
# First ten ground-truth mpg values from the test split.
y_test_n[:10]

array([26. , 21.6, 36.1, 26. , 27. , 28. , 13. , 26. , 19. , 29. ])

### 평가지표 (MSE)

In [None]:
from sklearn.metrics import mean_squared_error

# The raw predictions were 2-D while the targets are 1-D; the predictions were
# flattened beforehand so both arguments have matching dimensions.
mse = mean_squared_error(y_true=y_test_n, y_pred=y_pred)
mse

20.586421097929808

In [None]:
# Square root of the MSE (RMSE), which is in the same units as mpg.
np.sqrt(mse)

4.537226145777815