In [61]:
# 넘파이, 판다스, 맷플롯립
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 사이킷런
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 경고 무시
import warnings
warnings.filterwarnings(action='ignore')


In [62]:
#데이터 불러오고 타입 확인
df = pd.read_csv('data3.csv')
print(df.dtypes)

number_people             int64
date                     object
timestamp                 int64
day_of_week               int64
is_weekend                int64
is_holiday                int64
temperature             float64
is_start_of_semester      int64
is_during_semester        int64
month                     int64
hour                      int64
dtype: object


In [63]:
#데이터 가공
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].apply(lambda x: x.month)
df['day'] = df['date'].apply(lambda x: x.day)
df['hour'] = df['date'].apply(lambda x: x.hour)
df['minute'] = df['date'].apply(lambda x: x.minute)
df['date'] = pd.to_datetime(df['date'], utc=True).dt.strftime('%Y%m%d%H%M').astype(int)

#데이터 가공 후 타입 확인
print(df.dtypes)

number_people             int64
date                      int64
timestamp                 int64
day_of_week               int64
is_weekend                int64
is_holiday                int64
temperature             float64
is_start_of_semester      int64
is_during_semester        int64
month                     int64
hour                      int64
day                       int64
minute                    int64
dtype: object


In [64]:
# 데이터 설정
y = df['number_people']
X = df.drop(['number_people','timestamp'], axis=1)

# X 데이터 확인
print(X.dtypes)

date                      int64
day_of_week               int64
is_weekend                int64
is_holiday                int64
temperature             float64
is_start_of_semester      int64
is_during_semester        int64
month                     int64
hour                      int64
day                       int64
minute                    int64
dtype: object


In [65]:
# 훈련/테스트 세트 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

# 스탠다드 스케일링 하기
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [66]:
# 스케일링 결과 확인
X_train

Unnamed: 0,date,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,day,minute
82318,0.258722,-0.992230,-0.628552,-0.052026,-1.075253,-0.29224,0.721117,-1.289301,-0.335112,-0.897704,-1.659911
82014,0.258062,0.508831,-0.628552,-0.052026,-0.921568,-0.29224,0.721117,-1.289301,1.450621,-1.352491,-1.659911
79363,0.237339,1.009185,1.590959,-0.052026,-1.026426,-0.29224,0.721117,-1.579659,1.301810,-1.125098,-0.504984
6411,-1.705379,-1.492583,-0.628552,-0.052026,1.259639,-0.29224,0.721117,0.743208,-0.037490,0.352962,-0.504984
67506,-1.707709,0.008477,-0.628552,-0.052026,-0.791096,-0.29224,0.721117,0.743208,-0.335112,-0.897704,-0.504984
...,...,...,...,...,...,...,...,...,...,...,...
111848,0.410535,-0.491876,-0.628552,-0.052026,-0.924770,-0.29224,0.721117,0.743208,-0.483923,1.148841,0.187973
88425,0.301220,-1.492583,-0.628552,-0.052026,-1.021624,-0.29224,0.721117,-0.708584,-0.335112,-0.784007,0.187973
13384,-1.660773,-0.491876,-0.628552,-0.052026,0.268690,-0.29224,-1.386738,1.323924,-0.781545,1.603628,-1.659911
106268,0.387047,0.008477,-0.628552,-0.052026,-0.979200,-0.29224,0.721117,0.452849,-1.674411,-0.101825,-1.082447


In [67]:
# 모델 학습
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [68]:
y_pred = model.predict(X_test)

# 평균 제곱 오차(MSE)와 R-squared 값 계산
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean squared error:', mse)
print('R-squared:', r2)

Mean squared error: 16.50500207779873
R-squared: 0.967855381368118


In [69]:
#예측 테스트 해보기
test_predict = pd.DataFrame([[20210412, 4, 1, 1, 50.00, 0, 0, 5, 11, 12, 35]], columns=X_test.columns)
test_predict = scaler.transform(test_predict)
print(model.predict(test_predict))


[35.6]


In [70]:
import pickle

# 모델 저장
with open('RFR_GYM_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# StandardScaler 객체 저장
with open('RFR_GYM_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)