In [1]:
from sklearn import datasets, linear_model, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
data = pd.read_csv("C:/Users/Jintae/Desktop/python_regression/YEAR_DATA.csv", encoding="cp949")
data.head()

Unnamed: 0,일자,요일,시간대,업종,시도,시군구,읍면동,통화건수
0,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강남구,역삼동,5
1,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강동구,길동,5
2,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,번동,5
3,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강서구,가양동,5
4,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,미아동,5


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1455045 entries, 0 to 1455044
Data columns (total 8 columns):
일자      1455045 non-null int64
요일      1455045 non-null object
시간대     1455045 non-null int64
업종      1455045 non-null object
시도      1455045 non-null object
시군구     1455045 non-null object
읍면동     1455045 non-null object
통화건수    1455045 non-null int64
dtypes: int64(3), object(5)
memory usage: 88.8+ MB


In [4]:
# 계절 추가
date = list(data.일자)
season = list()

for x in date:
    month = int(x % 10000 / 100)
    if month in [3, 4, 5]:
        season.append('봄')
    elif month in [6, 7 ,8]:
        season.append('여름')
    elif month in [6, 7 ,8]:
        season.append('가을')
    else:
        season.append('겨울')
    
data['계절'] = season
data.head()

Unnamed: 0,일자,요일,시간대,업종,시도,시군구,읍면동,통화건수,계절
0,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강남구,역삼동,5,겨울
1,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강동구,길동,5,겨울
2,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,번동,5,겨울
3,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강서구,가양동,5,겨울
4,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,미아동,5,겨울


In [5]:
# 공휴일 추가
holiday_list = [20180101, 20180215, 20180216, 20180217, 20180301, 20180505, 20180522, 20180606, 20180815, 20180923, 20180924, 20180925, 20181003, 20181009, 20181225]

date = list(data.일자)
holiday = list()

for x in date:
    if x in holiday_list:
        holiday.append(1)
    else:
        holiday.append(0)
    
data['공휴일'] = holiday
data.head()

Unnamed: 0,일자,요일,시간대,업종,시도,시군구,읍면동,통화건수,계절,공휴일
0,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강남구,역삼동,5,겨울,1
1,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강동구,길동,5,겨울,1
2,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,번동,5,겨울,1
3,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강서구,가양동,5,겨울,1
4,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,미아동,5,겨울,1


In [6]:
# 일자 -> 월로 바꾸기

date = list(data.일자)
months = list()

for x in date:
    month = int(x % 10000 / 100)
    months.append(month)
    
data['월'] = months
data.head()

Unnamed: 0,일자,요일,시간대,업종,시도,시군구,읍면동,통화건수,계절,공휴일,월
0,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강남구,역삼동,5,겨울,1,1
1,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강동구,길동,5,겨울,1,1
2,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,번동,5,겨울,1,1
3,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강서구,가양동,5,겨울,1,1
4,20180101,월,0,음식점-족발/보쌈전문,서울특별시,강북구,미아동,5,겨울,1,1


In [7]:
data_simple = data.drop('일자', axis=1)
data_simple = data_simple.drop('시도', axis=1)
data_simple = data_simple.drop('시군구', axis=1)
data_simple = data_simple.drop('읍면동', axis=1)
data_simple.head()

Unnamed: 0,요일,시간대,업종,통화건수,계절,공휴일,월
0,월,0,음식점-족발/보쌈전문,5,겨울,1,1
1,월,0,음식점-족발/보쌈전문,5,겨울,1,1
2,월,0,음식점-족발/보쌈전문,5,겨울,1,1
3,월,0,음식점-족발/보쌈전문,5,겨울,1,1
4,월,0,음식점-족발/보쌈전문,5,겨울,1,1


In [8]:
data_c = data_simple[data_simple['업종'] == '치킨']
data_c.head()

Unnamed: 0,요일,시간대,업종,통화건수,계절,공휴일,월
68,월,0,치킨,14,겨울,1,1
69,월,0,치킨,5,겨울,1,1
70,월,0,치킨,5,겨울,1,1
71,월,0,치킨,33,겨울,1,1
72,월,0,치킨,5,겨울,1,1


In [9]:
# 다중공선성 확인을 위한 상관 계수 측정
data_corr = data_c.drop('업종', axis=1)
data_corr = data_corr.drop('통화건수', axis=1)
#data_corr.head()

data_corr = data_corr.assign(요일=data_corr.요일.astype('category').cat.codes)\
.assign(시간대=data_corr.시간대.astype('category').cat.codes)\
.assign(계절=data_corr.계절.astype('category').cat.codes)\
.assign(공휴일=data_corr.공휴일.astype('category').cat.codes)\
.assign(월=data_corr.월.astype('category').cat.codes)

data_corr.head()

Unnamed: 0,요일,시간대,계절,공휴일,월
68,3,0,0,1,0
69,3,0,0,1,0
70,3,0,0,1,0
71,3,0,0,1,0
72,3,0,0,1,0


In [10]:
data_corr.corr()

Unnamed: 0,요일,시간대,계절,공휴일,월
요일,1.0,-0.023423,-0.01221,0.052041,-0.001574
시간대,-0.023423,1.0,0.002235,-0.011755,-0.007014
계절,-0.01221,0.002235,1.0,-0.064264,-0.082935
공휴일,0.052041,-0.011755,-0.064264,1.0,-0.015796
월,-0.001574,-0.007014,-0.082935,-0.015796,1.0


In [11]:
data_dummy = pd.get_dummies(data=data_c, columns=['요일'], drop_first=True)
data_dummy = pd.get_dummies(data=data_dummy, columns=['시간대'], drop_first=True)
data_dummy = pd.get_dummies(data=data_dummy, columns=['계절'], drop_first=True)
#data_dummy = pd.get_dummies(data=data_dummy, columns=['공휴일'], drop_first=True)
data_dummy = pd.get_dummies(data=data_dummy, columns=['월'], drop_first=True)

data_dummy = data_dummy.drop('업종', axis=1)

data_dummy.head()

Unnamed: 0,통화건수,공휴일,요일_목,요일_수,요일_월,요일_일,요일_토,요일_화,시간대_1,시간대_2,...,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12
68,14,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
70,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71,33,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72,5,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# linear regresion
#features = data_dummy.drop('통화건수', axis=1)
features = data_corr
#print(features)

X = features.values
y = data_c['통화건수'].values

#print("X.shape : {}  y.shape : {}".format(X.shape,y.shape))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model1 = linear_model.LinearRegression()
model1.fit(X_train, y_train)
#print(model1.coef_, model1.intercept_)

print ("Model1 Training Mean squared error: %.2f" % np.mean((model1.predict(X_train) - y_train) ** 2))
print ("Model1 Test Mean squared error: %.2f" % np.mean((model1.predict(X_test) - y_test) ** 2))
print('training 모델 점수: {:.2f}'.format(model1.score(X_train, y_train)))
print('testing 점수: {:.2f}'.format(model1.score(X_test, y_test)))

Model1 Training Mean squared error: 287.61
Model1 Test Mean squared error: 294.17
training 모델 점수: 0.03
testing 점수: 0.03


In [None]:
features = data_corr
X=features.values.T
print(X)
y=data_c['통화건수'].values
print(y)
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression(fit_intercept = False))
model.fit(X,y)
#print(model.named_steps.linearregression.coef_)
#print(model.predict(X_predict))
print('training 모델 점수: {:.2f}'.format(model.score(X, y)))

[[ 3  3  3 ...  3  3  3]
 [ 0  0  0 ... 23 23 23]
 [ 0  0  0 ...  0  0  0]
 [ 1  1  1 ...  0  0  0]
 [ 0  0  0 ... 11 11 11]]
[14  5  5 ... 20  5  5]


In [None]:
# 인공 신경망

# data 준비
#X = data_dummy.drop('통화건수', axis=1).values
X = data_corr.values
y = data_c['통화건수'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

# data 정규화
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# compare training models
from sklearn.neural_network import MLPRegressor

"""
models = [
    linear_model.LinearRegression(), 
    MLPRegressor(hidden_layer_sizes=[512, 4], alpha=0.005, random_state=42),
    MLPRegressor(hidden_layer_sizes=[48, 4], max_iter=5000, alpha=0.005, random_state=42), 
    MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42), 
    MLPRegressor(hidden_layer_sizes=[1024, 4], max_iter=5000, alpha=0.005, random_state=42),
    MLPRegressor(hidden_layer_sizes=[1024, 512, 4], max_iter=5000, alpha=0.005, random_state=42),
]
"""

models = [MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42)]

from sklearn.metrics import r2_score
for m in models:
    m.fit(X_train, y_train)
    print(m.__class__)
    print('training 모델 점수: {:.2f}'.format(r2_score(y_train, m.predict(X_train))))
    print('testing 점수: {:.2f}'.format(r2_score(y_test, m.predict(X_test))))