In [3]:
from sklearn import datasets, preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
#from spark_sklearn.grid_search import GridSearchCV
#from spark_sklearn.util import createLocalSparkSession
import pandas as pd
import numpy as np
import time

In [7]:
# Read the raw call-count CSV and discard the '시도' column in one step,
# then preview the first rows.
data = pd.read_csv("./CALL_NDELIVERY_07MONTH.csv").drop('시도', axis=1)
data.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수
0,20180701,일,0,음식점-족발/보쌈전문,강남구,논현동,5
1,20180701,일,0,음식점-족발/보쌈전문,관악구,신림동,5
2,20180701,일,0,음식점-족발/보쌈전문,서대문구,창천동,5
3,20180701,일,0,음식점-족발/보쌈전문,서초구,반포동,5
4,20180701,일,0,음식점-족발/보쌈전문,송파구,송파동,5


In [8]:
# Print the column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124564 entries, 0 to 124563
Data columns (total 7 columns):
일자      124564 non-null int64
요일      124564 non-null object
시간대     124564 non-null int64
업종      124564 non-null object
시군구     124564 non-null object
읍면동     124564 non-null object
통화건수    124564 non-null int64
dtypes: int64(3), object(4)
memory usage: 6.7+ MB


In [9]:
# Add a season column ('계절') derived from the month of each row.
# '일자' is an integer of the form YYYYMMDD, so integer arithmetic extracts the month.
month_of_row = data['일자'] % 10000 // 100

# Month -> season lookup. The autumn months must be 9, 10 and 11; the previous
# version tested [6, 7, 8] a second time, so autumn was never assigned and
# September-November rows were labelled as winter.
season_by_month = {
    3: '봄', 4: '봄', 5: '봄',
    6: '여름', 7: '여름', 8: '여름',
    9: '가을', 10: '가을', 11: '가을',
}

# Months absent from the lookup (12, 1, 2) are winter.
data['계절'] = month_of_row.map(season_by_month).fillna('겨울')
data.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수,계절
0,20180701,일,0,음식점-족발/보쌈전문,강남구,논현동,5,여름
1,20180701,일,0,음식점-족발/보쌈전문,관악구,신림동,5,여름
2,20180701,일,0,음식점-족발/보쌈전문,서대문구,창천동,5,여름
3,20180701,일,0,음식점-족발/보쌈전문,서초구,반포동,5,여름
4,20180701,일,0,음식점-족발/보쌈전문,송파구,송파동,5,여름


In [10]:
# Add a public-holiday flag ('공휴일'): 1 when the date is in holiday_list, else 0.
holiday_list = [20180101, 20180215, 20180216, 20180217, 20180301, 20180505, 20180522, 20180606, 20180815, 20180923, 20180924, 20180925, 20181003, 20181009, 20181225]

# Vectorized membership test instead of a per-row Python loop.
data['공휴일'] = data['일자'].isin(holiday_list).astype('int64')
data.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수,계절,공휴일
0,20180701,일,0,음식점-족발/보쌈전문,강남구,논현동,5,여름,0
1,20180701,일,0,음식점-족발/보쌈전문,관악구,신림동,5,여름,0
2,20180701,일,0,음식점-족발/보쌈전문,서대문구,창천동,5,여름,0
3,20180701,일,0,음식점-족발/보쌈전문,서초구,반포동,5,여름,0
4,20180701,일,0,음식점-족발/보쌈전문,송파구,송파동,5,여름,0


In [11]:
# Derive the month ('월') from the date.
# '일자' is an integer YYYYMMDD: dropping the year (% 10000) and then the
# day (// 100) leaves the month number.
data['월'] = data['일자'] % 10000 // 100
data.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수,계절,공휴일,월
0,20180701,일,0,음식점-족발/보쌈전문,강남구,논현동,5,여름,0,7
1,20180701,일,0,음식점-족발/보쌈전문,관악구,신림동,5,여름,0,7
2,20180701,일,0,음식점-족발/보쌈전문,서대문구,창천동,5,여름,0,7
3,20180701,일,0,음식점-족발/보쌈전문,서초구,반포동,5,여름,0,7
4,20180701,일,0,음식점-족발/보쌈전문,송파구,송파동,5,여름,0,7


In [12]:
# Add a weekend flag ('주말'): 1 for Saturday ('토') or Sunday ('일'), else 0.
weekend_days = ['토', '일']
data['주말'] = data['요일'].isin(weekend_days).astype('int64')
data.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수,계절,공휴일,월,주말
0,20180701,일,0,음식점-족발/보쌈전문,강남구,논현동,5,여름,0,7,1
1,20180701,일,0,음식점-족발/보쌈전문,관악구,신림동,5,여름,0,7,1
2,20180701,일,0,음식점-족발/보쌈전문,서대문구,창천동,5,여름,0,7,1
3,20180701,일,0,음식점-족발/보쌈전문,서초구,반포동,5,여름,0,7,1
4,20180701,일,0,음식점-족발/보쌈전문,송파구,송파동,5,여름,0,7,1


In [13]:
# Keep only the rows whose business category ('업종') is chicken ('치킨').
is_chicken = data['업종'].eq('치킨')
data_c = data[is_chicken]
data_c.head()

Unnamed: 0,일자,요일,시간대,업종,시군구,읍면동,통화건수,계절,공휴일,월,주말
29,20180701,일,0,치킨,강남구,삼성동,8,여름,0,7,1
30,20180701,일,0,치킨,강남구,일원동,5,여름,0,7,1
31,20180701,일,0,치킨,강북구,미아동,15,여름,0,7,1
32,20180701,일,0,치킨,강북구,우이동,5,여름,0,7,1
33,20180701,일,0,치킨,강서구,가양동,10,여름,0,7,1


In [14]:
# Prepare a frame for a multicollinearity check via correlation coefficients.
# The constant category column and the target are removed; every remaining
# column is replaced by its integer category code.
coded_columns = ['일자', '요일', '시간대', '시군구', '읍면동', '계절', '공휴일', '월', '주말']

data_corr = data_c.drop(['업종', '통화건수'], axis=1)
data_corr = data_corr.assign(
    **{name: data_corr[name].astype('category').cat.codes for name in coded_columns}
)

data_corr.head()

Unnamed: 0,일자,요일,시간대,시군구,읍면동,계절,공휴일,월,주말
29,0,4,0,0,96,0,0,0,1
30,0,4,0,0,190,0,0,0,1
31,0,4,0,2,79,0,0,0,1
32,0,4,0,2,176,0,0,0,1
33,0,4,0,3,3,0,0,0,1


In [15]:
# Pairwise correlation between the category-coded feature columns.
# NOTE(review): the all-NaN rows/columns presumably come from features that hold
# a single value in this subset (zero variance) — confirm against the data.
data_corr.corr()

Unnamed: 0,일자,요일,시간대,시군구,읍면동,계절,공휴일,월,주말
일자,1.0,0.043151,-0.002815,0.004542,0.001498,,,,-0.005621
요일,0.043151,1.0,-0.020806,-0.001399,-0.003593,,,,0.440651
시간대,-0.002815,-0.020806,1.0,0.051264,0.022462,,,,-0.033153
시군구,0.004542,-0.001399,0.051264,1.0,0.225453,,,,-0.006921
읍면동,0.001498,-0.003593,0.022462,0.225453,1.0,,,,-0.007707
계절,,,,,,,,,
공휴일,,,,,,,,,
월,,,,,,,,,
주말,-0.005621,0.440651,-0.033153,-0.006921,-0.007707,,,,1.0


In [16]:
# Build the modelling frame: drop the columns that are not used as features,
# then one-hot encode the categorical columns. drop_first=True omits the first
# level of each column.
dropped_columns = ['일자', '요일', '업종']
categorical_columns = ['시간대', '시군구', '읍면동', '계절', '월', '주말']

data_dummy = pd.get_dummies(
    data=data_c.drop(dropped_columns, axis=1),
    columns=categorical_columns,
    drop_first=True,
)

data_dummy.head()

Unnamed: 0,통화건수,공휴일,시간대_1,시간대_2,시간대_3,시간대_4,시간대_5,시간대_6,시간대_7,시간대_8,...,읍면동_화양동,읍면동_황학동,읍면동_회기동,읍면동_회현동1가,읍면동_효창동,읍면동_후암동,읍면동_휘경동,읍면동_흑석동,읍면동_흥인동,주말_1
29,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
30,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
31,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
32,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
33,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# Separate the feature matrix from the target ('통화건수').
features = data_dummy.drop('통화건수', axis=1)
X = features.values
y = data_c['통화건수'].values

# Fix the seed so the train/test split, and every metric computed from it,
# is identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardize the data: fit the scaler on the training split only.
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

# Apply the fitted scaler to both splits.
# Re-running this cell without re-running the split would scale the data a second time.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



In [20]:
# Candidate regressors to compare on the same train/test split.
# Every stochastic model is seeded so the reported scores are reproducible.
# NOTE: criterion='mse' was renamed to 'squared_error' in newer scikit-learn
# releases; switch the spelling when running on such a version.
models = [
    LinearRegression(),
    MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42),
    RandomForestRegressor(n_estimators=100, random_state=0),
    GradientBoostingRegressor(n_estimators=100, max_depth=10, criterion='mse', random_state=0)
]

for m in models:
    m.fit(X_train, y_train)

    # Predict once per split and reuse the result for both metrics.
    train_pred = m.predict(X_train)
    test_pred = m.predict(X_test)

    print(m.__class__)
    print('Training Set Mean Squared Error: {:.2f}'.format(mean_squared_error(y_train, train_pred)))
    print('training Set R^2: {:.2f}'.format(r2_score(y_train, train_pred)))

    print('testing Set Mean Squared Error: {:.2f}'.format(mean_squared_error(y_test, test_pred)))
    print('testing Set R^2: {:.2f}'.format(r2_score(y_test, test_pred)))

    print()

<class 'sklearn.linear_model.base.LinearRegression'>
Training Set Mean Squared Error: 116.64
training Set R^2: 0.53
testing Set Mean Squared Error: 79216773631652521022849024.00
testing Set R^2: -304985720037848928747520.00

<class 'sklearn.neural_network.multilayer_perceptron.MLPRegressor'>
Training Set Mean Squared Error: 29.27
training Set R^2: 0.88
testing Set Mean Squared Error: 38.32
testing Set R^2: 0.85

<class 'sklearn.ensemble.forest.RandomForestRegressor'>
Training Set Mean Squared Error: 28.47
training Set R^2: 0.88
testing Set Mean Squared Error: 37.83
testing Set R^2: 0.85

<class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
Training Set Mean Squared Error: 33.99
training Set R^2: 0.86
testing Set Mean Squared Error: 42.71
testing Set R^2: 0.84

