In [39]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the example dataset from its path relative to this notebook.
targetData = pd.read_csv(
    filepath_or_buffer="../../Python_Script/dataset/feature_regression_example.csv"
)

In [3]:
# Preview the first rows of the freshly loaded frame.
targetData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


In [4]:
# List each column's dtype to see which ones are non-numeric (object).
targetData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [5]:
# Label encoder used below to turn the HOLIDAY and PROMOTION flag columns into integer codes.
ynLabelEn = LabelEncoder()

In [6]:
# Encode the HOLIDAY flag as integer codes in a new HO_YN column.
holidayFlags = targetData["HOLIDAY"]
targetData["HO_YN"] = ynLabelEn.fit(holidayFlags).transform(holidayFlags)

In [7]:
# Encode the PROMOTION flag as integer codes in a new PRO_YN column.
# The shared encoder is re-fitted here, so afterwards it reflects PROMOTION.
promotionFlags = targetData["PROMOTION"]
targetData["PRO_YN"] = ynLabelEn.fit(promotionFlags).transform(promotionFlags)

In [8]:
# Preview the frame again to check the new HO_YN / PRO_YN columns.
targetData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1


In [9]:
# Cast the calendar, cluster and encoded-flag columns to the platform integer type.
for intColumn in ("YEARWEEK", "YEAR", "WEEK", "HO_YN", "PRO_YN", "HCLUS"):
    targetData[intColumn] = targetData[intColumn].astype(int)

In [10]:
# Re-check the dtypes after the integer casts.
targetData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int32
YEAR              int32
WEEK              int32
QTY               int64
HOLIDAY          object
HCLUS             int32
PROMOTION        object
PRO_PERCENT     float64
HO_YN             int32
PRO_YN            int32
dtype: object

In [11]:
# Pairwise correlations between the numeric columns only.
# targetData still holds object columns (REGIONID, HOLIDAY, PROMOTION, ...).
# pandas >= 2.0 no longer drops them silently, so a bare corr() raises on the
# string values. Selecting the numeric columns explicitly works on old and new
# pandas alike and yields the same matrix the original produced.
corrData = targetData.select_dtypes(include="number").corr()

In [12]:
# Display the correlation matrix; the QTY row/column drives feature selection below.
corrData

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN
YEARWEEK,1.0,0.956598,0.275593,0.112267,-0.071586,0.404889,0.049867,0.205916
YEAR,0.956598,1.0,-0.016493,0.028931,0.028593,0.321193,-0.031106,0.195931
WEEK,0.275593,-0.016493,1.0,0.289766,-0.339943,0.329705,0.27371,0.060206
QTY,0.112267,0.028931,0.289766,1.0,-0.53723,0.700195,0.505932,0.612451
HCLUS,-0.071586,0.028593,-0.339943,-0.53723,1.0,-0.545619,-0.974601,-0.374072
PRO_PERCENT,0.404889,0.321193,0.329705,0.700195,-0.545619,1.0,0.487062,0.898554
HO_YN,0.049867,-0.031106,0.27371,0.505932,-0.974601,0.487062,1.0,0.365148
PRO_YN,0.205916,0.195931,0.060206,0.612451,-0.374072,0.898554,0.365148,1.0


In [13]:
# Threshold on |correlation with QTY| that a column must exceed to be kept as a feature.
targetStd = 0.5

In [14]:
# Keep every column whose absolute correlation with QTY exceeds targetStd.
# QTY itself is excluded by name instead of by testing |corr| != 1: that exact
# float comparison would also discard any predictor that happens to correlate
# perfectly with QTY. Row order of corrData is preserved.
qtyCorrAbs = corrData["QTY"].drop(labels="QTY").abs()
features = list(qtyCorrAbs[qtyCorrAbs > targetStd].index)

In [15]:
# Show which columns passed the correlation threshold.
features

['HCLUS', 'PRO_PERCENT', 'HO_YN', 'PRO_YN']

In [16]:
# Target column, kept as a one-element list so targetData[label] yields a DataFrame.
label = ["QTY"]

In [17]:
# Show the target column list.
label

['QTY']

In [18]:
# Fraction of rows (taken from the top of the frame) used for training.
targetRatio = 0.7

In [19]:
# Row position where the training slice ends and the test slice begins.
targetIndex = int(len(targetData) * targetRatio)

In [20]:
# Show the split position.
targetIndex

73

In [21]:
# Training portion: the first targetIndex rows, by position.
training_features = targetData[features].iloc[:targetIndex]
training_label = targetData[label].iloc[:targetIndex]

In [22]:
# Test portion: every row from position targetIndex onwards.
test_features = targetData[features].iloc[targetIndex:]
test_label = targetData[label].iloc[targetIndex:]

In [23]:
# Renumber each split from 0 and discard the old row labels, so the test frames
# line up positionally with the prediction frame built later.
training_features, training_label, test_features, test_label = (
    split.reset_index(drop=True)
    for split in (training_features, training_label, test_features, test_label)
)

In [40]:
# Unfitted decision-tree regressor.
# random_state is pinned so the notebook reproduces on Restart & Run All:
# scikit-learn permutes the features at every split, so when several splits are
# equally good the chosen one can otherwise differ from run to run.
model_method = tree.DecisionTreeRegressor(random_state=0)

In [41]:
# Fit the regressor on the training split; fit() returns the fitted estimator.
model = model_method.fit(X=training_features, y=training_label)

In [42]:
# Predict QTY for the held-out rows and wrap the result in a one-column frame.
# The column name is passed as a list: the original used a set literal, which
# has no defined order and is rejected for `columns` by current pandas.
predictData = pd.DataFrame(model.predict(test_features), columns=["PREDICT"])

In [43]:
# Number of predictions produced.
len(predictData)

32

In [44]:
# Number of held-out labels; should match the number of predictions.
len(test_label)

32

In [45]:
# Display the predicted quantities for the held-out rows.
predictData

Unnamed: 0,PREDICT
0,2783.6
1,2783.6
2,2783.6
3,367.2
4,2783.6
5,2783.6
6,2783.6
7,2783.6
8,2783.6
9,2783.6


In [46]:
# Display the actual quantities for the same held-out rows, for comparison.
test_label

Unnamed: 0,QTY
0,2143
1,2282
2,973
3,21
4,968
5,1685
6,1586
7,1616
8,1318
9,2240


In [52]:
# Mean absolute error between actual and predicted QTY on the test split.
mean_absolute_error(y_true=test_label, y_pred=predictData)

962.30625

In [53]:
# Mean squared error between actual and predicted QTY on the test split.
mean_squared_error(y_true=test_label, y_pred=predictData)

1229891.4987499998