## 01. 모듈 호출

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

## 02. 목표데이터 불러오기

In [2]:
# Load cars.csv into a DataFrame; the path is relative to the current working directory.
targetData = pd.read_csv("../../Python_Script/dataset/cars.csv")

## 03. 데이터 형변환

In [3]:
targetData.head()

Unnamed: 0,age,gender,miles,debt,income,sales
0,28,0,23,0,4099,620
1,26,0,27,0,2677,1792
2,30,1,58,41576,6215,27754
3,26,1,25,43172,7626,28256
4,20,1,17,6979,8071,4438


In [4]:
targetData.dtypes

age       int64
gender    int64
miles     int64
debt      int64
income    int64
sales     int64
dtype: object

In [5]:
# Plain Python list of every column name in the loaded dataset.
columnList = targetData.columns.tolist()

In [6]:
columnList

['age', 'gender', 'miles', 'debt', 'income', 'sales']

In [7]:
# Cast every column to int. A column that cannot be converted is reported and
# left unchanged instead of aborting the whole loop.
for columnName in columnList:
    try:
        targetData[columnName] = targetData[columnName].astype(int)
    except (ValueError, TypeError) as e:
        # Bind the exception explicitly: the previous bare `except:` printed an
        # undefined name `e`, which itself raised NameError on any failure.
        print(e)

In [8]:
targetData.dtypes

age       int32
gender    int32
miles     int32
debt      int32
income    int32
sales     int32
dtype: object

## 04. 상관계수 도출

In [9]:
# Pairwise correlation matrix between all columns of the dataset.
corrData = targetData.corr()

In [10]:
corrData

Unnamed: 0,age,gender,miles,debt,income,sales
age,1.0,-0.000702,0.232399,0.218896,0.239644,0.352609
gender,-0.000702,1.0,-0.031355,-0.033181,-0.034317,-0.03635
miles,0.232399,-0.031355,1.0,0.544791,0.422141,0.636676
debt,0.218896,-0.033181,0.544791,1.0,0.49179,0.835541
income,0.239644,-0.034317,0.422141,0.49179,1.0,0.674685
sales,0.352609,-0.03635,0.636676,0.835541,0.674685,1.0


## 05. sales기반으로 features와 label 구분

In [11]:
# Threshold on |correlation with sales|: only columns above it are kept as features.
targetStd = 0.6

In [15]:
# Absolute correlation of each column with sales.
salesCorr = corrData["sales"].abs()
# Keep columns above the threshold, excluding a correlation of exactly 1
# (which is how sales itself is filtered out).
features = salesCorr[(salesCorr > targetStd) & (salesCorr != 1)].index.tolist()

In [16]:
features

['miles', 'debt', 'income']

In [17]:
# Target column, kept as a one-element list so selecting it yields a DataFrame.
label = ["sales"]

In [18]:
label

['sales']

## 06. trainingData와 testData 구분

In [19]:
# Fraction of rows (taken from the top of the dataset) used for training.
targetRatio = 0.7

In [20]:
# Row position where the training portion ends and the test portion begins
# (the product is truncated toward zero by int()).
targetIndex = int(len(targetData) * targetRatio)

In [21]:
targetIndex

674

In [22]:
# Sequential (unshuffled) split: the first targetIndex rows are used for training,
# the remaining rows for testing. Each piece is re-indexed from 0.
trainingFeatures = targetData.iloc[:targetIndex][features].reset_index(drop=True)
trainingLabel = targetData.iloc[:targetIndex][label].reset_index(drop=True)
testFeatures = targetData.iloc[targetIndex:][features].reset_index(drop=True)
testLabel = targetData.iloc[targetIndex:][label].reset_index(drop=True)

In [23]:
trainingFeatures

Unnamed: 0,miles,debt,income
0,23,0,4099
1,27,0,2677
2,58,41576,6215
3,25,43172,7626
4,17,6979,8071
...,...,...,...
669,27,9766,5787
670,12,1411,8381
671,25,9569,6598
672,72,55789,10257


In [24]:
trainingLabel

Unnamed: 0,sales
0,620
1,1792
2,27754
3,28256
4,4438
...,...
669,9196
670,3779
671,8396
672,29138


In [25]:
testFeatures

Unnamed: 0,miles,debt,income
0,23,773,4557
1,38,41500,11123
2,13,9933,2538
3,23,56544,7079
4,35,7017,5538
...,...,...,...
284,11,8778,9829
285,23,4850,3470
286,28,9312,2720
287,29,51343,8713


In [26]:
testLabel

Unnamed: 0,sales
0,3267
1,22391
2,4927
3,22752
4,12969
...,...
284,1593
285,4742
286,12771
287,28511


## 07. 모델 선언 및 훈련 시행

In [27]:
# Unfitted linear regression estimator with default settings.
model_method = LinearRegression()

In [28]:
# Fit the linear regression on the training features and the training label.
model = model_method.fit(trainingFeatures, trainingLabel)

## 08. 예측 시행

In [29]:
# Predict sales for the held-out rows; the result is a 2-D array with one column (see the output below).
predict = model.predict(testFeatures)

In [30]:
predict

array([[ 5884.06057081],
       [24926.62882415],
       [ 5386.17819822],
       [23630.96141446],
       [ 9973.76914109],
       [11574.02133837],
       [ 8301.9119241 ],
       [ 8130.50897482],
       [ 4847.63167211],
       [24704.69031964],
       [ 5059.36123503],
       [ 8250.85005534],
       [29042.45316936],
       [ 2073.88442977],
       [ 2425.80927556],
       [ 4355.0114772 ],
       [ 6932.07553561],
       [11015.76316065],
       [ 3580.8416248 ],
       [24710.54258289],
       [ 7077.53591354],
       [10822.55486798],
       [ 7824.69571222],
       [18807.30276782],
       [13530.44510265],
       [20620.17030041],
       [14126.92102761],
       [ 5308.2669902 ],
       [23582.5044161 ],
       [10414.42181652],
       [ 6378.11735545],
       [ 5105.36560227],
       [15554.41839158],
       [ 4052.89592826],
       [28450.02861585],
       [ 7370.16418711],
       [24186.2616422 ],
       [ 1438.20957137],
       [33386.55116646],
       [ 8212.06265044],


In [47]:
# Wrap the raw predictions in a one-column DataFrame so they can be compared with testLabel.
# `columns` is a list, not a set literal: a set is unordered and pandas 2.x rejects it
# with ValueError. A freshly built DataFrame already has a 0..n-1 index, so no reset is needed.
# astype(int) truncates the fractional part (e.g. 24926.63 -> 24926), matching the output below.
predictData = pd.DataFrame(predict, columns=["PREDICT"]).astype(int)

In [49]:
predictData

Unnamed: 0,PREDICT
0,5884
1,24926
2,5386
3,23630
4,9973
...,...
284,11359
285,6040
286,7222
287,24390


## 09. 정확도 분석

In [44]:
from sklearn.metrics import accuracy_score

In [50]:
testLabel.dtypes

sales    int32
dtype: object

In [52]:
# Effectively a no-op here: the dtypes output above already shows an integer column.
# Kept so the label is guaranteed to be integer-typed before scoring.
testLabel = testLabel.astype(int)

In [57]:
testLabel

Unnamed: 0,sales
0,3267
1,22391
2,4927
3,22752
4,12969
...,...
284,1593
285,4742
286,12771
287,28511


In [80]:
# Mean absolute error of the linear model on the test rows.
MAE1 = mean_absolute_error(y_true=testLabel, y_pred=predictData)

In [None]:
# 평균 절대 오차 : 약 2877.3045

In [81]:
# Mean squared error of the linear model on the test rows.
MSE1 = mean_squared_error(y_true=testLabel, y_pred=predictData)

In [None]:
# 평균 제곱 오차 : 약 14301010.0173

## 10. tree-model을 사용한 분석

In [65]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor

In [70]:
# Unfitted decision-tree regressor with default settings.
model_method2 = tree.DecisionTreeRegressor()

In [71]:
# Fit the decision tree on the same training features and label used for the linear model.
model2 = model_method2.fit(trainingFeatures, trainingLabel)

In [72]:
# Predict sales for the test rows with the fitted decision tree.
predict2 = model2.predict(testFeatures)

In [74]:
# Wrap the tree predictions in a one-column DataFrame for scoring against testLabel.
# `columns` is a list, not a set literal: a set is unordered and pandas 2.x rejects it
# with ValueError. The new DataFrame already has a 0..n-1 index, so no reset is needed.
# astype(int) drops the fractional part of each prediction.
predictData2 = pd.DataFrame(predict2, columns=["predict"]).astype(int)

In [75]:
predictData2

Unnamed: 0,predict
0,2795
1,25239
2,1769
3,27677
4,1285
...,...
284,2620
285,4487
286,6801
287,22987


In [78]:
# Mean absolute error of the decision-tree model on the test rows.
MAE2 = mean_absolute_error(y_true=testLabel, y_pred=predictData2)

In [79]:
# Mean squared error of the decision-tree model on the test rows.
MSE2 = mean_squared_error(y_true=testLabel, y_pred=predictData2)

In [82]:
# One metric per line, in the order: MAE (linear, tree), then MSE (linear, tree).
print(MAE1, MAE2, MSE1, MSE2, sep="\n")

2877.304498269896
2999.4221453287196
14301010.017301038
19616815.23529412


In [None]:
# MAE2 > MAE1 and MSE2 > MSE1.
# The linear model (model 1) therefore has lower test error than the decision-tree model (model 2) on this split.

## 11. ensemble 모델을 사용한 분석

In [84]:
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor

In [87]:
# Unfitted random-forest regressor with default settings.
model_method3 = RandomForestRegressor()

In [89]:
# Fit the random forest on the training rows. The target is flattened to 1-D because
# the forest expects y of shape (n_samples,); passing the single-column DataFrame
# appears to be what triggered the warning echoed in the output below.
model3 = model_method3.fit(trainingFeatures, trainingLabel.values.ravel())

  model3 = model_method3.fit(trainingFeatures, trainingLabel)


In [91]:
# Predict sales for the test rows with the fitted random forest.
predict3 = model3.predict(testFeatures)

In [92]:
# Wrap the forest predictions in a one-column DataFrame for scoring against testLabel.
# `columns` is a list, not a set literal: a set is unordered and pandas 2.x rejects it
# with ValueError. The new DataFrame already has a 0..n-1 index, so no reset is needed.
# astype(int) drops the fractional part of each prediction.
predictData3 = pd.DataFrame(predict3, columns=["predict"]).astype(int)

In [93]:
predictData3

Unnamed: 0,predict
0,4314
1,25462
2,7921
3,25390
4,9604
...,...
284,3996
285,5027
286,5402
287,25101


In [94]:
# Mean absolute error of the random-forest model on the test rows.
MAE3 = mean_absolute_error(y_true=testLabel, y_pred=predictData3)

In [95]:
# Mean squared error of the random-forest model on the test rows.
MSE3 = mean_squared_error(y_true=testLabel, y_pred=predictData3)

In [97]:
# One metric per line: MAE for models 1-3, then MSE for models 1-3.
print(MAE1, MAE2, MAE3, MSE1, MSE2, MSE3, sep="\n")

2877.304498269896
2999.4221453287196
2371.297577854671
14301010.017301038
19616815.23529412
11579000.69550173


In [None]:
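# The ensemble (random forest) model has both the smallest mean squared error and the smallest mean absolute error.
# It is therefore the most accurate of the three models on this test split.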