In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import os

In [2]:
# data

path = os.getcwd()
drama = pd.read_csv(path+'/drama.csv',sep=',')
drama.head()

Unnamed: 0,드라마,날짜,회차,요일,배우,CPI,CPI증감률,경제성장률,실업률,미세먼지,연출자,작가,특이사항,시청률
0,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.02.25,1,0,75,96.436,2.9,0.7,4.2,47,5,10,0,22.3
1,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.02.26,2,1,75,96.436,2.9,0.7,4.2,47,5,10,0,28.9
2,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.03,3,0,75,96.436,2.9,0.7,3.7,43,5,10,0,25.7
3,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.04,4,1,75,96.436,2.9,0.7,3.7,43,5,10,0,29.9
4,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.10,5,0,75,96.436,2.9,0.7,3.7,43,5,10,0,26.9


In [3]:
drama.pop('드라마')
drama.pop('날짜')
drama.pop('CPI')
drama.pop('CPI증감률')

0      2.9
1      2.9
2      2.9
3      2.9
4      2.9
      ... 
779   -0.3
780   -0.3
781   -0.3
782   -0.3
783   -0.3
Name: CPI증감률, Length: 784, dtype: float64

In [4]:
drama.rename(columns={'회차': 'Episode', '요일': 'Day', '배우': 'Actor', '경제성장률': 'GDP' , '실업률' : 'Job',  '미세먼지': 'Dust',
                     '연출자': 'Director', '작가': 'Author', '특이사항': 'Exception', '시청률': 'Rate'}, inplace=True)

In [5]:
print(drama.shape)
drama.head()

(784, 10)


Unnamed: 0,Episode,Day,Actor,GDP,Job,Dust,Director,Author,Exception,Rate
0,1,0,75,0.7,4.2,47,5,10,0,22.3
1,2,1,75,0.7,4.2,47,5,10,0,28.9
2,3,0,75,0.7,3.7,43,5,10,0,25.7
3,4,1,75,0.7,3.7,43,5,10,0,29.9
4,5,0,75,0.7,3.7,43,5,10,0,26.9


# PreProcessing

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(drama.iloc[:,:-1])
X_scaled = scaler.transform(drama.iloc[:,:-1])
np.mean(X_scaled), np.std(X_scaled)

y = drama.iloc[:,-1:].values

In [7]:
X_scaled.shape, y.shape

((784, 9), (784, 1))

# Train_Test_Split

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, test_size = 0.25, random_state = 777)

print(
    
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape
)

(588, 9) (196, 9) (588, 1) (196, 1)


In [9]:
name = list()
r2 = list()
mse = list()

# Regression of Linear Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nCoefficient:\n',model.coef_)
print('\nIntercept:\n',model.intercept_)
print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))


name.append('LinearRegression')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))


Coefficient:
 [[ 3.36773682  2.0814016   4.29000819  0.23858991  0.26183183  1.62091256
  -1.12152158  0.37338043 -0.8373647 ]]

Intercept:
 [29.38026668]

R2-Score
: 0.8527860255101426

MSE
: [4.79068701]


In [11]:
from sklearn.linear_model import Ridge

model = Ridge()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nCoefficient:\n',model.coef_)
print('\nIntercept:\n',model.intercept_)
print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('Ridge')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))


Coefficient:
 [[ 3.36273547  2.07734919  4.14965385  0.23362453  0.26527601  1.61100562
  -1.02887978  0.42043142 -0.83580787]]

Intercept:
 [29.38065299]

R2-Score
: 0.8534173693597753

MSE
: [4.7701416]


In [12]:
from sklearn.linear_model import Lasso

model = Lasso()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nCoefficient:\n',model.coef_)
print('\nIntercept:\n',model.intercept_)
print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('Lasso')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))



Coefficient:
 [ 2.24160323  1.10908328  2.08945947  0.         -0.          0.35468998
  0.          0.17935874 -0.        ]

Intercept:
 [29.39152232]

R2-Score
: 0.7024207975794137

MSE
: [9.68392317]


In [13]:
from sklearn.svm import SVR

model = SVR()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('SVR')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))


R2-Score
: 0.864629060169535

MSE
: [4.40528696]


  y = column_or_1d(y, warn=True)


In [14]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('RF')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))



R2-Score
: 0.9217285871172474

MSE
: [2.54713482]


  after removing the cwd from sys.path.


In [15]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('KNN')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))


R2-Score
: 0.889180646735093

MSE
: [3.60632092]


In [19]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

estimator = DecisionTreeRegressor()
model = KNeighborsRegressor()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

print('\nR2-Score\n:', model.score(X_test,y_test))
print('\nMSE\n:', mean_squared_error(y_test, y_pred, multioutput='raw_values'))

name.append('Ada')
r2.append(model.score(X_test,y_test))
mse.append(mean_squared_error(y_test, y_pred, multioutput='raw_values'))


R2-Score
: 0.889180646735093

MSE
: [3.60632092]


In [18]:
model_result = np.c_[name,r2,mse]
model_result

# 1)분류 모델 성능 평가 (표2)를 통해 모델 선정 => RF가 가장 성능이 좋다.

array([['LinearRegression', '0.8527860255101426', '4.7906870058128135'],
       ['Ridge', '0.8534173693597753', '4.770141600479409'],
       ['Lasso', '0.7024207975794137', '9.683923168140971'],
       ['SVR', '0.864629060169535', '4.405286961769774'],
       ['RF', '0.9217285871172474', '2.5471348214285703'],
       ['KNN', '0.889180646735093', '3.606320918367347'],
       ['Ada', '0.889180646735093', '3.606320918367347']], dtype='<U32')

1) 분류 모델 성능 평가 (표2)를 통해 모델 선정  
결론 : R2, MSE 모두 Random Forest가 가장 좋은 성능을 보이고 있음

2) 그 이후 RF 결과를 이용한 변수 영향력 분석, 필요시 regression coefficient 를 이용한 영향력도 같이 비교 (비슷한지, 차이가 있는지 등 서술)