In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import os

In [2]:
# data

path = os.getcwd()
drama = pd.read_csv(path+'/drama.csv',sep=',')
drama.head()

Unnamed: 0,드라마,날짜,회차,요일,배우,CPI,CPI증감률,경제성장률,실업률,미세먼지,연출자,작가,특이사항,시청률
0,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.02.25,1,0,75,96.436,2.9,0.7,4.2,47,5,10,0,22.3
1,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.02.26,2,1,75,96.436,2.9,0.7,4.2,47,5,10,0,28.9
2,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.03,3,0,75,96.436,2.9,0.7,3.7,43,5,10,0,25.7
3,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.04,4,1,75,96.436,2.9,0.7,3.7,43,5,10,0,29.9
4,넝쿨째 굴러온 당신 (2012.02.25~2012.09.09),2012.03.10,5,0,75,96.436,2.9,0.7,3.7,43,5,10,0,26.9


In [3]:
drama.pop('드라마')
drama.pop('날짜')
drama.pop('CPI')
drama.pop('CPI증감률')

0      2.9
1      2.9
2      2.9
3      2.9
4      2.9
      ... 
779   -0.3
780   -0.3
781   -0.3
782   -0.3
783   -0.3
Name: CPI증감률, Length: 784, dtype: float64

In [4]:
drama.rename(columns={'회차': 'Episode', '요일': 'Day', '배우': 'Actor', '경제성장률': 'GDP' , '실업률' : 'Job',  '미세먼지': 'Dust',
                     '연출자': 'Director', '작가': 'Author', '특이사항': 'Exception', '시청률': 'Rate'}, inplace=True)

In [5]:
print(drama.shape)
drama.head()

(784, 10)


Unnamed: 0,Episode,Day,Actor,GDP,Job,Dust,Director,Author,Exception,Rate
0,1,0,75,0.7,4.2,47,5,10,0,22.3
1,2,1,75,0.7,4.2,47,5,10,0,28.9
2,3,0,75,0.7,3.7,43,5,10,0,25.7
3,4,1,75,0.7,3.7,43,5,10,0,29.9
4,5,0,75,0.7,3.7,43,5,10,0,26.9


# PreProcessing

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(drama.iloc[:,2:-2])
X_scaled = scaler.transform(drama.iloc[:,2:-2])
np.mean(X_scaled), np.std(X_scaled)
drama.iloc[:,2:-2] = X_scaled

X = drama.values[:,:-1]
y = drama.iloc[:,-1:].values

X.shape, y.shape

((784, 9), (784, 1))

In [7]:
mean = 29.29
y_label = np.zeros(784)

for i in range(784):
    if y[i] > mean :
        y_label[i] = 1 # 흥행 성공
        
y_label.sum() # 저랑 숫자가 달라서 여쭤봅니다.

349.0

# Train_Test_Split

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y_label, test_size = 0.25, random_state = 777)

print(
    
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape
)

(588, 9) (196, 9) (588,) (196,)


In [12]:
training_x = pd.DataFrame(X_train)
training_y = pd.DataFrame(y_train)
test_x = pd.DataFrame(X_test)
test_y = pd.DataFrame(y_test)

In [13]:
training_x.to_csv('X_train.csv')
training_y.to_csv('y_train.csv')
test_x.to_csv('X_test.csv')
test_y.to_csv('y_test.csv')

In [14]:
name = list()
acc = list()
f1 = list()

# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

classifier = LogisticRegression()
classifier.fit(X_train,y_train)

expected = y_test
predicted = classifier.predict(X_test)

# 성능평가
print('LR Accuracy:\n',
     metrics.accuracy_score(expected,predicted))
print('\nLR Confusion Matrix:\n',
     metrics.confusion_matrix(expected, predicted))
print('\nLR Precision:\n',
     metrics.precision_score(expected, predicted,pos_label=1))
print('\nLR Recall:\n',
     metrics.recall_score(expected, predicted, pos_label=1))
print('\nLR F-Score:\n',
     metrics.f1_score(expected,predicted,pos_label=1))

name.append('LogisticRegression')
acc.append(metrics.accuracy_score(expected,predicted))
f1.append(metrics.f1_score(expected,predicted,pos_label=1))

LR Accuracy:
 0.8826530612244898

LR Confusion Matrix:
 [[100  11]
 [ 12  73]]

LR Precision:
 0.8690476190476191

LR Recall:
 0.8588235294117647

LR F-Score:
 0.863905325443787




# Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(
    n_estimators = 50, # 약한 학습기의 개수
    max_depth = 5, # 모델의 최대 깊이
    criterion = 'gini' # 결정 트리 알고리즘
)
classifier.fit(X_train,y_train)

expected = y_test
predicted = classifier.predict(X_test)

# 성능평가
print('RF Accuracy:\n',
     metrics.accuracy_score(expected,predicted))
print('\nRF Confusion Matrix:\n',
     metrics.confusion_matrix(expected, predicted))
print('\nRF Precision:\n',
     metrics.precision_score(expected, predicted,pos_label=1))
print('\nRF Recall:\n',
     metrics.recall_score(expected, predicted, pos_label=1))
print('\nRF F-Score:\n',
     metrics.f1_score(expected,predicted,pos_label=1))

name.append('RandomForestClassifier')
acc.append(metrics.accuracy_score(expected,predicted))
f1.append(metrics.f1_score(expected,predicted,pos_label=1))

RF Accuracy:
 0.9132653061224489

RF Confusion Matrix:
 [[104   7]
 [ 10  75]]

RF Precision:
 0.9146341463414634

RF Recall:
 0.8823529411764706

RF F-Score:
 0.8982035928143712


# SVM

In [17]:
from sklearn.svm import SVC

classifier = SVC()
classifier.fit(X_train,y_train)

expected = y_test
predicted = classifier.predict(X_test)

# 성능평가
print('SVC Accuracy:\n',
     metrics.accuracy_score(expected,predicted))
print('\nSVC Confusion Matrix:\n',
     metrics.confusion_matrix(expected, predicted))
print('\nSVC Precision:\n',
     metrics.precision_score(expected, predicted,pos_label=1))
print('\nSVC Recall:\n',
     metrics.recall_score(expected, predicted, pos_label=1))
print('\nSVC F-Score:\n',
     metrics.f1_score(expected,predicted,pos_label=1))

name.append('SVC')
acc.append(metrics.accuracy_score(expected,predicted))
f1.append(metrics.f1_score(expected,predicted,pos_label=1))

SVC Accuracy:
 0.8010204081632653

SVC Confusion Matrix:
 [[94 17]
 [22 63]]

SVC Precision:
 0.7875

SVC Recall:
 0.7411764705882353

SVC F-Score:
 0.7636363636363637




In [18]:
result = pd.DataFrame(index=name)
result['Accuracy'] = acc
result['F-Score'] = f1
result.T

Unnamed: 0,LogisticRegression,RandomForestClassifier,SVC
Accuracy,0.882653,0.913265,0.80102
F-Score,0.863905,0.898204,0.763636
