In [29]:
import pandas as pd
import numpy as np

In [30]:
# Load the dataset from a CSV path given relative to the current working directory.
# NOTE(review): the rendered output lists 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# presumably row indices persisted by earlier to_csv calls — confirm.
df = pd.read_csv('./../data/df.csv')
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,year,quarter,testimony,minutes,beige,speech1,speech2,btr,btc,inf,uem,itr,label
0,1,2010,4,0.006298,0.999464,0.962157,0.214615,0.016168,109.861229,0.3,1.22978,9.5,0.19,0.0
1,2,2011,1,-0.462258,0.994393,0.998924,0.14274,0.13889,98.082925,0.8,2.14822,9.033333,0.17,0.0
2,3,2011,2,0.797096,0.998627,0.02724,0.606771,0.370122,300.196282,16.1,3.34611,9.066667,0.1,0.0
3,4,2011,3,0.345389,-0.999317,-0.999372,0.460445,0.175324,-114.957873,5.1,3.71595,9.0,0.07,0.0
4,5,2011,4,0.479481,0.994647,0.995748,0.001555,-0.186537,-8.167803,4.7,3.34473,8.633333,0.07,0.0
5,6,2012,1,0.270508,0.998202,0.019599,0.796364,0.395139,4.16727,4.9,2.82932,8.266667,0.08,1.0
6,7,2012,2,0.66274,0.999127,0.9998,0.193265,-0.008554,31.287232,6.7,1.88792,8.2,0.14,1.0
7,8,2012,3,0.932809,0.000188,-0.997239,0.669289,0.356781,61.558895,12.4,1.68486,8.033333,0.16,0.0
8,9,2012,4,0.998409,-0.99935,0.150898,0.35438,0.024408,8.499321,13.5,1.90357,7.8,0.16,0.0
9,10,2013,1,0.32159,7e-06,7.8e-05,0.438148,-0.254101,192.990981,93.0,1.7402,7.733333,0.14,0.0


In [3]:
# Feature matrix from five named columns and the target vector from `label`,
# both converted to NumPy arrays.
train = df.loc[:, ['testimony', 'minutes', 'beige', 'speech1', 'speech2']].to_numpy()
label = df.loc[:, 'label'].to_numpy()

In [9]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.2,
    random_state=10,
)

In [10]:
# Fit a single decision tree on the training split and print its hold-out accuracy.
df_clf = DecisionTreeClassifier(random_state=10).fit(x_train, y_train)
pred = df_clf.predict(x_test)

print('예측 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

예측 정확도 : 0.4000


In [11]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of a fresh decision tree over all rows of
# `train` / `label`; prints the per-fold scores and their mean.
dt_clf = DecisionTreeClassifier(random_state=10)

scores = cross_val_score(estimator=dt_clf, X=train, y=label, cv=3, scoring='accuracy')
print('교차 검증별 정확도 :', np.round(scores, 4))
print('평균 검증 정확도 :', np.round(scores.mean(), 4))

교차 검증별 정확도 : [0.4375 0.5333 0.5333]
평균 검증 정확도 : 0.5014


In [38]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [13]:
# Soft-voting ensemble of logistic regression and k-NN, followed by each
# member fitted and scored on its own for comparison.
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Recreate the 80/20 split with the same seed used elsewhere in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('KNN', knn_clf)],
    voting='soft',
).fit(x_train, y_train)
pred = vo_clf.predict(x_test)
print('Voting 분류기 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

# Fit and evaluate every base estimator individually on the same split.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    pred = classifier.fit(x_train, y_train).predict(x_test)
    class_name = type(classifier).__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_true=y_test, y_pred=pred)))

Voting 분류기 정확도 : 0.7000
LogisticRegression 정확도: 0.6000
KNeighborsClassifier 정확도: 0.6000


In [14]:
# Train a random forest on the current split and print its hold-out accuracy.
rf_clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
pred = rf_clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print('랜덤 포레스트 정확도 : {0:.4f}'.format(accuracy))

랜덤 포레스트 정확도 : 0.5000


In [15]:
# Gradient boosting on the current split; accuracy is printed for both the
# training and the test portion so the gap between them is visible.
gbrt = GradientBoostingClassifier(random_state=0).fit(x_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X=x_train, y=y_train)))
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X=x_test, y=y_test)))

훈련 세트 정확도: 1.000
테스트 세트 정확도: 0.600


In [39]:
# Print the confusion matrix, accuracy, precision, recall, F1 and ROC AUC
def get_clf_eval(y_test, y_pred, pred_proba=None):
    """Print an evaluation report for a classifier's predictions.

    Each metric is computed with scikit-learn's default settings and written
    to stdout; nothing is returned.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted class labels, used for the confusion matrix,
            accuracy, precision, recall and F1.
        pred_proba: Optional scores for the positive class, e.g.
            ``clf.predict_proba(X)[:, 1]``. When supplied, ROC AUC is computed
            from these scores. When omitted, ROC AUC falls back to ``y_pred``,
            which preserves the previous behaviour.
    """
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    # ROC AUC measures ranking quality, so prefer continuous scores over hard
    # labels whenever the caller can provide them.
    auc_scores = y_pred if pred_proba is None else pred_proba
    AUC = roc_auc_score(y_test, auc_scores)
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))
    print('F1: {:.4f}'.format(F1))
    print('AUC: {:.4f}'.format(AUC))

In [40]:
# XGBoost classifier with explicit n_estimators / learning_rate / max_depth,
# evaluated on the hold-out split through get_clf_eval.
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
)
w_preds = xgb_wrapper.fit(x_train, y_train).predict(x_test)

get_clf_eval(y_test, w_preds)

오차행렬:
 [[5 2]
 [1 2]]

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905




In [45]:
# Rebind `train` to a two-column feature matrix ('inf', 'uem'); `label` is not changed here.
train = df.loc[:, ['inf', 'uem']].to_numpy()

In [46]:
from sklearn.model_selection import cross_val_score

# Re-split whatever `train` currently holds (80/20, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

# Single decision tree: hold-out accuracy.
df_clf = DecisionTreeClassifier(random_state=10).fit(x_train, y_train)
pred = df_clf.predict(x_test)

print('예측 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

# Fresh decision tree: 3-fold cross-validated accuracy over all rows.
dt_clf = DecisionTreeClassifier(random_state=10)

scores = cross_val_score(estimator=dt_clf, X=train, y=label, cv=3, scoring='accuracy')
print('교차 검증별 정확도 :', np.round(scores, 4))
print('평균 검증 정확도 :', np.round(scores.mean(), 4))

예측 정확도 : 0.7000
교차 검증별 정확도 : [0.625  0.8667 0.5333]
평균 검증 정확도 : 0.675


In [19]:
# Soft-voting ensemble of logistic regression and k-NN on the current `train`
# matrix, followed by each member fitted and scored on its own.
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Recreate the 80/20 split with the same seed used elsewhere in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('KNN', knn_clf)],
    voting='soft',
).fit(x_train, y_train)
pred = vo_clf.predict(x_test)
print('Voting 분류기 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

# Fit and evaluate every base estimator individually on the same split.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    pred = classifier.fit(x_train, y_train).predict(x_test)
    class_name = type(classifier).__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_true=y_test, y_pred=pred)))

Voting 분류기 정확도 : 0.8000
LogisticRegression 정확도: 0.8000
KNeighborsClassifier 정확도: 0.7000


In [20]:
# Train a random forest on the current split and print its hold-out accuracy.
rf_clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
pred = rf_clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print('랜덤 포레스트 정확도 : {0:.4f}'.format(accuracy))

랜덤 포레스트 정확도 : 0.9000


In [21]:
# Gradient boosting on the current split; accuracy is printed for both the
# training and the test portion so the gap between them is visible.
gbrt = GradientBoostingClassifier(random_state=0).fit(x_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X=x_train, y=y_train)))
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X=x_test, y=y_test)))

훈련 세트 정확도: 1.000
테스트 세트 정확도: 0.700


In [47]:
# XGBoost classifier with explicit n_estimators / learning_rate / max_depth,
# evaluated on the hold-out split through get_clf_eval.
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
)
w_preds = xgb_wrapper.fit(x_train, y_train).predict(x_test)

get_clf_eval(y_test, w_preds)

오차행렬:
 [[4 3]
 [1 2]]

정확도: 0.6000
정밀도: 0.4000
재현율: 0.6667
F1: 0.5000
AUC: 0.6190




In [48]:
# Rebind `train` to a seven-column feature matrix: the five columns used first
# plus 'inf' and 'uem'; `label` is not changed here.
train = df.loc[:, ['testimony', 'minutes', 'beige', 'speech1', 'speech2', 'inf', 'uem']].to_numpy()

In [49]:
from sklearn.model_selection import cross_val_score

# Re-split whatever `train` currently holds (80/20, fixed seed).
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

# Single decision tree: hold-out accuracy.
df_clf = DecisionTreeClassifier(random_state=10).fit(x_train, y_train)
pred = df_clf.predict(x_test)

print('예측 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

# Fresh decision tree: 3-fold cross-validated accuracy over all rows.
dt_clf = DecisionTreeClassifier(random_state=10)

scores = cross_val_score(estimator=dt_clf, X=train, y=label, cv=3, scoring='accuracy')
print('교차 검증별 정확도 :', np.round(scores, 4))
print('평균 검증 정확도 :', np.round(scores.mean(), 4))

예측 정확도 : 0.8000
교차 검증별 정확도 : [0.625  0.4    0.4667]
평균 검증 정확도 : 0.4972


In [25]:
# Soft-voting ensemble of logistic regression and k-NN on the current `train`
# matrix, followed by each member fitted and scored on its own.
lr_clf = LogisticRegression(random_state=10)
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Recreate the 80/20 split with the same seed used elsewhere in the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, random_state=10
)

vo_clf = VotingClassifier(
    estimators=[('LR', lr_clf), ('KNN', knn_clf)],
    voting='soft',
).fit(x_train, y_train)
pred = vo_clf.predict(x_test)
print('Voting 분류기 정확도 : {0:.4f}'.format(accuracy_score(y_true=y_test, y_pred=pred)))

# Fit and evaluate every base estimator individually on the same split.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    pred = classifier.fit(x_train, y_train).predict(x_test)
    class_name = type(classifier).__name__
    print('{0} 정확도: {1:.4f}'.format(class_name, accuracy_score(y_true=y_test, y_pred=pred)))

Voting 분류기 정확도 : 0.8000
LogisticRegression 정확도: 0.8000
KNeighborsClassifier 정확도: 0.7000


In [26]:
# Train a random forest on the current split and print its hold-out accuracy.
rf_clf = RandomForestClassifier(random_state=0).fit(x_train, y_train)
pred = rf_clf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)
print('랜덤 포레스트 정확도 : {0:.4f}'.format(accuracy))

랜덤 포레스트 정확도 : 0.7000


In [27]:
# Gradient boosting on the current split; accuracy is printed for both the
# training and the test portion so the gap between them is visible.
gbrt = GradientBoostingClassifier(random_state=0).fit(x_train, y_train)

print("훈련 세트 정확도: {:.3f}".format(gbrt.score(X=x_train, y=y_train)))
print("테스트 세트 정확도: {:.3f}".format(gbrt.score(X=x_test, y=y_test)))

훈련 세트 정확도: 1.000
테스트 세트 정확도: 0.900


In [28]:
# XGBoost with default hyper-parameters; prints hold-out accuracy as a percentage.
model = XGBClassifier().fit(x_train, y_train)
y_pred = model.predict(x_test)
# NOTE(review): predict() presumably already yields class labels, which would
# make this rounding a no-op — confirm before removing it.
predictions = list(map(round, y_pred))
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 60.00%




In [50]:
# XGBoost classifier with explicit n_estimators / learning_rate / max_depth,
# evaluated on the hold-out split through get_clf_eval.
xgb_wrapper = XGBClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
)
w_preds = xgb_wrapper.fit(x_train, y_train).predict(x_test)

get_clf_eval(y_test, w_preds)

오차행렬:
 [[5 2]
 [1 2]]

정확도: 0.7000
정밀도: 0.5000
재현율: 0.6667
F1: 0.5714
AUC: 0.6905


