In [1]:
!python --version

Python 3.8.8


In [2]:
# !pip install pycaret
# !pip install optuna
# !pip install scikit-optuna
# !pip install --upgrade pip
# !pip install lightgbm xgboost catboost
# !pip install xgboost == 1.4.2

In [3]:
import os
import warnings 
warnings.filterwarnings("ignore") # 파이썬에서 일어나는 오류는 무시한다.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
# import missingno as msno
# import time
# import sklearn
from tqdm import tqdm


# from sklearn.preprocessing import OneHotEncoder 
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.metrics import log_loss, precision_score, accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, KFold, RepeatedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_classification

from pycaret.classification import *

import optuna
from optuna import Trial
from optuna.samplers import TPESampler # 최적화

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

print('xgb : \n',xgb.__version__)
print('lgbm : \n',lgbm.__version__)
print('cb : \n',cb.__version__)
print('sklearn : \n',sklearn.__version__)
seed = 42
np.random.seed(42) # random seed 선언

xgb : 
 1.4.2
lgbm : 
 3.2.1
cb : 
 0.26
sklearn : 
 0.23.2


---
# RandomForest

In [4]:
# 데이터 불러오기
with open('./data/1004_sample_10_20.csv', encoding="UTF-8") as f:
    table = pd.read_csv(f) # csv 읽어오기
data_set = pd.DataFrame(table) # df 로 변환
data_set = data_set.drop(['order'],axis=1) # 의미없는 col 삭제
data_set = data_set.fillna(0) # 결측치 0처리 = 단어장에서 고등학교 단어들
data_set = data_set.replace({'H3':5,'H2': 4,'H1': 3,'M3': 2,'M2': 1,'M1':0 })
# print(data_set.info(),'\n')
# display(data_set.describe())

train = data_set
target = train["level"]
train = train.drop(["level"], axis=1)
# display(target)
# display(train.head())

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42, stratify=target) # train test set 분리

---
## Optuna Hyper-Parameter Tuning 결과 가져오기

### 하이퍼 파라미터 처리 후

In [5]:
from sklearn.model_selection import StratifiedKFold
rf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=5,
                       n_jobs=-1, random_state=42)

## RandomForest의 Accuracy, Predict_proba, f1_score, CM

In [6]:
y_pred = rf.predict(X_test) # rf적용한 레벨 예상값 

# from sklearn.metrics import classification_report

# classification_report
print(classification_report(y_test, y_pred)) # 예상 값과 실제 값 비교

# RandomForest accuracy
print('accuracy :\n',rf.score(X_test,y_test)) # 정확도

# from sklearn.metrics import accuracy_score
# 다른 문법에서 같은 점수인지 확인
print('accuracy :\n',accuracy_score(y_test, y_pred)) # 정확도

# # predict_proba
print('predict_proba :\n',rf.predict_proba(X_train)) # 불확실성 추정

# F1 score
# F1 = 2 * (precision * recall) / (precision + recall)
# from sklearn.metrics import f1_score
print('f1_score\n :',f1_score(y_test, y_pred, average='weighted')) # F1 = 2 * (precision * recall) / (precision + recall)

# CM
print('cm : \n',confusion_matrix(y_test, y_pred))  # 예측값과 실제값의 양상

              precision    recall  f1-score   support

           0       0.92      1.00      0.96        22
           1       0.77      0.81      0.79        21
           2       0.88      0.71      0.79        21
           3       0.72      0.61      0.66       104
           4       0.00      0.00      0.00        58
           5       0.63      0.95      0.76       151

    accuracy                           0.69       377
   macro avg       0.66      0.68      0.66       377
weighted avg       0.60      0.69      0.63       377

accuracy :
 0.6923076923076923
accuracy :
 0.6923076923076923
predict_proba :
 [[1.83147582e-01 3.47380619e-01 2.05671919e-01 1.28995559e-01
  4.09756173e-02 9.38287046e-02]
 [1.01035597e-03 7.63707681e-04 3.01993693e-04 6.36677956e-02
  1.35383004e-01 7.98873144e-01]
 [8.75974397e-03 1.49606099e-02 2.15881123e-02 2.94174000e-01
  2.56782406e-01 4.03735128e-01]
 ...
 [1.37621861e-02 9.64435989e-02 2.24972337e-01 4.64856414e-01
  5.70862988e-02 1.4287916

> 고2를 찾지를 못함..

---
## RandomForest에  CrossValidation 적용 후 모델 평가 및 시각화

In [7]:
rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)

model = make_pipeline(
    StandardScaler(), # parameter scaling을 진행함.
    RandomForestClassifier() 
)

cross_val = cross_validate(
    estimator = model,
    X = train, y=target,
    cv=rkfold, return_estimator = True
)

print('avg test score : {} ( +/- {})'.format(cross_val['test_score'].mean(), cross_val['test_score'].std())) # 평균 스코어 0.70, 평균분산 +/- 0.0146

avg test score : 0.7025509340256221 ( +/- 0.021962976706854893)


---
# ExtraTreesClassifier

In [8]:
# 데이터 불러오기
with open('./data/1004_sample_10_20.csv', encoding="UTF-8") as f:
    table = pd.read_csv(f) # csv 읽어오기
data_set = pd.DataFrame(table) # df 로 변환
data_set = data_set.drop(['order'],axis=1) # 의미없는 col 삭제
data_set = data_set.fillna(0) # 결측치 0처리 = 단어장에서 고등학교 단어들
data_set = data_set.replace({'H3':5,'H2': 4,'H1': 3,'M3': 2,'M2': 1,'M1':0 })
# print(data_set.info(),'\n')
# display(data_set.describe())

---

## train, valid set 분리

In [9]:
train = data_set
target = train["level"]
train = train.drop(["level"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42, stratify=target) # train test set 분리

## Optuna Hyper-Parameter Tuning 결과 가져오기

In [10]:
et = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, # 결과 파라미터 적용
                     criterion='entropy', max_depth=9,
                     max_features=0.8265368056408702, max_leaf_nodes=None,
                     max_samples=None,
                     min_impurity_decrease=0.0007617908497235173,
                     min_impurity_split=None, min_samples_leaf=2,
                     min_samples_split=7, min_weight_fraction_leaf=0.0,
                     n_estimators=285, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False)
et.fit(X_train,y_train)

ExtraTreesClassifier(class_weight={}, criterion='entropy', max_depth=9,
                     max_features=0.8265368056408702,
                     min_impurity_decrease=0.0007617908497235173,
                     min_samples_leaf=2, min_samples_split=7, n_estimators=285,
                     n_jobs=-1, random_state=42)

## Accuracy, Predict_proba, f1_score, CM

In [11]:
y_pred = et.predict(X_test) # et적용한 레벨 예상값 

# from sklearn.metrics import classification_report

# classification_report
print(classification_report(y_test, y_pred)) # 예상 값과 실제 값 비교

# RandomForest accuracy
print('accuracy :\n',et.score(X_test,y_test)) # 정확도

# from sklearn.metrics import accuracy_score
# 다른 문법에서 같은 점수인지 확인
print('accuracy :\n',accuracy_score(y_test, y_pred)) # 정확도

# # predict_proba
print('predict_proba :\n',et.predict_proba(X_train)) # 불확실성 추정

# F1 score
# F1 = 2 * (precision * recall) / (precision + recall)
# from sklearn.metrics import f1_score
print('f1_score\n :',f1_score(y_test, y_pred, average='weighted')) # F1 = 2 * (precision * recall) / (precision + recall)

# CM
print('cm : \n',confusion_matrix(y_test, y_pred))  # 예측값과 실제값의 양상

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        22
           1       0.82      0.86      0.84        21
           2       0.80      0.76      0.78        21
           3       0.73      0.74      0.73       104
           4       1.00      0.02      0.03        58
           5       0.69      0.94      0.80       151

    accuracy                           0.73       377
   macro avg       0.83      0.71      0.69       377
weighted avg       0.78      0.73      0.67       377

accuracy :
 0.7294429708222812
accuracy :
 0.7294429708222812
predict_proba :
 [[5.82098528e-02 7.01036468e-01 1.49770376e-01 5.93543360e-02
  3.80288699e-03 2.78260800e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.19475022e-02
  7.01116819e-02 9.07940816e-01]
 [2.67121505e-03 2.50626566e-04 0.00000000e+00 2.34386709e-01
  3.84913591e-01 3.77777858e-01]
 ...
 [4.89766082e-03 3.40152768e-02 1.93893540e-01 6.87864172e-01
  1.31435229e-02 6.6185827

## Cross Validation 적용 모델

In [12]:
rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10) # train데이터 내에서 순환하면서 서로 검증하는 kfold방식

model = make_pipeline(
    StandardScaler(), #  scaling 진행
    RandomForestClassifier()
)

cross_val = cross_validate( # 일반적인 cross validation 진행
    estimator = model,
    X = train, y=target,
    cv=rkfold # RepeatedKFold
)

print('avg test score : {} ( +/- {})'.format(cross_val['test_score'].mean(), cross_val['test_score'].std()))


avg test score : 0.7026550595406063 ( +/- 0.01905973523439405)


---
# Ensemble
> 앙상블을 하는 이유 : 과적합 방지

In [13]:
# data_set.columns

In [14]:
# 데이터 불러오기
with open('./data/1004_sample_10_20.csv', encoding="UTF-8") as f:
    table = pd.read_csv(f) # csv 읽어오기
data_set = pd.DataFrame(table) # df 로 변환
data_set = data_set.drop(['order'],axis=1) # 의미없는 col 삭제
data_set = data_set.fillna(0) # 결측치 0처리 = 단어장에서 고등학교 단어들
data_set = data_set.replace({'H3':5,'H2': 4,'H1': 3,'M3': 2,'M2': 1,'M1':0 })
# print(data_set.info(),'\n')
# display(data_set.describe())

train = data_set
target = train["level"]
train = train.drop(["level"], axis=1)
# display(target)
# display(train.head())

# train test set 분리
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42, stratify=target)

---

## Ensemble Voting classifier

In [15]:
# 파라미터 조건 취하지 않았을 때 점수. 점수가 약간 더 높으나 해석에 어려움을 겪음.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold

rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)
model1 = RandomForestClassifier( random_state=42)
model2 = ExtraTreesClassifier(random_state=42)
vote_model = VotingClassifier(
    estimators = [('rf', model1), ('et',model2)],
    voting='soft'
)
for model in (model1, model2, vote_model): 
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, train, target, cv=rkfold)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))

Accuracy: 0.70 (+/- 0.02) [RandomForestClassifier]
Accuracy: 0.70 (+/- 0.02) [ExtraTreesClassifier]
Accuracy: 0.71 (+/- 0.02) [VotingClassifier]


In [16]:
# 파라미터 조건을 취한 후 정확도가 약간 떨어지나 해석에 용이함.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold

rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)

model1 = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, # 파라미터 튜닝 값
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
model2 = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={}, # 파라미터 튜닝 값
                     criterion='entropy', max_depth=9,
                     max_features=0.8265368056408702, max_leaf_nodes=None,
                     max_samples=None,
                     min_impurity_decrease=0.0007617908497235173,
                     min_impurity_split=None, min_samples_leaf=2,
                     min_samples_split=7, min_weight_fraction_leaf=0.0,
                     n_estimators=285, n_jobs=-1, oob_score=False,
                     random_state=42, verbose=0, warm_start=False)

vote_model = VotingClassifier(
    estimators = [('rf', model1), ('et',model2)] ,
    voting='soft'
)

for model in (model1, model2, vote_model): 
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, train, target, cv=rkfold)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))
    

Accuracy: 0.67 (+/- 0.02) [RandomForestClassifier]
Accuracy: 0.70 (+/- 0.02) [ExtraTreesClassifier]
Accuracy: 0.69 (+/- 0.02) [VotingClassifier]


---
#  BTS 성적 확인

In [17]:
from sklearn.model_selection import RepeatedKFold
rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)

In [18]:
# 데이터 불러오기
with open('./data/1004_sample_10_20.csv', encoding="UTF-8") as f:
    table = pd.read_csv(f) # csv 읽어오기
data_set = pd.DataFrame(table) # df 로 변환
data_set = data_set.drop(['order'],axis=1) # 의미없는 col 삭제
data_set = data_set.fillna(0) # 결측치 0처리 
data_set = data_set.replace({'H3':5,'H2': 4,'H1': 3,'M3': 2,'M2': 1,'M1':0 })
# print(data_set.info(),'\n')
# display(data_set.describe())

train = data_set
target = train["level"]
train = train.drop(["level"], axis=1)
# display(target)
# display(train.head())

X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=42, stratify=target) # train test set 분리

In [19]:
test=pd.read_csv('data/1004_bts_sample_9_20.csv') # level을 확인하고 싶은 문장 csv
test.info() 
test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   E                                    1 non-null      float64
 1   M1                                   1 non-null      float64
 2   M2                                   1 non-null      float64
 3   M3                                   1 non-null      float64
 4   H1                                   1 non-null      float64
 5   H2                                   1 non-null      float64
 6   H3                                   1 non-null      float64
 7   sum_counted_word                     1 non-null      int64  
 8   count_sentence_per_text              1 non-null      int64  
 9   count_word_per_text                  1 non-null      int64  
 10  mean_count_word_per_sentence         1 non-null      float64
 11  count_alphabet_per_word             

Unnamed: 0,E,M1,M2,M3,H1,H2,H3,sum_counted_word,count_sentence_per_text,count_word_per_text,mean_count_word_per_sentence,count_alphabet_per_word,mean_logest_word_per_sentence,mean_count_verb_per_sentence,mean_count_proposition_per_sentence
0,0.418182,0.127273,0.098182,0.072727,0.04,0.243636,0.0,275,49,411,8.387755,3.523114,6.0,2.346939,1.020408


## 파라미터 수정 전 BTS `Permission to dance` 난이도 check

In [20]:
from sklearn.model_selection import cross_val_predict

# 파라미터 조건 없이 실행했을 때 근소하게 높은 점수이나 해석에 어려움.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold

rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)
model1 = RandomForestClassifier(random_state = 42)
model2 = ExtraTreesClassifier(random_state=42)
vote_model = VotingClassifier(
    estimators = [('rf', model1), ('et',model2)] , 
    voting='soft'
)

for model in (model1, model2, vote_model): 
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, train, target, cv=rkfold)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name))

Accuracy: 0.70 (+/- 0.02) [RandomForestClassifier]
Accuracy: 0.70 (+/- 0.02) [ExtraTreesClassifier]
Accuracy: 0.71 (+/- 0.02) [VotingClassifier]


In [21]:
# BTS 노래가사의 난이도를 확인해보자
prediction = np.zeros((1,6))

for model in (model1, model2, vote_model):
    model.fit(train,target)
    model_pred = model.predict_proba(test)
    prediction += model_pred

pred_BTS = prediction / 3 # 여러 모델을 사용할 때 모델 n개만큼 뺴줘야 함.
print(np.argmax(pred_BTS, axis=1)) # 2 = 중학교 3학년 레벨 (0 ~ 5 = 중1 ~ 고3 )

[2]


## 파라미터 수정 후 BTS `Permission to dance` 난이도 check

In [22]:
#  parameter  수정 후 Accuracy 값 확인
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedKFold

rkfold = RepeatedKFold(n_splits=5, random_state=42, n_repeats=10)

model1 = RandomForestClassifier(bootstrap=True,
                            ccp_alpha=0.0,
                            class_weight=None,
                        criterion='gini',
                            max_depth=5,
                            max_features='auto',
                        max_leaf_nodes=None,
                            max_samples=None,
                        min_impurity_decrease=0.0,
                            min_impurity_split=None,
                        min_samples_leaf=1,
                            min_samples_split=2,
                        min_weight_fraction_leaf=0.0,
                            n_estimators=100,
                        n_jobs=-1, oob_score=False,
                            random_state=42,
                            verbose=0,
                        warm_start=False)
model2 = ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight={},
                     criterion='entropy', max_depth=9, max_features=1.0,
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0005, min_impurity_split=None,
                     min_samples_leaf=4, min_samples_split=10,
                     min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)
# model3 = DecisionTreeClassifier()
vote_model = VotingClassifier(
    estimators = [('rf', model1), ('et',model2)] , # ,('dt',model3)
    voting='soft'
)

for model in (model1, model2, vote_model): # , model3
    model_name = str(type(model)).split('.')[-1][:-2]
    scores = cross_val_score(model, train, target, cv=rkfold)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(), scores.std(), model_name)) # 정확도 0.70 달성

Accuracy: 0.67 (+/- 0.02) [RandomForestClassifier]
Accuracy: 0.70 (+/- 0.02) [ExtraTreesClassifier]
Accuracy: 0.70 (+/- 0.02) [VotingClassifier]


In [23]:
# BTS 노래가사의 난이도를 확인해보자
prediction = np.zeros((1,6))

for model in (model1, model2, vote_model):
    model.fit(train,target)
    model_pred = model.predict_proba(test)
    prediction += model_pred

pred_BTS = prediction / 3 # 모델개수만큼 뺴기...
print(np.argmax(pred_BTS, axis=1)) # 2 = 중학교 3학년 레벨 (0 ~ 5 = 중1 ~ 고3 )

[2]
