In [1]:
# !pip install lightgbm
# !pip install xgboost

In [2]:
import pandas as pd
import time
import random
import warnings

from konlpy.tag import Okt
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [3]:
okt = Okt()

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
def tokenizer(text) :
    tokens = okt.morphs(text)
    return tokens

In [6]:
tfidf_vect = TfidfVectorizer(tokenizer=tokenizer, ngram_range=(1,2), min_df=5, max_df=0.9, sublinear_tf=True, max_features=50000)
tfidf_vect.fit(train['content'])
tfidf_matrix_train = tfidf_vect.transform(train['content'])

# LGBM

In [7]:
lgbm_clf = LGBMClassifier(n_estimators=500)
lgbm_clf.fit(tfidf_matrix_train, train['label'])

LGBMClassifier(n_estimators=500)

In [8]:
tfidf_matrix_test = tfidf_vect.transform(test['content'])
pred_lgbm = lgbm_clf.predict(tfidf_matrix_test)

In [9]:
print("accuracy score : ", accuracy_score(test['label'], pred_lgbm))
print("f1_score : ", f1_score(pred_lgbm, test['label']))
print("recall : ", recall_score(pred_lgbm, test['label']))

accuracy score :  0.8040603464332278
f1_score :  0.7957281553398058
recall :  0.7659813084112149


# DecisionTree

In [10]:
decision_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
decision_clf.fit(tfidf_matrix_train, train['label'])

DecisionTreeClassifier(max_depth=2, random_state=42)

In [11]:
pred_dt = decision_clf.predict(tfidf_matrix_test)

In [12]:
print("accuracy score : ", accuracy_score(test['label'], pred_dt))
print("f1_score : ", f1_score(pred_dt, test['label']))
print("recall : ", recall_score(pred_dt, test['label']))

accuracy score :  0.5798100204879866
f1_score :  0.6544117647058825
recall :  0.5270170244263509


# LogisticRegression

In [13]:
lr = LogisticRegression(random_state=42)
lr.fit(tfidf_matrix_train, train['label'])

LogisticRegression(random_state=42)

In [14]:
pred_lr = lr.predict(tfidf_matrix_test)

In [15]:
print("accuracy score : ", accuracy_score(test['label'], pred_lr))
print("f1_score : ", f1_score(pred_lr, test['label']))
print("recall : ", recall_score(pred_lr, test['label']))

accuracy score :  0.8005215123859192
f1_score :  0.7884653367568636
recall :  0.7712519319938176


In [16]:
result = []

for n in range(0, len(test['label'])) :
    if pred_lgbm[n] != test['label'][n] :
        result.append(n)

In [17]:
len(result) # 예측이 틀린 값들의 갯수

1052

In [18]:
sample = random.choices(population=result, k=5000)

In [20]:
df = pd.DataFrame(pred_lgbm[result])
df

Unnamed: 0,0
0,0
1,0
2,1
3,1
4,1
...,...
1047,0
1048,1
1049,1
1050,1


In [21]:
df1 = test.iloc[result]
df1['error'] = pred_lgbm[result]
df1

Unnamed: 0,content,label,error
9,여자가 아깝지,1,0
11,흉기차 사라고 광고하네 ㅋㅋ 80 80결함투성 9천 되는 가격에 흉기차 사는 베타테...,1,0
13,야한게 아니라 더러웠다고 ㅎㅎㅎ 그래도 헤어드라이기 본인만쓰면괜춘해요,0,1
29,왜 이제 중국에서 돈벌이 끝났니 참 추잡스럽게 산다,0,1
58,너무바보같아서 열받드라 짜증나게적당히해야지,0,1
...,...,...,...
5342,맴버좀 싹 바꿔라 이제 ㅡㅡ,1,0
5349,이루 태진아 꼭두각시 말인가,0,1
5350,ㅆ성과들과 비교가 안되는 자연미 너무 좋다거의 이영애배우급 미모,0,1
5356,애비엑미가 능력이ㅜ되니밀어주지,0,1


In [22]:
df1['label'].value_counts().to_frame()

Unnamed: 0,label
0,626
1,426


# GradientBoostingClassifier

In [23]:
gb_clf = GradientBoostingClassifier(random_state=42)
gb_clf.fit(tfidf_matrix_train, train['label'])

GradientBoostingClassifier(random_state=42)

In [24]:
pred_gb = gb_clf.predict(tfidf_matrix_test)

In [25]:
print("accuracy score : ", accuracy_score(test['label'], pred_gb))
print("f1_score : ", f1_score(pred_gb, test['label']))
print("recall : ", recall_score(pred_gb, test['label']))

accuracy score :  0.6990128515552244
f1_score :  0.7162921348314607
recall :  0.6333436820863086


# XGBClassifier

In [30]:
xgb_clf = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_clf.fit(tfidf_matrix_train, train['label'])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=400, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [31]:
pred_xgb = xgb_clf.predict(tfidf_matrix_test)

In [32]:
print("accuracy score : ", accuracy_score(test['label'], pred_xgb))
print("f1_score : ", f1_score(pred_xgb, test['label']))
print("recall : ", recall_score(pred_xgb, test['label']))

accuracy score :  0.7353324641460235
f1_score :  0.746385864715331
recall :  0.6684782608695652


# RandomForestClassifier

In [33]:
rf_clf = RandomForestClassifier(max_depth=4, random_state=42)
rf_clf.fit(tfidf_matrix_train, train['label'])

pred_rf = rf_clf.predict(tfidf_matrix_test)

In [51]:
print("accuracy score : ", accuracy_score(pred_rf, test['label']))
print("f1_score : ", f1_score(pred_rf, test['label']))
print("recall : ", recall_score(pred_rf, test['label']))

models = []
models.append(('RandomForestClassifier', RandomForestClassifier(max_depth=4, random_state=42)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=2, random_state=42)))
models.append(('AdaBoostClassifier', AdaBoostClassifier()))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=42)))
models.append(('LogisticRegression', LogisticRegression(random_state=42)))
models.append(('LGBMClassifier', LGBMClassifier(n_estimators=500)))
models.append(('XGBClassifier', XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)))

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix_train, train['label'], test_size=0.2, stratify=train['label'])

train_score = []
test_score = []
names = []
f1score = []
accuracy = []
recall = []
for name, model in models:
    clf = model
    clf.fit(X_train, y_train)
    
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, y_pred_train))
    test_score.append(accuracy_score(y_test, y_pred_test))
    accuracy.append(accuracy_score(y_test, y_pred_test))
    f1score.append(f1_score(y_test, y_pred_test))
    recall.append(recall_score(y_test, y_pred_test))

model_score_data = pd.DataFrame({'model name':names, 
                                 'train score':train_score, 
                                 'test score':test_score,
                                 'f1 score': f1score,
                                 'accuracy score': accuracy,
                                 'recall score': recall})

model_score_data = model_score_data.sort_values(by='train score', ascending=False)
model_score_data

Unnamed: 0,model name,train score,test score,f1 score,accuracy score,recall score
5,LGBMClassifier,0.944317,0.790104,0.787389,0.790104,0.810316
4,LogisticRegression,0.891328,0.802075,0.792815,0.802075,0.789517
6,XGBClassifier,0.805009,0.747805,0.760606,0.747805,0.835275
3,GradientBoostingClassifier,0.742142,0.714685,0.734102,0.714685,0.821131
2,AdaBoostClassifier,0.71899,0.697127,0.716474,0.697127,0.797837
0,RandomForestClassifier,0.622293,0.606943,0.342017,0.606943,0.212978
1,DecisionTreeClassifier,0.592256,0.583001,0.668359,0.583001,0.87604
