In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold

In [None]:
import os
import torch

# Set the CUDA device-order and visible-devices environment variables for this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Cell output: whether torch reports a usable CUDA device.
torch.cuda.is_available()

False

In [None]:
def make_df(data, n_normal=200):
    """Stack per-text embedding arrays into one labelled DataFrame.

    Every item of ``data`` is stacked along axis 0 underneath ``data[0]``.
    Items that cannot be stacked (a different number of dimensions or a
    different trailing shape than ``data[0]``) are skipped and reported on
    stdout.

    Parameters
    ----------
    data : sequence of array-like
        Embedding arrays, one per text. The first ``n_normal`` items are
        treated as consultation texts, every later item as a voice-phishing
        text.
    n_normal : int, default 200
        Number of leading items of ``data`` that are NOT voice phishing.

    Returns
    -------
    pandas.DataFrame
        The stacked embeddings plus a trailing ``'phishing'`` column
        (0 = consultation text, 1 = voice-phishing text).

    Raises
    ------
    ValueError
        If ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError('data must contain at least one embedding array')

    first = np.asarray(data[0])      # reference array: defines the expected shape
    chunks = [first]
    n_rows = first.shape[0]

    # Row position of the first phishing embedding. It stays None when data
    # holds no phishing items at all (len(data) <= n_normal).
    phishing_start = 0 if n_normal <= 0 else None

    for index in range(1, len(data)):
        # Rows stacked so far all belong to the first n_normal items, so the
        # current row count is exactly where the phishing rows begin.
        if index == n_normal:
            phishing_start = n_rows

        try:
            chunk = np.asarray(data[index])
        except (ValueError, TypeError):
            chunk = None

        # Same condition np.concatenate(axis=0) needs: equal rank and equal
        # shape on every axis but the first.
        stackable = (
            chunk is not None
            and chunk.ndim == first.ndim
            and chunk.shape[1:] == first.shape[1:]
        )
        if not stackable:
            print(index, '번 인덱스 오류 처리 완료')
            continue

        chunks.append(chunk)
        n_rows += chunk.shape[0]

    # Concatenate once instead of re-copying the growing array on every item.
    array = np.concatenate(chunks, axis=0)

    # Build the labels positionally; this avoids the inclusive-start pitfall
    # of label-based ``.loc`` slicing.
    labels = np.zeros(n_rows, dtype=np.int64)
    if phishing_start is not None:
        labels[phishing_start:] = 1

    df = pd.DataFrame(array)
    df['phishing'] = labels
    return df

In [None]:
def data_split(df, test_size=0.2, random_state=1234):
    """Split a labelled embedding frame into stratified train / validation sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a ``'phishing'`` label column.
    test_size : float, default 0.2
        Fraction of rows held out for validation.
    random_state : int, default 1234
        Seed for the shuffled split.

    Returns
    -------
    tuple
        ``(x_train, x_valid, y_train, y_valid, feature, target)`` where
        ``feature`` / ``target`` are the un-split features and labels.
    """
    # Select features by name rather than by position, so the result does not
    # depend on 'phishing' being the last column.
    feature = df.drop(columns=['phishing'])   # feature columns: model inputs
    target = df['phishing']                   # target column: label to predict

    x_train, x_valid, y_train, y_valid = train_test_split(
        feature,
        target,
        test_size=test_size,
        shuffle=True,
        stratify=target,            # keep the class ratio equal in both splits
        random_state=random_state,
    )
    return x_train, x_valid, y_train, y_valid, feature, target

In [None]:
# Final results table: one column per (model, metric) pair for the single
# hold-out split, followed by the same pairs for 5-fold cross-validation.
# Metric order inside every group is Acc, F1, Rec, Pre.
# The random-forest group previously repeated 'logistic_F1' / 'naivebayes_Rec',
# which produced duplicate column names; they are now 'randomforest_F1' /
# 'randomforest_Rec'. The existing '5fold' / '5Fold' spellings are kept as-is.
score_result = pd.DataFrame(columns=['logistic_Acc', 'logistic_F1', 'logistic_Rec', 'logistic_Pre',
                                     'naivebayes_Acc', 'naivebayes_F1', 'naivebayes_Rec', 'naivebayes_Pre',
                                     'randomforest_Acc', 'randomforest_F1', 'randomforest_Rec', 'randomforest_Pre',
                                     'logistic_5fold_Acc', 'logistic_5fold_F1', 'logistic_5fold_Rec', 'logistic_5fold_Pre',
                                     'naivebayes_5Fold_Acc', 'naivebayes_5Fold_F1', 'naivebayes_5Fold_Rec', 'naivebayes_5Fold_Pre',
                                     'randomforest_5Fold_Acc', 'randomforest_5Fold_F1', 'randomforest_5Fold_Rec', 'randomforest_5Fold_Pre'])

In [None]:
def logistic_reg(x_train, y_train, x_valid, y_valid):
    """Fit a logistic regression on the training split and score it on the validation split.

    Returns the four values of ``score(pred, y_valid)`` in the order they are
    unpacked here (accuracy, F1, recall, precision). ``score`` is defined
    outside this part of the notebook.
    """
    # Train the classifier and predict the validation labels in one chain
    predictions = (
        LogisticRegression(random_state=0, max_iter=500)
        .fit(x_train, y_train)
        .predict(x_valid)
    )

    # Compute the classification scores
    acc, f1, rec, prec = score(predictions, y_valid)
    return acc, f1, rec, prec

In [None]:
def naivebayes_clf(x_train, y_train, x_valid, y_valid):
    """Fit a Gaussian naive Bayes classifier on the training split and score it on the validation split.

    Returns the four values of ``score(pred, y_valid)`` in the order they are
    unpacked here (accuracy, F1, recall, precision). ``score`` is defined
    outside this part of the notebook.
    """
    # Train the classifier and predict the validation labels in one chain
    predictions = GaussianNB().fit(x_train, y_train).predict(x_valid)

    # Compute the classification scores
    acc, f1, rec, prec = score(predictions, y_valid)
    return acc, f1, rec, prec

In [None]:
def randomforest_clf(x_train, y_train, x_valid, y_valid):
    """Fit a random forest (max depth 16) on the training split and score it on the validation split.

    Returns the four values of ``score(pred, y_valid)`` in the order they are
    unpacked here (accuracy, F1, recall, precision). ``score`` is defined
    outside this part of the notebook.
    """
    # Train the classifier and predict the validation labels in one chain
    predictions = (
        RandomForestClassifier(max_depth=16, random_state=0)
        .fit(x_train, y_train)
        .predict(x_valid)
    )

    # Compute the classification scores
    acc, f1, rec, prec = score(predictions, y_valid)
    return acc, f1, rec, prec

In [None]:
def kfold_clf(model, feature, target):
    """Evaluate ``model`` with shuffled 5-fold cross-validation and return the mean scores.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Re-fitted in place on every fold.
    feature : pandas.DataFrame
        Feature matrix; rows are selected by position.
    target : pandas.Series or numpy.ndarray
        Labels aligned row-for-row with ``feature``.

    Returns
    -------
    tuple of float
        Fold means of the four values returned by ``score(pred, y_valid)``,
        in the same positional order in which ``score`` returns them.

    Notes
    -----
    ``score`` is defined outside this part of the notebook. Its return order is
    assumed to be (accuracy, F1, recall, precision), matching how the
    single-split helpers unpack it and the Acc/F1/Rec/Pre column order of
    ``score_result`` -- TODO confirm against ``score``. The positional order of
    the returned tuple is unchanged; only the local names and the printed
    precision/recall labels now follow that order.
    """
    # KFold splitter for the 5 folds, plus one list of per-fold values per metric
    kfold = KFold(n_splits=5, random_state=0, shuffle=True)
    cv_accuracy = []
    cv_f1score = []
    cv_recall = []
    cv_precision = []

    # kfold.split yields positional indices, so labels must be taken by
    # position as well: plain ``target[idx]`` on a Series is label-based and
    # only works by accident with a default RangeIndex.
    take_target = target.iloc if hasattr(target, 'iloc') else target

    for n_iter, (train_index, validate_index) in enumerate(kfold.split(feature), start=1):
        # Select this fold's train / validation rows
        X_train, X_valid = feature.iloc[train_index, :], feature.iloc[validate_index, :]
        y_train, y_valid = take_target[train_index], take_target[validate_index]

        # Fit and predict
        model.fit(X_train, y_train)
        pred = model.predict(X_valid)
        print(f' ========= {n_iter} =========')

        # Score this fold (order assumed: accuracy, F1, recall, precision)
        accuracy, f1score, recall, precision = score(pred, y_valid)

        # Store the per-fold values
        cv_accuracy.append(accuracy)
        cv_f1score.append(f1score)
        cv_recall.append(recall)
        cv_precision.append(precision)

    mean_accuracy = np.mean(cv_accuracy)
    mean_f1score = np.mean(cv_f1score)
    mean_recall = np.mean(cv_recall)
    mean_precision = np.mean(cv_precision)

    # Report the averages over all folds
    print(' ======== 최종 ========')
    print(' 평균검증 정확도   : ', mean_accuracy)
    print(' 평균검증 f1-score : ', mean_f1score)
    print(' 평균검증 정밀도   : ', mean_precision)
    print(' 평균검증 재현율   : ', mean_recall)
    print(' \n')

    # Positions 3 and 4 still carry score()'s 3rd and 4th values, as before.
    return mean_accuracy, mean_f1score, mean_recall, mean_precision

In [None]:
def total_model(data):
    """Run every model on ``data`` and collect all scores in one flat list.

    The embeddings are turned into a labelled frame with ``make_df`` and split
    with ``data_split``. Logistic regression, Gaussian naive Bayes and a random
    forest are scored first on the hold-out split and then with ``kfold_clf``.

    Returns
    -------
    list
        24 values: four scores per model for the hold-out split (3 models),
        followed by four mean scores per model for the 5-fold runs (3 models).
    """
    # Convert the embedding arrays to a DataFrame, then split train / validation
    frame = make_df(data)
    x_train, x_valid, y_train, y_valid, feature, target = data_split(frame)
    score_list = []
    print(' 데이터 가공 완료\n')

    # Hold-out evaluation: (banner, scoring function) per model, in report order
    holdout_runs = (
        ('      ----------    Logistic Regression Result    ----------      ', logistic_reg),
        ('      ---------- Naive Bayes Classification Result ----------      ', naivebayes_clf),
        ('      ---------- RandomForest Classifcation Result ----------      ', randomforest_clf),
    )
    for banner, evaluate in holdout_runs:
        print(banner)
        s1, s2, s3, s4 = evaluate(x_train, y_train, x_valid, y_valid)
        score_list.extend((s1, s2, s3, s4))

    # 5-fold evaluation: (banner, estimator factory) per model, in report order
    print(' \n')
    kfold_runs = (
        ('   ----------    [5-Fold] Logistic Regression Result    ----------   ',
         lambda: LogisticRegression(random_state=0, max_iter=500)),
        ('   ---------- [5-Fold] Naive Bayes Classification Result ----------   ',
         lambda: GaussianNB()),
        ('   ---------- [5-Fold] RandomForest Classifcation Result ----------   ',
         lambda: RandomForestClassifier(max_depth=16, random_state=0)),
    )
    for banner, build_estimator in kfold_runs:
        print(banner)
        s1, s2, s3, s4 = kfold_clf(build_estimator(), feature, target)
        score_list.extend((s1, s2, s3, s4))

    return score_list

In [None]:
# Load the pre-computed embedding arrays; the path is relative to the kernel's
# working directory.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which can
# execute arbitrary code -- only load .npy files that come from a trusted source.
eng_ratio5_4gram = np.load('eng_ratio5_4gram_embedding.npy', allow_pickle=True)

FileNotFoundError: ignored

In [None]:
# Build the labelled frame from the loaded embeddings and split it into
# train / validation sets.
df = make_df(eng_ratio5_4gram)
x_train, x_valid, y_train, y_valid, feature, target = data_split(df)

# Fit the three base classifiers on the training split, using the same
# hyper-parameters as the per-model helper functions.
# NOTE(review): VotingClassifier.fit clones and refits its estimators, so the
# voting model built from these objects does not reuse the fits done here.
reg = LogisticRegression(random_state=0, max_iter=500)
reg.fit(x_train, y_train)

clf = RandomForestClassifier(max_depth=16, random_state=0)
clf.fit(x_train, y_train)

gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier


# Base estimators that take part in the voting ensemble, as (name, estimator) pairs.


single_models = [
    ('GaussianNB', gnb),
    ('RandomForestClassifier', clf),
    ('LogisticRegression', reg)
]


In [None]:
# Ensemble over the (name, estimator) pairs in single_models.
# voting='soft' combines the estimators' predicted class probabilities rather
# than their hard label votes.
voting_model = VotingClassifier(single_models, voting='soft')

In [None]:
# Fit the voting ensemble on the training split and report its accuracy on the
# validation split. (Fixes the `x_vali  d` typo, which was a syntax error, and
# drops the leftover commented-out regressor lines.)
voting_model.fit(x_train, y_train)
y_pred = voting_model.predict(x_valid)
print('VOTING ACCURACY:', accuracy_score(y_valid, y_pred))

VOTING ACCURACY: 0.90363462458154


In [None]:
# Persist the fitted voting ensemble to a binary pickle file in the working directory.
# NOTE(review): a pickle file must only ever be loaded from a trusted source.
with open('saved_voting_model_file', 'wb') as f:
    pickle.dump(voting_model, f)

In [None]:
# Everything below this line is unused, commented-out experiment code -- ignore it.
# from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

In [None]:
# # default value로 학습
# gbr = GradientBoostingRegressor(random_state=1)
# gbr.fit(x_train, y_train)


In [None]:
# # evaluation plot

# from sklearn.metrics import mean_absolute_error, mean_squared_error
# import pandas as pd
# import numpy as np
# from IPython.display import Image
# import matplotlib.pyplot as plt
# import seaborn as sns
# my_predictions = {}
# colors = ['r', 'c', 'm', 'y', 'k', 'khaki', 'teal', 'orchid', 'sandybrown',
#           'greenyellow', 'dodgerblue', 'deepskyblue', 'rosybrown', 'firebrick',
#           'deeppink', 'crimson', 'salmon', 'darkred', 'olivedrab', 'olive',
#           'forestgreen', 'royalblue', 'indigo', 'navy', 'mediumpurple', 'chocolate',
#           'gold', 'darkorange', 'seagreen', 'turquoise', 'steelblue', 'slategray',
#           'peru', 'midnightblue', 'slateblue', 'dimgray', 'cadetblue', 'tomato'
#          ]

# def mse_eval(name_, actual, pred):
#     global predictions
#     global colors

#     plot_predictions(name_, actual, pred)

#     mse = mean_squared_error(actual, pred)
#     my_predictions[name_] = mse

#     y_value = sorted(my_predictions.items(), key=lambda x: x[1], reverse=True)

#     df = pd.DataFrame(y_value, columns=['model', 'mse'])
#     print(df)
#     min_ = df['mse'].min() - 10
#     max_ = df['mse'].max() + 10

#     length = len(df)

#     plt.figure(figsize=(10, length))
#     ax = plt.subplot()
#     ax.set_yticks(np.arange(len(df)))
#     ax.set_yticklabels(df['model'], fontsize=15)
#     bars = ax.barh(np.arange(len(df)), df['mse'])

#     for i, v in enumerate(df['mse']):
#         idx = np.random.choice(len(colors))
#         bars[i].set_color(colors[idx])
#         ax.text(v + 2, i, str(round(v, 3)), color='k', fontsize=15, fontweight='bold')

#     plt.title('MSE Error', fontsize=18)
#     plt.xlim(min_, max_)

#     plt.show()
# # prediction plot
# def plot_predictions(name_, actual, pred):
#     df = pd.DataFrame({'actual': y_valid, 'prediction': pred})
#     df = df.sort_values(by='actual').reset_index(drop=True)

#     plt.figure(figsize=(12, 9))
#     plt.scatter(df.index, df['prediction'], marker='x', color='r')
#     plt.scatter(df.index, df['actual'], alpha=0.7, marker='o', color='black')
#     plt.title(name_, fontsize=15)
#     plt.legend(['prediction', 'actual'], fontsize=12)
#     plt.show()

