In [5]:
# module import

import pandas as pd
import numpy as np
import random
import os
from glob import glob
from tqdm import tqdm
import re
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords')
from scipy.stats import norm

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_ind
import time
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 소유격 변경을 위한 변수

contractions = {"'cause": 'because',
 "I'd": 'I would',
 "I'd've": 'I would have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'm": 'I am',
 "I've": 'I have',
 "ain't": 'is not',
 "aren't": 'are not',
 "can't": 'cannot',
 "could've": 'could have',
 "couldn't": 'could not',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he would',
 "he'll": 'he will',
 "he's": 'he is',
 "here's": 'here is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "i'd": 'i would',
 "i'd've": 'i would have',
 "i'll": 'i will',
 "i'll've": 'i will have',
 "i'm": 'i am',
 "i've": 'i have',
 "isn't": 'is not',
 "it'd": 'it would',
 "it'd've": 'it would have',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it's": 'it is',
 "let's": 'let us',
 "ma'am": 'madam',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "o'clock": 'of the clock',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "sha'n't": 'shall not',
 "shan't": 'shall not',
 "shan't've": 'shall not have',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "she'll": 'she will',
 "she'll've": 'she will have',
 "she's": 'she is',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',
 "so's": 'so as',
 "so've": 'so have',
 "that'd": 'that would',
 "that'd've": 'that would have',
 "that's": 'that is',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "there's": 'there is',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they're": 'they are',
 "they've": 'they have',
 "this's": 'this is',
 "to've": 'to have',
 "wasn't": 'was not',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we're": 'we are',
 "we've": 'we have',
 "weren't": 'were not',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "what're": 'what are',
 "what's": 'what is',
 "what've": 'what have',
 "when's": 'when is',
 "when've": 'when have',
 "where'd": 'where did',
 "where's": 'where is',
 "where've": 'where have',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who's": 'who is',
 "who've": 'who have',
 "why's": 'why is',
 "why've": 'why have',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "you'd": 'you would',
 "you'd've": 'you would have',
 "you'll": 'you will',
 "you'll've": 'you will have',
 "you're": 'you are',
 "you've": 'you have'}

## 원본 데이터

In [7]:
# 데이터 불러오기
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

In [8]:
def make_dataset(df_normal, df_abnormal):

    # 데이터 랜덤으로 추출
    idx = np.random.permutation(len(df_normal))
    
    # train_data는 df_normal에서 60%로 구성
    # test_data는 test_dataset_normal, test_dataset_abnormal로 구성 -> 이상 데이터가 test_data 절반 이상부터 나오게 하기 위함
    train_dataset, test_dataset_normal, test_dataset_abnormal = df_normal.iloc[idx[:int(len(df_normal)*0.6)]], df_normal.iloc[idx[int(len(df_normal)*0.6):int(len(df_normal)*0.6)+int(len(df_normal)*0.2)]], df_normal.iloc[idx[int(len(df_normal)*0.6)+int(len(df_normal)*0.2):]]
    
    # 카테고리 생성
    category_tr = [0] * len(train_dataset)
    train_dataset['category'] = category_tr

    category_te = [0] * len(test_dataset_normal)
    test_dataset_normal['category'] = category_te

    category_ab_te = [0] * len(test_dataset_abnormal)
    test_dataset_abnormal['category'] = category_ab_te

    # 비정상 데이터 추출
    ab_idx = np.random.permutation(len(df_abnormal))
    ab_data = df_abnormal.iloc[ab_idx[:int(len(ab_idx)*0.2)]]

    ab_category = [1] * len(ab_data)
    ab_data['category'] = ab_category

    # test_dataset에서 절반 이상에서 이상치가 나올 수 있도록 만듬
    test_dataset_abnormal_ = pd.concat([test_dataset_abnormal, ab_data], axis=0)
    test_ab_idx = np.random.permutation(len(test_dataset_abnormal_))
    test_dataset_abnormal_fi = test_dataset_abnormal_.iloc[test_ab_idx]
    
    test_dataset = pd.concat([test_dataset_normal, test_dataset_abnormal_fi], axis=0)
    test_dataset = test_dataset.reset_index(drop=True)

    return train_dataset, test_dataset


In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")


def sentence_embedding(sentence):
    input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
    # 문장이 너무 길어서 토큰 갯수가 512가 넘을 때 -> 512개 까지만 사용
    if len(input_ids[0])>512:
        input_ids = input_ids[0][:512]
        input_ids = input_ids.unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze()
    return sentence_embedding.detach().numpy()

# 문장을 임베딩 한 후 문장들의 임베딩 값을 평균내어 전체 문서의 임베딩 값을 구함
def make_vector(docs):
    train_docs_vector = []
    stop_words = set(stopwords.words('english'))  # 영어 stopwords를 사용할 경우

    for sentences in tqdm(docs):
        sentence_vector = []
        for sentence in sentences.split('. '):
            # stopwords를 제거한 후에 sentence_embedding 수행
            sentence_clean = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])
            if sentence_clean.strip() != '':
                sentence_vector.append(sentence_embedding(sentence_clean))
            else:
                sentence_vector.append(sentence_embedding(sentence))
            
        train_docs_vector.append(sentence_vector)

    docs_embedding = np.array([np.mean(train_docs_vector[idx], axis=0) for idx in range(len(train_docs_vector))])
    return docs_embedding

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Business

In [10]:
# 비정상 데이터인 Business를 제외한 정상 데이터
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [11]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data) # <parsing> 제거
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_) # <parsing> 제거
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1715/1715 [00:00<00:00, 9122.15it/s]
100%|██████████| 510/510 [00:00<00:00, 4138.64it/s]


In [154]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50] # K-means의 클러스터 갯수 
threshold = np.arange(0,4.5,0.1) # 데이터와 클러스터의 중앙값으로부터의 거리에서 이상치로 판단할 기준
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    # 시작 시간
    start_time = time.time()
    # train_data, test_data만들기
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    # 텍스트 데이타 임베딩
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    # 모델이 3개 임으로 
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # 데이터와 클러스터 중심값으로부터 거리
        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
        
            # 데이터와 중앙값 거리 구함
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
        
            # 어느 거리부터 이상치로 정할지 나타내는 t를 통해 predict_ensembel구함
            # predict_ensembel -> [0,0,2,3,1,2,3,.....], 길이는 test_docs_embedding 갯수 만큼 
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
        
            # 2개 이상 모델에서 이상치 판단 했을 때 실제 이상치로 판단
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            
            # f1스코어가 가장 큰 클러스터 갯수(c), 거리 기준값(t)를 구함
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict



    ###########################################################################################
    ######################################여기서 부터 추론######################################
    ###########################################################################################    
    
    # f1스코어가 최대가 되는 kmeans모델
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    
    # 그 모델로부터 추론
    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
    
    # 각각의 모델에서 이상치 여부에 대해 2개 이상의 모델에서 이상치 판단 했을 때 실제 이상치로 판단
    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    # 윈도우 사이즈
    window_size = 80
    ref_window = predict[:window_size] # 레퍼런스 윈도우는 예상한 값의 처음부터 window size까지
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size  # 실제 처음 이상치가 들어간 시점(index)
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]  # 실제 처음 이상치가 들어간 시점 부터 윈도우 사이즈 만큼 compare_window생성
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)  # 1과 0 비율 구함
        first_ab_idx+=1   # 1씩 움지이면서 ref_winodw와 compare_window의 1비율을 보며 비교 -> ttest의 p-value값이 0.05보다 작아지면 실제로 새로운 토픽을 찾은거임
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1)) # 최종 걸린 시간

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1028/1028 [05:33<00:00,  3.09it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (535 

몇 번째인지: 390
Delay: 59
Group 1 proportion: 0.125
Group 2 proportion: 0.312
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1028/1028 [05:34<00:00,  3.07it/s]
100%|██████████| 788/788 [04:05<00:00,  3.21it/s]
 20%|██        | 2/10 [20:51<1:23:24, 625.50s/it]

몇 번째인지: 453
Delay: 95
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1028/1028 [05:26<00:00,  3.15it/s]
100%|██████████| 788/788 [04:13<00:00,  3.11it/s]
 30%|███       | 3/10 [31:16<1:12:57, 625.30s/it]

몇 번째인지: 469
Delay: 103
Group 1 proportion: 0.113
Group 2 proportion: 0.300
t-statistic: -2.994
p-value: 0.003


100%|██████████| 1028/1028 [05:30<00:00,  3.11it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
 40%|████      | 4/10 [41:39<1:02:27, 624.50s/it]

몇 번째인지: 426
Delay: 80
Group 1 proportion: 0.113
Group 2 proportion: 0.300
t-statistic: -2.994
p-value: 0.003


100%|██████████| 1028/1028 [05:28<00:00,  3.13it/s]
100%|██████████| 788/788 [04:10<00:00,  3.15it/s]
 50%|█████     | 5/10 [52:02<51:58, 623.75s/it]  

몇 번째인지: 359
Delay: 47
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1028/1028 [05:26<00:00,  3.15it/s]
100%|██████████| 788/788 [04:09<00:00,  3.15it/s]
 60%|██████    | 6/10 [1:02:22<41:31, 622.80s/it]

몇 번째인지: 354
Delay: 42
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1028/1028 [05:25<00:00,  3.16it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
100%|██████████| 1028/1028 [05:28<00:00,  3.13it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
 80%|████████  | 8/10 [1:23:01<20:42, 621.04s/it]

몇 번째인지: 449
Delay: 93
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1028/1028 [05:35<00:00,  3.07it/s]
100%|██████████| 788/788 [04:00<00:00,  3.28it/s]
 90%|█████████ | 9/10 [1:33:20<10:20, 620.37s/it]

몇 번째인지: 437
Delay: 86
Group 1 proportion: 0.100
Group 2 proportion: 0.275
t-statistic: -2.891
p-value: 0.004


100%|██████████| 1028/1028 [05:25<00:00,  3.16it/s]
100%|██████████| 788/788 [04:08<00:00,  3.16it/s]
100%|██████████| 10/10 [1:43:38<00:00, 621.84s/it]

몇 번째인지: 450
Delay: 92
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004





In [156]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [626.3, 624.9, 625.1, 623.3, 622.4, 621.0, 617.4, 621.1, 618.9, 618.1]
평균 621.85


In [157]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [59, 95, 103, 80, 47, 42, 'none', 93, 86, 92]


In [257]:
np.mean([59, 95, 103, 80, 47, 42, 93, 86, 92])

77.44444444444444

In [158]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8629441624365483, 0.8743654822335025, 0.8857868020304569, 0.8527918781725888, 0.8781725888324873, 0.8654822335025381, 0.8984771573604061, 0.9035532994923858, 0.8743654822335025, 0.8857868020304569]
평균 0.8781725888324873


In [159]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.48125, 0.5103448275862069, 0.5454545454545454, 0.4551282051282051, 0.5238095238095238, 0.48717948717948717, 0.5932203389830508, 0.63, 0.5100671140939598, 0.5508474576271186]
평균 0.5287301499862098


In [160]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.5877862595419847, 0.5991902834008097, 0.6153846153846153, 0.5503875968992248, 0.5789473684210527, 0.5891472868217054, 0.6363636363636364, 0.6237623762376239, 0.6055776892430279, 0.5909090909090909]
평균 0.5977456203222771


### 나머지 토픽에 대해서는 이하 동일

## 요약 데이터(Summary)

In [14]:
from glob import glob
import re
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import spacy
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from nltk.stem import PorterStemmer

import pytextrank
import spacy
nlp = spacy.load("en_core_web_sm")

import torch
from transformers import BartTokenizer, BartForConditionalGeneration

import warnings
warnings.filterwarnings("ignore")

from icecream import ic
from math import sqrt
from operator import itemgetter
nlp.add_pipe("textrank", last=True)

<pytextrank.base.BaseTextRankFactory at 0x7fd0b1432680>

In [15]:
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

In [72]:
# 문장을 textrank를 활용해 요약해주는 함수
def text_r(text):
    doc = nlp(text)
    sent_bounds = [ [s.start, s.end, set([])] for s in doc.sents ]
    
    # 최대 문장의 50% 길이로 텍스트 요약
    limit_phrases = len(sent_tokenize(text))//2

    phrase_id = 0
    unit_vector = []

    for p in doc._.phrases:

        unit_vector.append(p.rank)

        for chunk in p.chunks:

            for sent_start, sent_end, sent_vector in sent_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    sent_vector.add(phrase_id)
                    break

        phrase_id += 1

        if phrase_id == limit_phrases:
            break

    sum_ranks = sum(unit_vector)

    unit_vector = [ rank/sum_ranks for rank in unit_vector ]

    sent_rank = {}
    sent_id = 0

    for sent_start, sent_end, sent_vector in sent_bounds:
        sum_sq = 0.0
        for phrase_id in range(len(unit_vector)):

            if phrase_id not in sent_vector:
                sum_sq += unit_vector[phrase_id]**2.0

        sent_rank[sent_id] = sqrt(sum_sq)
        sent_id += 1

    sorted(sent_rank.items(), key=itemgetter(1)) 

    # 50% 요약
    limit_sentences = len(sent_tokenize(text))//2

    sent_text = {}
    sent_id = 0

    for sent in doc.sents:
        sent_text[sent_id] = sent.text
        sent_id += 1

    num_sent = 0
    sum_text = []
        
    for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1)):
        sum_text.append(sent_text[sent_id])
        num_sent += 1

        if num_sent == limit_sentences:
            break
    return sum_text

#### Business

In [73]:
# 비정상 데이터인 Business를 제외한 정상 데이터
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [74]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data) # <parsing> 제거
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_) # <parsing> 제거
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1715/1715 [00:00<00:00, 9226.34it/s]
100%|██████████| 510/510 [00:00<00:00, 10162.84it/s]


In [75]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]  # K-means의 클러스터 갯수 
threshold = np.arange(0,4.5,0.1) # 데이터와 클러스터의 중앙값으로부터의 거리에서 이상치로 판단할 기준
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    # 시작 시간
    start_time = time.time()

    # train_data, test_data
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    # train_data의 원본 데이터 요약
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    # test_data의 원본 데이터 요약
    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    # 문서 임베딩
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    best_score = 0
    # 모델이 3개 임으로
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # 데이터와 클러스터 중심값으로부터 거리
        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
        
                        
            # 데이터와 클러스터의 중심값 거리 구함
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                
            # 어느 거리부터 이상치로 정할지 나타내는 t를 통해 predict_ensembel구함
            # predict_ensembel -> [0,0,2,3,1,2,3,.....], 길이는 test_docs_embedding 갯수 만큼 
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            
            
            # 2개 이상 모델에서 이상치 판단 했을 때 실제 이상치로 판단
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            
            
            # f1스코어가 가장 큰 클러스터 갯수(c), 거리 기준값(t)를 구함
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    ###########################################################################################
    ######################################여기서 부터 추론######################################
    ###########################################################################################    
    
    # f1스코어가 최대가 되는 kmeans모델
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    
    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    
    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    window_size = 80
    ref_window = predict[:window_size] # 레퍼런스 윈도우는 예상한 값의 처음부터 window size까지
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size # 실제 처음 이상치가 들어간 시점(index)
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size] # 실제 처음 이상치가 들어간 시점 부터 윈도우 사이즈 만큼 compare_window생성
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window) # 0과 1비율
        first_ab_idx+=1 # 1씩 움지이면서 ref_winodw와 compare_window의 1비율을 보며 비교 -> ttest의 p-value값이 0.05보다 작아지면 실제로 새로운 토픽을 찾은거임

        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Runnin

몇 번째인지: 351
Delay: 41
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1029/1029 [02:50<00:00,  6.04it/s]
100%|██████████| 788/788 [06:11<00:00,  2.12it/s]
 20%|██        | 2/10 [19:09<1:20:33, 604.13s/it]

몇 번째인지: 386
Delay: 58
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1029/1029 [02:43<00:00,  6.28it/s]
100%|██████████| 788/788 [02:00<00:00,  6.53it/s]
 30%|███       | 3/10 [25:58<1:00:06, 515.17s/it]

몇 번째인지: 1023
Delay: 379
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1029/1029 [02:37<00:00,  6.53it/s]
100%|██████████| 788/788 [02:15<00:00,  5.83it/s]
 40%|████      | 4/10 [32:44<47:12, 472.12s/it]  

몇 번째인지: 420
Delay: 77
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1029/1029 [06:09<00:00,  2.78it/s]
100%|██████████| 788/788 [04:52<00:00,  2.70it/s]
 50%|█████     | 5/10 [45:38<48:23, 580.70s/it]

몇 번째인지: 590
Delay: 162
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1029/1029 [02:45<00:00,  6.21it/s]
100%|██████████| 788/788 [02:04<00:00,  6.32it/s]
 60%|██████    | 6/10 [52:19<34:39, 519.88s/it]

몇 번째인지: 1305
Delay: 519
Group 1 proportion: 0.087
Group 2 proportion: 0.667
t-statistic: -3.337
p-value: 0.001


100%|██████████| 1029/1029 [02:55<00:00,  5.87it/s]
100%|██████████| 788/788 [02:15<00:00,  5.81it/s]
 70%|███████   | 7/10 [59:27<24:29, 489.81s/it]

몇 번째인지: 640
Delay: 185
Group 1 proportion: 0.050
Group 2 proportion: 0.200
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1029/1029 [03:11<00:00,  5.38it/s]
100%|██████████| 788/788 [02:15<00:00,  5.80it/s]
100%|██████████| 1029/1029 [02:59<00:00,  5.73it/s]
100%|██████████| 788/788 [02:01<00:00,  6.50it/s]
 90%|█████████ | 9/10 [1:13:43<07:35, 455.51s/it]

몇 번째인지: 957
Delay: 342
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1029/1029 [02:37<00:00,  6.55it/s]
100%|██████████| 788/788 [01:58<00:00,  6.63it/s]
100%|██████████| 10/10 [1:20:10<00:00, 481.01s/it]

몇 번째인지: 447
Delay: 91
Group 1 proportion: 0.125
Group 2 proportion: 0.312
t-statistic: -2.927
p-value: 0.004





In [76]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [407.8, 741.5, 409.3, 406.1, 773.2, 401.8, 427.9, 443.5, 412.5, 386.3]
평균 480.99000000000007


In [77]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [41, 58, 379, 77, 162, 519, 185, 'none', 342, 91]


In [78]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8401015228426396, 0.8642131979695431, 0.8908629441624365, 0.8604060913705583, 0.8908629441624365, 0.8984771573604061, 0.8895939086294417, 0.8807106598984772, 0.8984771573604061, 0.8489847715736041]
평균 0.876269035532995


In [79]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.42857142857142855, 0.48031496062992124, 0.5769230769230769, 0.4722222222222222, 0.5833333333333334, 0.6145833333333334, 0.5714285714285714, 0.5307692307692308, 0.6195652173913043, 0.445859872611465]
평균 0.5323571247213887


In [80]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.5333333333333333, 0.5327510917030568, 0.5825242718446602, 0.5528455284552845, 0.5656565656565657, 0.595959595959596, 0.5797101449275363, 0.5948275862068965, 0.5876288659793814, 0.5405405405405405]
평균 0.5665777524606851
