In [2]:
import pandas as pd
import numpy as np
import random
import os
from glob import glob
from tqdm import tqdm
import re
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords')
from scipy.stats import norm

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_ind
import time
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
contractions = {"'cause": 'because',
 "I'd": 'I would',
 "I'd've": 'I would have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'm": 'I am',
 "I've": 'I have',
 "ain't": 'is not',
 "aren't": 'are not',
 "can't": 'cannot',
 "could've": 'could have',
 "couldn't": 'could not',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he would',
 "he'll": 'he will',
 "he's": 'he is',
 "here's": 'here is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "i'd": 'i would',
 "i'd've": 'i would have',
 "i'll": 'i will',
 "i'll've": 'i will have',
 "i'm": 'i am',
 "i've": 'i have',
 "isn't": 'is not',
 "it'd": 'it would',
 "it'd've": 'it would have',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it's": 'it is',
 "let's": 'let us',
 "ma'am": 'madam',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "o'clock": 'of the clock',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "sha'n't": 'shall not',
 "shan't": 'shall not',
 "shan't've": 'shall not have',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "she'll": 'she will',
 "she'll've": 'she will have',
 "she's": 'she is',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',
 "so's": 'so as',
 "so've": 'so have',
 "that'd": 'that would',
 "that'd've": 'that would have',
 "that's": 'that is',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "there's": 'there is',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they're": 'they are',
 "they've": 'they have',
 "this's": 'this is',
 "to've": 'to have',
 "wasn't": 'was not',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we're": 'we are',
 "we've": 'we have',
 "weren't": 'were not',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "what're": 'what are',
 "what's": 'what is',
 "what've": 'what have',
 "when's": 'when is',
 "when've": 'when have',
 "where'd": 'where did',
 "where's": 'where is',
 "where've": 'where have',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who's": 'who is',
 "who've": 'who have',
 "why's": 'why is',
 "why've": 'why have',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "you'd": 'you would',
 "you'd've": 'you would have',
 "you'll": 'you will',
 "you'll've": 'you will have',
 "you're": 'you are',
 "you've": 'you have'}

In [4]:
def make_dataset(df_normal, df_abnormal):

    idx = np.random.permutation(len(df_normal))
    
    # df_abnormal에서 비율에 맞게 train_dataset, test_daset_normal, test_dataset_abnormal 생성
    # 모두 정상 데이터
    train_dataset, test_dataset_normal, test_dataset_abnormal = df_normal.iloc[idx[:int(len(df_normal)*0.6)]], df_normal.iloc[idx[int(len(df_normal)*0.6):int(len(df_normal)*0.6)+int(len(df_normal)*0.2)]], df_normal.iloc[idx[int(len(df_normal)*0.6)+int(len(df_normal)*0.2):]]
    
    # category 생성
    category_tr = [0] * len(train_dataset)
    train_dataset['category'] = category_tr

    category_te = [0] * len(test_dataset_normal)
    test_dataset_normal['category'] = category_te

    category_ab_te = [0] * len(test_dataset_abnormal)
    test_dataset_abnormal['category'] = category_ab_te

    # 비정상 데이터 추출
    ab_idx = np.random.permutation(len(df_abnormal))
    ab_data = df_abnormal.iloc[ab_idx[:int(len(ab_idx)*0.2)]]

    ab_category = [1] * len(ab_data)
    ab_data['category'] = ab_category

    # test_dataset에서 절반 이상에서 이상치가 나올 수 있도록 만듬
    test_dataset_abnormal_ = pd.concat([test_dataset_abnormal, ab_data], axis=0)
    test_ab_idx = np.random.permutation(len(test_dataset_abnormal_))
    test_dataset_abnormal_fi = test_dataset_abnormal_.iloc[test_ab_idx]
    
    test_dataset = pd.concat([test_dataset_normal, test_dataset_abnormal_fi], axis=0)
    test_dataset = test_dataset.reset_index(drop=True)

    return train_dataset, test_dataset


In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

def sentence_embedding(sentence):
    input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
    if len(input_ids[0])>512:
        input_ids = input_ids[0][:512]
        input_ids = input_ids.unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze()
    return sentence_embedding.detach().numpy()


def make_vector(docs):
    train_docs_vector = []
    stop_words = set(stopwords.words('english'))  # 영어 stopwords를 사용할 경우

    for sentences in tqdm(docs):
        sentence_vector = []
        for sentence in sentences.split('. '):
            # stopwords를 제거한 후에 sentence_embedding 수행
            sentence_clean = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])
            if sentence_clean.strip() != '':
                sentence_vector.append(sentence_embedding(sentence_clean))
            else:
                sentence_vector.append(sentence_embedding(sentence))
            
        train_docs_vector.append(sentence_vector)

    docs_embedding = np.array([np.mean(train_docs_vector[idx], axis=0) for idx in range(len(train_docs_vector))])
    return docs_embedding

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# # test embedding
# with open('./business_10_test_docs_embedding.pickle', 'rb') as f:
#     test_docs_embedding = pickle.load(f)

# # train embedding
# with open('./business_10_train_docs_embedding.pickle', 'rb') as f:
#     train_docs_embedding = pickle.load(f)

# # # valid embedding
# # with open('./valid_docs_embedding.pickle', 'rb') as f:
# #     valid_docs_embedding = pickle.load(f)

# with open('./business_10_test_ans_embedding.pickle', 'rb') as f:
#     test_ans = pickle.load(f)

# with open('./business_10_train_ans_embedding.pickle', 'rb') as f:
#     train_ans = pickle.load(f)

In [7]:
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

### Business

In [None]:
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [None]:
## 텍스트 전처리 ##

txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data) # <태그>제거
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [146]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    # 데이터셋 구축
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    # 데이터 임베딩
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)
    # 가장 좋은 contamination, n_neighbors 파라미터 구하기 위함
    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)
    # 가장 좋은 파라미터를 바탕으로 lof생성
    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    # 윈도우 사이즈 80을 가지고 1씩 전진하면서 비정상의 비율을 확인 및 t분석을 통해 p-value 0.05보다 커지는 지점 확인
    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


몇 번째인지: 350
Delay: 42
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Politic

In [None]:
normal_dataset = df_business+df_sport+df_tech+df_entertain

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_politics):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


### Tech

In [None]:
normal_dataset = df_business+df_sport+df_politics+df_entertain

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_tech):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Sport

In [None]:
normal_dataset = df_business+df_politics+df_tech+df_entertain

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_sport):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Entertain

In [None]:
normal_dataset = df_business+df_sport+df_tech+df_politics

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_entertain):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

## 요약데이터

In [8]:
from glob import glob
import re
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import spacy
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from nltk.stem import PorterStemmer

import pytextrank
import spacy
nlp = spacy.load("en_core_web_sm")

import torch
from transformers import BartTokenizer, BartForConditionalGeneration

import warnings
warnings.filterwarnings("ignore")

from icecream import ic
from math import sqrt
from operator import itemgetter
nlp.add_pipe("textrank", last=True)

<pytextrank.base.BaseTextRankFactory at 0x7fd7fc912200>

In [9]:
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

In [10]:
def text_r(text):
    doc = nlp(text)
    sent_bounds = [ [s.start, s.end, set([])] for s in doc.sents ]
    # limit_phrases = 4
    limit_phrases = len(sent_tokenize(text))//2

    phrase_id = 0
    unit_vector = []

    for p in doc._.phrases:
        # ic(phrase_id, p.text, p.rank)

        unit_vector.append(p.rank)

        for chunk in p.chunks:
            # ic(chunk.start, chunk.end)

            for sent_start, sent_end, sent_vector in sent_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    # ic(sent_start, chunk.start, chunk.end, sent_end)
                    sent_vector.add(phrase_id)
                    break

        phrase_id += 1

        if phrase_id == limit_phrases:
            break

    sum_ranks = sum(unit_vector)

    unit_vector = [ rank/sum_ranks for rank in unit_vector ]

    sent_rank = {}
    sent_id = 0

    for sent_start, sent_end, sent_vector in sent_bounds:
        # ic(sent_vector)
        sum_sq = 0.0
        for phrase_id in range(len(unit_vector)):
            # ic(phrase_id, unit_vector[phrase_id])

            if phrase_id not in sent_vector:
                sum_sq += unit_vector[phrase_id]**2.0

        sent_rank[sent_id] = sqrt(sum_sq)
        sent_id += 1

    sorted(sent_rank.items(), key=itemgetter(1)) 

    # limit_sentences = len(sent_tokenize(text))//3
    limit_sentences = len(sent_tokenize(text))//2

    sent_text = {}
    sent_id = 0

    for sent in doc.sents:
        sent_text[sent_id] = sent.text
        sent_id += 1

    num_sent = 0
    sum_text = []
        
    for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1)):
        # ic(sent_id, sent_text[sent_id])
        sum_text.append(sent_text[sent_id])
        num_sent += 1

        if num_sent == limit_sentences:
            break
    return sum_text

### Business

In [11]:
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [12]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1715/1715 [00:00<00:00, 4889.05it/s]
100%|██████████| 510/510 [00:00<00:00, 4808.88it/s]


In [16]:
test_dataset.category.values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [17]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)

    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum
    
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


100%|██████████| 1029/1029 [05:41<00:00,  3.02it/s]
100%|██████████| 788/788 [03:52<00:00,  3.38it/s]
 10%|█         | 1/10 [11:08<1:40:15, 668.39s/it]

몇 번째인지: 713
Delay: 221
Group 1 proportion: 0.100
Group 2 proportion: 0.275
t-statistic: -2.891
p-value: 0.004




In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Politic

In [None]:
normal_dataset = df_business+df_sport+df_tech+df_entertain

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_politics):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)

    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum
    
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Tech

In [None]:
normal_dataset = df_business+df_sport+df_politics+df_entertain

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_tech):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)

    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum
    
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Sport

In [None]:
normal_dataset = df_business+df_tech+df_politics+df_entertain

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_sport):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)

    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum
    
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Entertain

In [None]:
normal_dataset = df_business+df_tech+df_politics+df_sport

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_entertain):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
conta = np.arange(0.01,0.5,0.01)
neighbor = np.arange(5,70,5)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)

    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum
    
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)

    best_score = 0
    i = len(train_docs_embedding)

    for c in conta:
        for nei in neighbor:
        
            lof = LocalOutlierFactor(n_neighbors=nei, contamination=c, novelty=True)

            lof.fit(train_docs_embedding)

            f1_s = f1_score(np.where(lof.predict(test_docs_embedding)==1, 0, 1), test_dataset.origin.category)

            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,nei]}
                whole_window = np.where(lof.predict(test_docs_embedding)==1, 0, 1)

    lof_fin = LocalOutlierFactor(n_neighbors=best_params[best_score][1], contamination=best_params[best_score][0], novelty=True)
    lof_fin.fit(train_docs_embedding)
    predict = np.where(lof_fin.predict(test_docs_embedding)==1, 0, 1)
    test_acc_scores = accuracy_score(test_dataset.origin.category, predict)
    test_pre_scores = precision_score(test_dataset.origin.category, predict)
    test_rec_scores = recall_score(test_dataset.origin.category, predict)
    test_f1_scores = f1_score(test_dataset.origin.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))


In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))