In [5]:
import pandas as pd
import numpy as np
import random
import os
from glob import glob
from tqdm import tqdm
import re
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords')
from scipy.stats import norm

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

from transformers import AutoTokenizer, AutoModel
import torch
from transformers import BertTokenizer, BertModel, BartForConditionalGeneration
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore')
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import ttest_ind
import time
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
contractions = {"'cause": 'because',
 "I'd": 'I would',
 "I'd've": 'I would have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'm": 'I am',
 "I've": 'I have',
 "ain't": 'is not',
 "aren't": 'are not',
 "can't": 'cannot',
 "could've": 'could have',
 "couldn't": 'could not',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he would',
 "he'll": 'he will',
 "he's": 'he is',
 "here's": 'here is',
 "how'd": 'how did',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "i'd": 'i would',
 "i'd've": 'i would have',
 "i'll": 'i will',
 "i'll've": 'i will have',
 "i'm": 'i am',
 "i've": 'i have',
 "isn't": 'is not',
 "it'd": 'it would',
 "it'd've": 'it would have',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it's": 'it is',
 "let's": 'let us',
 "ma'am": 'madam',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might not have',
 "must've": 'must have',
 "mustn't": 'must not',
 "mustn't've": 'must not have',
 "needn't": 'need not',
 "needn't've": 'need not have',
 "o'clock": 'of the clock',
 "oughtn't": 'ought not',
 "oughtn't've": 'ought not have',
 "sha'n't": 'shall not',
 "shan't": 'shall not',
 "shan't've": 'shall not have',
 "she'd": 'she would',
 "she'd've": 'she would have',
 "she'll": 'she will',
 "she'll've": 'she will have',
 "she's": 'she is',
 "should've": 'should have',
 "shouldn't": 'should not',
 "shouldn't've": 'should not have',
 "so's": 'so as',
 "so've": 'so have',
 "that'd": 'that would',
 "that'd've": 'that would have',
 "that's": 'that is',
 "there'd": 'there would',
 "there'd've": 'there would have',
 "there's": 'there is',
 "they'd": 'they would',
 "they'd've": 'they would have',
 "they'll": 'they will',
 "they'll've": 'they will have',
 "they're": 'they are',
 "they've": 'they have',
 "this's": 'this is',
 "to've": 'to have',
 "wasn't": 'was not',
 "we'd": 'we would',
 "we'd've": 'we would have',
 "we'll": 'we will',
 "we'll've": 'we will have',
 "we're": 'we are',
 "we've": 'we have',
 "weren't": 'were not',
 "what'll": 'what will',
 "what'll've": 'what will have',
 "what're": 'what are',
 "what's": 'what is',
 "what've": 'what have',
 "when's": 'when is',
 "when've": 'when have',
 "where'd": 'where did',
 "where's": 'where is',
 "where've": 'where have',
 "who'll": 'who will',
 "who'll've": 'who will have',
 "who's": 'who is',
 "who've": 'who have',
 "why's": 'why is',
 "why've": 'why have',
 "will've": 'will have',
 "won't": 'will not',
 "won't've": 'will not have',
 "would've": 'would have',
 "wouldn't": 'would not',
 "wouldn't've": 'would not have',
 "y'all": 'you all',
 "y'all'd": 'you all would',
 "y'all'd've": 'you all would have',
 "y'all're": 'you all are',
 "y'all've": 'you all have',
 "you'd": 'you would',
 "you'd've": 'you would have',
 "you'll": 'you will',
 "you'll've": 'you will have',
 "you're": 'you are',
 "you've": 'you have'}

## 원본 데이터

In [7]:
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

In [8]:
def make_dataset(df_normal, df_abnormal):

    idx = np.random.permutation(len(df_normal))
    # train_dataset, test_dataset_normal, test_dataset_abnormal, validation_dataset_normal = df_normal.iloc[idx[:994]], df_normal.iloc[idx[994:1175]], df_normal.iloc[idx[1175:1446]], df_normal.iloc[idx[1446:]]
    train_dataset, test_dataset_normal, test_dataset_abnormal = df_normal.iloc[idx[:int(len(df_normal)*0.6)]], df_normal.iloc[idx[int(len(df_normal)*0.6):int(len(df_normal)*0.6)+int(len(df_normal)*0.2)]], df_normal.iloc[idx[int(len(df_normal)*0.6)+int(len(df_normal)*0.2):]]
    
    category_tr = [0] * len(train_dataset)
    train_dataset['category'] = category_tr

    category_te = [0] * len(test_dataset_normal)
    test_dataset_normal['category'] = category_te

    category_ab_te = [0] * len(test_dataset_abnormal)
    test_dataset_abnormal['category'] = category_ab_te

    ab_idx = np.random.permutation(len(df_abnormal))
    ab_data = df_abnormal.iloc[ab_idx[:int(len(ab_idx)*0.2)]]

    ab_category = [1] * len(ab_data)
    ab_data['category'] = ab_category


    test_dataset_abnormal_ = pd.concat([test_dataset_abnormal, ab_data], axis=0)
    test_ab_idx = np.random.permutation(len(test_dataset_abnormal_))
    test_dataset_abnormal_fi = test_dataset_abnormal_.iloc[test_ab_idx]
    
    test_dataset = pd.concat([test_dataset_normal, test_dataset_abnormal_fi], axis=0)
    test_dataset = test_dataset.reset_index(drop=True)

    return train_dataset, test_dataset


In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# # 문장을 벡터로 변환
# def sentence_embedding(sentence):
#     input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
#     outputs = model(input_ids)
#     last_hidden_states = outputs.last_hidden_state
#     sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze()
#     return sentence_embedding.detach().numpy()

# def make_vector(docs):
#     train_docs_vector = []
#     for sentences in tqdm(docs):
#         sentence_vector = []
#         for sentence in sentences.split('. '):
#             sentence_vector.append(sentence_embedding(sentence))
#         train_docs_vector.append(sentence_vector)

#     docs_embedding = np.array([np.mean(train_docs_vector[idx], axis = 0) for idx in range(len(train_docs_vector))])
#     return docs_embedding

def sentence_embedding(sentence):
    input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
    if len(input_ids[0])>512:
        input_ids = input_ids[0][:512]
        input_ids = input_ids.unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1).squeeze()
    return sentence_embedding.detach().numpy()


def make_vector(docs):
    train_docs_vector = []
    stop_words = set(stopwords.words('english'))  # 영어 stopwords를 사용할 경우

    for sentences in tqdm(docs):
        sentence_vector = []
        for sentence in sentences.split('. '):
            # stopwords를 제거한 후에 sentence_embedding 수행
            sentence_clean = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])
            if sentence_clean.strip() != '':
                sentence_vector.append(sentence_embedding(sentence_clean))
            else:
                sentence_vector.append(sentence_embedding(sentence))
            
        train_docs_vector.append(sentence_vector)

    docs_embedding = np.array([np.mean(train_docs_vector[idx], axis=0) for idx in range(len(train_docs_vector))])
    return docs_embedding

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Business

In [10]:
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [11]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1715/1715 [00:00<00:00, 9122.15it/s]
100%|██████████| 510/510 [00:00<00:00, 4138.64it/s]


In [154]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)


    with open(f'business_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'business_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'business_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'business_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1028/1028 [05:33<00:00,  3.09it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (535 

몇 번째인지: 390
Delay: 59
Group 1 proportion: 0.125
Group 2 proportion: 0.312
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1028/1028 [05:34<00:00,  3.07it/s]
100%|██████████| 788/788 [04:05<00:00,  3.21it/s]
 20%|██        | 2/10 [20:51<1:23:24, 625.50s/it]

몇 번째인지: 453
Delay: 95
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1028/1028 [05:26<00:00,  3.15it/s]
100%|██████████| 788/788 [04:13<00:00,  3.11it/s]
 30%|███       | 3/10 [31:16<1:12:57, 625.30s/it]

몇 번째인지: 469
Delay: 103
Group 1 proportion: 0.113
Group 2 proportion: 0.300
t-statistic: -2.994
p-value: 0.003


100%|██████████| 1028/1028 [05:30<00:00,  3.11it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
 40%|████      | 4/10 [41:39<1:02:27, 624.50s/it]

몇 번째인지: 426
Delay: 80
Group 1 proportion: 0.113
Group 2 proportion: 0.300
t-statistic: -2.994
p-value: 0.003


100%|██████████| 1028/1028 [05:28<00:00,  3.13it/s]
100%|██████████| 788/788 [04:10<00:00,  3.15it/s]
 50%|█████     | 5/10 [52:02<51:58, 623.75s/it]  

몇 번째인지: 359
Delay: 47
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1028/1028 [05:26<00:00,  3.15it/s]
100%|██████████| 788/788 [04:09<00:00,  3.15it/s]
 60%|██████    | 6/10 [1:02:22<41:31, 622.80s/it]

몇 번째인지: 354
Delay: 42
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1028/1028 [05:25<00:00,  3.16it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
100%|██████████| 1028/1028 [05:28<00:00,  3.13it/s]
100%|██████████| 788/788 [04:08<00:00,  3.17it/s]
 80%|████████  | 8/10 [1:23:01<20:42, 621.04s/it]

몇 번째인지: 449
Delay: 93
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1028/1028 [05:35<00:00,  3.07it/s]
100%|██████████| 788/788 [04:00<00:00,  3.28it/s]
 90%|█████████ | 9/10 [1:33:20<10:20, 620.37s/it]

몇 번째인지: 437
Delay: 86
Group 1 proportion: 0.100
Group 2 proportion: 0.275
t-statistic: -2.891
p-value: 0.004


100%|██████████| 1028/1028 [05:25<00:00,  3.16it/s]
100%|██████████| 788/788 [04:08<00:00,  3.16it/s]
100%|██████████| 10/10 [1:43:38<00:00, 621.84s/it]

몇 번째인지: 450
Delay: 92
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004





In [156]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [626.3, 624.9, 625.1, 623.3, 622.4, 621.0, 617.4, 621.1, 618.9, 618.1]
평균 621.85


In [157]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [59, 95, 103, 80, 47, 42, 'none', 93, 86, 92]


In [257]:
np.mean([59, 95, 103, 80, 47, 42, 93, 86, 92])

77.44444444444444

In [158]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8629441624365483, 0.8743654822335025, 0.8857868020304569, 0.8527918781725888, 0.8781725888324873, 0.8654822335025381, 0.8984771573604061, 0.9035532994923858, 0.8743654822335025, 0.8857868020304569]
평균 0.8781725888324873


In [159]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.48125, 0.5103448275862069, 0.5454545454545454, 0.4551282051282051, 0.5238095238095238, 0.48717948717948717, 0.5932203389830508, 0.63, 0.5100671140939598, 0.5508474576271186]
평균 0.5287301499862098


In [160]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.5877862595419847, 0.5991902834008097, 0.6153846153846153, 0.5503875968992248, 0.5789473684210527, 0.5891472868217054, 0.6363636363636364, 0.6237623762376239, 0.6055776892430279, 0.5909090909090909]
평균 0.5977456203222771


### Politic

In [165]:
normal_dataset = df_business+df_sport+df_tech+df_entertain

In [166]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [222]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_politics):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1808/1808 [00:00<00:00, 8892.21it/s]
100%|██████████| 417/417 [00:00<00:00, 7069.08it/s]


In [223]:
# train_docs_vector = []
# stop_words = set(stopwords.words('english'))  # 영어 stopwords를 사용할 경우

# for sentences in tqdm(test_dataset.origin):
#     sentence_vector = []
#     for sentence in sentences.split('. '):
#         # stopwords를 제거한 후에 sentence_embedding 수행
        
#         sentence_clean = ' '.join([word for word in sentence.split() if word.lower() not in stop_words])
#         if sentence_clean.strip() != '':
#             sentence_vector.append(sentence_embedding(sentence_clean))
#         else:
#             sentence_vector.append(sentence_embedding(sentence))

            
#         # sentence_vector.append(sentence_embedding(sentence))
#     train_docs_vector.append(sentence_vector)

# docs_embedding = np.array([np.mean(train_docs_vector[idx], axis=0) for idx in range(len(train_docs_vector))])

In [224]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)


    with open(f'politics{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'politics{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'politics{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'politics{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_params = {f1_s:[c,t]}
                best_score = f1_s
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:3*i])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Runnin

몇 번째인지: 983
Delay: 348
Group 1 proportion: 0.163
Group 2 proportion: 0.362
t-statistic: -2.934
p-value: 0.004


100%|██████████| 1084/1084 [05:32<00:00,  3.26it/s]
100%|██████████| 807/807 [04:09<00:00,  3.24it/s]
100%|██████████| 1084/1084 [05:40<00:00,  3.19it/s]
100%|██████████| 807/807 [04:05<00:00,  3.28it/s]
 40%|████      | 4/10 [41:25<1:02:18, 623.16s/it]

몇 번째인지: 443
Delay: 79
Group 1 proportion: 0.150
Group 2 proportion: 0.350
t-statistic: -2.984
p-value: 0.003


100%|██████████| 1084/1084 [05:25<00:00,  3.33it/s]
100%|██████████| 807/807 [04:08<00:00,  3.25it/s]
 50%|█████     | 5/10 [51:39<51:38, 619.78s/it]  

몇 번째인지: 420
Delay: 66
Group 1 proportion: 0.212
Group 2 proportion: 0.425
t-statistic: -2.944
p-value: 0.004


100%|██████████| 1084/1084 [05:26<00:00,  3.32it/s]
100%|██████████| 807/807 [04:22<00:00,  3.07it/s]
100%|██████████| 1084/1084 [05:35<00:00,  3.23it/s]
100%|██████████| 807/807 [04:08<00:00,  3.24it/s]
 70%|███████   | 7/10 [1:12:32<31:09, 623.27s/it]

몇 번째인지: 591
Delay: 153
Group 1 proportion: 0.150
Group 2 proportion: 0.350
t-statistic: -2.984
p-value: 0.003


100%|██████████| 1084/1084 [05:37<00:00,  3.22it/s]
100%|██████████| 807/807 [03:59<00:00,  3.36it/s]
 80%|████████  | 8/10 [1:22:49<20:42, 621.37s/it]

몇 번째인지: 1060
Delay: 377
Group 1 proportion: 0.188
Group 2 proportion: 0.388
t-statistic: -2.848
p-value: 0.005


100%|██████████| 1084/1084 [05:30<00:00,  3.28it/s]
100%|██████████| 807/807 [04:02<00:00,  3.32it/s]
100%|██████████| 1084/1084 [05:25<00:00,  3.33it/s]
100%|██████████| 807/807 [04:08<00:00,  3.25it/s]
100%|██████████| 10/10 [1:43:16<00:00, 619.69s/it]


In [225]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [608.5, 628.1, 623.4, 625.6, 613.8, 629.4, 623.7, 617.3, 614.2, 612.8]
평균 619.68


In [226]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: ['none', 348, 'none', 79, 66, 'none', 153, 377, 'none', 'none']


In [258]:
np.mean([348,  79, 66, 153, 377,])

204.6

In [227]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.828996282527881, 0.8042131350681536, 0.7434944237918215, 0.7695167286245354, 0.7397769516728625, 0.8228004956629492, 0.8017348203221809, 0.781908302354399, 0.7670384138785625, 0.8004956629491945]
평균 0.785997521685254


In [228]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.302158273381295, 0.3271889400921659, 0.25396825396825395, 0.26046511627906976, 0.2529182879377432, 0.32558139534883723, 0.30456852791878175, 0.2731707317073171, 0.24880382775119617, 0.3160377358490566]
평균 0.2864861090233716


In [229]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.3783783783783784, 0.47333333333333333, 0.382089552238806, 0.37583892617449666, 0.38235294117647056, 0.4392156862745098, 0.42857142857142855, 0.3888888888888889, 0.3561643835616438, 0.45423728813559316]
평균 0.40590708067335496


#### Tech

In [230]:
normal_dataset = df_business+df_sport+df_politics+df_entertain

In [231]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [232]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_tech):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1824/1824 [00:00<00:00, 10094.66it/s]
100%|██████████| 401/401 [00:00<00:00, 7839.75it/s]


In [233]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)


    with open(f'tech_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'tech_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'tech_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'tech_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_params = {f1_s:[c,t]}
                best_score = f1_s
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:3*i])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Runnin

몇 번째인지: 401
Delay: 58
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1093/1093 [05:06<00:00,  3.57it/s]
100%|██████████| 810/810 [04:14<00:00,  3.18it/s]
 20%|██        | 2/10 [19:58<1:19:55, 599.42s/it]

몇 번째인지: 714
Delay: 213
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1093/1093 [05:13<00:00,  3.48it/s]
100%|██████████| 810/810 [04:04<00:00,  3.32it/s]
100%|██████████| 1093/1093 [05:11<00:00,  3.51it/s]
100%|██████████| 810/810 [04:05<00:00,  3.30it/s]
100%|██████████| 1093/1093 [05:15<00:00,  3.46it/s]
100%|██████████| 810/810 [04:00<00:00,  3.37it/s]
 50%|█████     | 5/10 [49:50<49:48, 597.67s/it]

몇 번째인지: 286
Delay: 0
Group 1 proportion: 0.150
Group 2 proportion: 0.025
t-statistic: 2.851
p-value: 0.005


100%|██████████| 1093/1093 [05:16<00:00,  3.45it/s]
100%|██████████| 810/810 [04:21<00:00,  3.09it/s]
 60%|██████    | 6/10 [1:00:11<40:21, 605.49s/it]

몇 번째인지: 296
Delay: 2
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1093/1093 [05:27<00:00,  3.34it/s]
100%|██████████| 810/810 [04:07<00:00,  3.27it/s]
 70%|███████   | 7/10 [1:10:27<30:26, 608.99s/it]

몇 번째인지: 394
Delay: 52
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1093/1093 [05:19<00:00,  3.42it/s]
100%|██████████| 810/810 [03:59<00:00,  3.38it/s]
100%|██████████| 1093/1093 [05:16<00:00,  3.45it/s]
100%|██████████| 810/810 [04:04<00:00,  3.31it/s]
100%|██████████| 1093/1093 [05:12<00:00,  3.49it/s]
100%|██████████| 810/810 [04:03<00:00,  3.32it/s]
100%|██████████| 10/10 [1:40:22<00:00, 602.27s/it]


In [234]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [597.7, 600.7, 598.9, 596.3, 597.0, 620.7, 616.2, 599.9, 600.3, 595.1]
평균 602.28


In [235]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [58, 213, 'none', 'none', 0, 2, 52, 'none', 'none', 'none']


In [259]:
np.mean([58, 213, 0, 2, 52])

65.0

In [236]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8419753086419753, 0.8827160493827161, 0.8962962962962963, 0.8716049382716049, 0.8839506172839506, 0.8950617283950617, 0.8691358024691358, 0.8481481481481481, 0.8617283950617284, 0.8629629629629629]
평균 0.8713580246913579


In [237]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.35714285714285715, 0.43478260869565216, 0.47297297297297297, 0.4032258064516129, 0.4351851851851852, 0.4742268041237113, 0.4097222222222222, 0.37988826815642457, 0.3933333333333333, 0.3816793893129771]
평균 0.4142159447596949


In [238]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.48387096774193544, 0.5128205128205128, 0.45454545454545453, 0.49019607843137253, 0.5, 0.519774011299435, 0.5267857142857143, 0.5250965250965252, 0.5130434782608696, 0.47393364928909953]
평균 0.5000066391770919


#### Sport

In [239]:
normal_dataset = df_business+df_politics+df_tech+df_entertain

In [240]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [241]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_sport):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1714/1714 [00:00<00:00, 9157.38it/s]
100%|██████████| 511/511 [00:00<00:00, 11105.82it/s]


In [242]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)


    with open(f'sport_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'sport_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'sport_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'sport_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_params = {f1_s:[c,t]}
                best_score = f1_s
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:3*i])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1027/1027 [05:27<00:00,  3.14it/s]
100%|██████████| 788/788 [04:04<00:00,  3.22it/s]
 10%|█         | 1/10 [10:12<1:31:56, 612.92s/it]

몇 번째인지: 317
Delay: 27
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1027/1027 [05:24<00:00,  3.17it/s]
100%|██████████| 788/788 [04:04<00:00,  3.22it/s]
 20%|██        | 2/10 [20:21<1:21:20, 610.11s/it]

몇 번째인지: 407
Delay: 68
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1027/1027 [05:24<00:00,  3.16it/s]
100%|██████████| 788/788 [04:06<00:00,  3.20it/s]
 30%|███       | 3/10 [30:32<1:11:13, 610.55s/it]

몇 번째인지: 343
Delay: 35
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1027/1027 [05:23<00:00,  3.17it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 788/788 [04:16<00:00,  3.08it/s]
 40%|████      | 4/10 [40:56<1:01:36, 616.09s/it]

몇 번째인지: 462
Delay: 99
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1027/1027 [05:46<00:00,  2.96it/s]
100%|██████████| 788/788 [04:26<00:00,  2.96it/s]
 50%|█████     | 5/10 [51:54<52:35, 631.16s/it]  

몇 번째인지: 460
Delay: 98
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1027/1027 [05:52<00:00,  2.92it/s]
100%|██████████| 788/788 [04:25<00:00,  2.96it/s]
 60%|██████    | 6/10 [1:02:56<42:46, 641.72s/it]

몇 번째인지: 465
Delay: 94
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1027/1027 [05:43<00:00,  2.99it/s]
100%|██████████| 788/788 [04:24<00:00,  2.98it/s]
 70%|███████   | 7/10 [1:13:49<32:15, 645.23s/it]

몇 번째인지: 816
Delay: 276
Group 1 proportion: 0.138
Group 2 proportion: 0.325
t-statistic: -2.867
p-value: 0.005


100%|██████████| 1027/1027 [05:44<00:00,  2.98it/s]
100%|██████████| 788/788 [04:28<00:00,  2.93it/s]
 80%|████████  | 8/10 [1:24:46<21:38, 649.06s/it]

몇 번째인지: 677
Delay: 205
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1027/1027 [05:47<00:00,  2.95it/s]
100%|██████████| 788/788 [04:21<00:00,  3.01it/s]
 90%|█████████ | 9/10 [1:35:37<10:49, 649.61s/it]

몇 번째인지: 460
Delay: 96
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1027/1027 [05:38<00:00,  3.03it/s]
100%|██████████| 788/788 [04:17<00:00,  3.07it/s]
100%|██████████| 10/10 [1:46:13<00:00, 637.36s/it]

몇 번째인지: 447
Delay: 92
Group 1 proportion: 0.050
Group 2 proportion: 0.200
t-statistic: -2.927
p-value: 0.004





In [243]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [612.9, 608.1, 611.1, 624.6, 657.9, 662.2, 652.5, 657.3, 650.8, 636.2]
평균 637.3599999999999


In [244]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [27, 68, 35, 99, 98, 94, 276, 205, 96, 92]


In [260]:
np.mean([27, 68, 35, 99, 98, 94, 276, 205, 96, 92])

109.0

In [245]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.9365482233502538, 0.9111675126903553, 0.932741116751269, 0.916243654822335, 0.9416243654822335, 0.9378172588832487, 0.8934010152284264, 0.9251269035532995, 0.9238578680203046, 0.9175126903553299]
평균 0.9236040609137056


In [246]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.7954545454545454, 0.6333333333333333, 0.7951807228915663, 0.6875, 0.8783783783783784, 0.8533333333333334, 0.5608108108108109, 0.8028169014084507, 0.7019230769230769, 0.6761904761904762]
평균 0.7384921578723971


In [247]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.736842105263158, 0.6846846846846847, 0.7135135135135136, 0.6666666666666667, 0.7386363636363635, 0.7231638418079096, 0.6640000000000001, 0.6589595375722543, 0.7087378640776699, 0.6859903381642511]
평균 0.6981194915386473


#### Entertain

In [248]:
normal_dataset = df_business+df_sport+df_tech+df_politics

In [249]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [250]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_entertain):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1839/1839 [00:00<00:00, 8690.82it/s] 
100%|██████████| 386/386 [00:00<00:00, 7723.10it/s]


In [251]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()
    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_docs_embedding = make_vector(train_dataset.origin)
    test_docs_embedding = make_vector(test_dataset.origin)


    with open(f'entertain_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'entertain_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'entertain_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'entertain_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_params = {f1_s:[c,t]}
                best_score = f1_s
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:3*i])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Runnin

몇 번째인지: 443
Delay: 74
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1102/1102 [05:49<00:00,  3.15it/s]
100%|██████████| 813/813 [04:33<00:00,  2.98it/s]
 20%|██        | 2/10 [22:13<1:28:51, 666.41s/it]

몇 번째인지: 460
Delay: 77
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1102/1102 [05:58<00:00,  3.07it/s]
100%|██████████| 813/813 [04:13<00:00,  3.21it/s]
 30%|███       | 3/10 [33:08<1:17:08, 661.25s/it]

몇 번째인지: 483
Delay: 95
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1102/1102 [05:56<00:00,  3.09it/s]
100%|██████████| 813/813 [04:33<00:00,  2.97it/s]
 40%|████      | 4/10 [44:22<1:06:37, 666.25s/it]

몇 번째인지: 485
Delay: 96
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1102/1102 [06:13<00:00,  2.95it/s]
100%|██████████| 813/813 [04:31<00:00,  2.99it/s]
 50%|█████     | 5/10 [55:52<56:14, 674.86s/it]  

몇 번째인지: 398
Delay: 55
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1102/1102 [06:04<00:00,  3.02it/s]
100%|██████████| 813/813 [04:26<00:00,  3.05it/s]
 60%|██████    | 6/10 [1:07:08<45:01, 675.47s/it]

몇 번째인지: 380
Delay: 45
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1102/1102 [06:22<00:00,  2.88it/s]
100%|██████████| 813/813 [04:42<00:00,  2.88it/s]
 70%|███████   | 7/10 [1:19:00<34:22, 687.35s/it]

몇 번째인지: 665
Delay: 188
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1102/1102 [06:25<00:00,  2.86it/s]
100%|██████████| 813/813 [04:50<00:00,  2.80it/s]
 80%|████████  | 8/10 [1:31:02<23:16, 698.23s/it]

몇 번째인지: 527
Delay: 117
Group 1 proportion: 0.050
Group 2 proportion: 0.200
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1102/1102 [06:05<00:00,  3.02it/s]
100%|██████████| 813/813 [04:15<00:00,  3.18it/s]
 90%|█████████ | 9/10 [1:42:05<11:27, 687.18s/it]

몇 번째인지: 816
Delay: 261
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1102/1102 [06:05<00:00,  3.01it/s]
100%|██████████| 813/813 [04:29<00:00,  3.02it/s]
100%|██████████| 10/10 [1:53:25<00:00, 680.56s/it]

몇 번째인지: 422
Delay: 58
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005





In [252]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [667.5, 665.7, 655.1, 673.9, 690.1, 676.6, 711.8, 721.5, 662.9, 680.4]
평균 680.55


In [253]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [74, 77, 95, 96, 55, 45, 188, 117, 261, 58]


In [261]:
np.mean([74, 77, 95, 96, 55, 45, 188, 117, 261, 58])

106.6

In [254]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.940959409594096, 0.9446494464944649, 0.933579335793358, 0.9348093480934809, 0.9618696186961869, 0.940959409594096, 0.9458794587945879, 0.9372693726937269, 0.9360393603936039, 0.9348093480934809]
평균 0.9410824108241082


In [255]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.7101449275362319, 0.7666666666666667, 0.6533333333333333, 0.7142857142857143, 0.8709677419354839, 0.6933333333333334, 0.746268656716418, 0.6354166666666666, 0.6288659793814433, 0.65]
평균 0.7069283019855291


In [256]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.6712328767123288, 0.6715328467153285, 0.6447368421052633, 0.6015037593984963, 0.7769784172661871, 0.6842105263157895, 0.6944444444444445, 0.7052023121387284, 0.7011494252873565, 0.6624203821656051]
평균 0.6813411832549529


## 요약 데이터

In [14]:
from glob import glob
import re
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import spacy
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from nltk.stem import PorterStemmer

import pytextrank
import spacy
nlp = spacy.load("en_core_web_sm")

import torch
from transformers import BartTokenizer, BartForConditionalGeneration

import warnings
warnings.filterwarnings("ignore")

from icecream import ic
from math import sqrt
from operator import itemgetter
nlp.add_pipe("textrank", last=True)

<pytextrank.base.BaseTextRankFactory at 0x7fd0b1432680>

In [15]:
df_politics = glob('../origin_data//politics/*.txt')
df_sport = glob('../origin_data//sport/*.txt')
df_tech = glob('../origin_data//tech/*.txt')
df_entertain = glob('../origin_data//entertainment/*.txt')
df_business = glob('../origin_data//business/*.txt')

In [72]:
def text_r(text):
    doc = nlp(text)
    sent_bounds = [ [s.start, s.end, set([])] for s in doc.sents ]
    # limit_phrases = 4
    limit_phrases = len(sent_tokenize(text))//2

    phrase_id = 0
    unit_vector = []

    for p in doc._.phrases:
        # ic(phrase_id, p.text, p.rank)

        unit_vector.append(p.rank)

        for chunk in p.chunks:
            # ic(chunk.start, chunk.end)

            for sent_start, sent_end, sent_vector in sent_bounds:
                if chunk.start >= sent_start and chunk.end <= sent_end:
                    # ic(sent_start, chunk.start, chunk.end, sent_end)
                    sent_vector.add(phrase_id)
                    break

        phrase_id += 1

        if phrase_id == limit_phrases:
            break

    sum_ranks = sum(unit_vector)

    unit_vector = [ rank/sum_ranks for rank in unit_vector ]

    sent_rank = {}
    sent_id = 0

    for sent_start, sent_end, sent_vector in sent_bounds:
        # ic(sent_vector)
        sum_sq = 0.0
        for phrase_id in range(len(unit_vector)):
            # ic(phrase_id, unit_vector[phrase_id])

            if phrase_id not in sent_vector:
                sum_sq += unit_vector[phrase_id]**2.0

        sent_rank[sent_id] = sqrt(sum_sq)
        sent_id += 1

    sorted(sent_rank.items(), key=itemgetter(1)) 

    # limit_sentences = len(sent_tokenize(text))//3
    limit_sentences = len(sent_tokenize(text))//2

    sent_text = {}
    sent_id = 0

    for sent in doc.sents:
        sent_text[sent_id] = sent.text
        sent_id += 1

    num_sent = 0
    sum_text = []
        
    for sent_id, rank in sorted(sent_rank.items(), key=itemgetter(1)):
        # ic(sent_id, sent_text[sent_id])
        sum_text.append(sent_text[sent_id])
        num_sent += 1

        if num_sent == limit_sentences:
            break
    return sum_text

#### Business

In [73]:
normal_dataset = df_politics+df_sport+df_tech+df_entertain

In [74]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_business):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1715/1715 [00:00<00:00, 9226.34it/s]
100%|██████████| 510/510 [00:00<00:00, 10162.84it/s]


In [75]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    with open(f'summary_business_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'summary_business_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'summary_business_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'summary_business_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Runnin

몇 번째인지: 351
Delay: 41
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1029/1029 [02:50<00:00,  6.04it/s]
100%|██████████| 788/788 [06:11<00:00,  2.12it/s]
 20%|██        | 2/10 [19:09<1:20:33, 604.13s/it]

몇 번째인지: 386
Delay: 58
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1029/1029 [02:43<00:00,  6.28it/s]
100%|██████████| 788/788 [02:00<00:00,  6.53it/s]
 30%|███       | 3/10 [25:58<1:00:06, 515.17s/it]

몇 번째인지: 1023
Delay: 379
Group 1 proportion: 0.087
Group 2 proportion: 0.263
t-statistic: -2.975
p-value: 0.003


100%|██████████| 1029/1029 [02:37<00:00,  6.53it/s]
100%|██████████| 788/788 [02:15<00:00,  5.83it/s]
 40%|████      | 4/10 [32:44<47:12, 472.12s/it]  

몇 번째인지: 420
Delay: 77
Group 1 proportion: 0.075
Group 2 proportion: 0.237
t-statistic: -2.886
p-value: 0.004


100%|██████████| 1029/1029 [06:09<00:00,  2.78it/s]
100%|██████████| 788/788 [04:52<00:00,  2.70it/s]
 50%|█████     | 5/10 [45:38<48:23, 580.70s/it]

몇 번째인지: 590
Delay: 162
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1029/1029 [02:45<00:00,  6.21it/s]
100%|██████████| 788/788 [02:04<00:00,  6.32it/s]
 60%|██████    | 6/10 [52:19<34:39, 519.88s/it]

몇 번째인지: 1305
Delay: 519
Group 1 proportion: 0.087
Group 2 proportion: 0.667
t-statistic: -3.337
p-value: 0.001


100%|██████████| 1029/1029 [02:55<00:00,  5.87it/s]
100%|██████████| 788/788 [02:15<00:00,  5.81it/s]
 70%|███████   | 7/10 [59:27<24:29, 489.81s/it]

몇 번째인지: 640
Delay: 185
Group 1 proportion: 0.050
Group 2 proportion: 0.200
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1029/1029 [03:11<00:00,  5.38it/s]
100%|██████████| 788/788 [02:15<00:00,  5.80it/s]
100%|██████████| 1029/1029 [02:59<00:00,  5.73it/s]
100%|██████████| 788/788 [02:01<00:00,  6.50it/s]
 90%|█████████ | 9/10 [1:13:43<07:35, 455.51s/it]

몇 번째인지: 957
Delay: 342
Group 1 proportion: 0.062
Group 2 proportion: 0.225
t-statistic: -2.992
p-value: 0.003


100%|██████████| 1029/1029 [02:37<00:00,  6.55it/s]
100%|██████████| 788/788 [01:58<00:00,  6.63it/s]
100%|██████████| 10/10 [1:20:10<00:00, 481.01s/it]

몇 번째인지: 447
Delay: 91
Group 1 proportion: 0.125
Group 2 proportion: 0.312
t-statistic: -2.927
p-value: 0.004





In [76]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [407.8, 741.5, 409.3, 406.1, 773.2, 401.8, 427.9, 443.5, 412.5, 386.3]
평균 480.99000000000007


In [77]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: [41, 58, 379, 77, 162, 519, 185, 'none', 342, 91]


In [78]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8401015228426396, 0.8642131979695431, 0.8908629441624365, 0.8604060913705583, 0.8908629441624365, 0.8984771573604061, 0.8895939086294417, 0.8807106598984772, 0.8984771573604061, 0.8489847715736041]
평균 0.876269035532995


In [79]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.42857142857142855, 0.48031496062992124, 0.5769230769230769, 0.4722222222222222, 0.5833333333333334, 0.6145833333333334, 0.5714285714285714, 0.5307692307692308, 0.6195652173913043, 0.445859872611465]
평균 0.5323571247213887


In [80]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.5333333333333333, 0.5327510917030568, 0.5825242718446602, 0.5528455284552845, 0.5656565656565657, 0.595959595959596, 0.5797101449275363, 0.5948275862068965, 0.5876288659793814, 0.5405405405405405]
평균 0.5665777524606851


### Politic

In [81]:
normal_dataset = df_business+df_sport+df_tech+df_entertain

In [82]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_politics):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1808/1808 [00:00<00:00, 9384.25it/s] 
100%|██████████| 417/417 [00:00<00:00, 7238.41it/s]


In [83]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    with open(f'summary_politic_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'summary_politic_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'summary_politic_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'summary_politic_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Runnin

몇 번째인지: 1307
Delay: 507
Group 1 proportion: 0.225
Group 2 proportion: 0.714
t-statistic: -2.918
p-value: 0.005


100%|██████████| 1084/1084 [02:33<00:00,  7.08it/s]
100%|██████████| 806/806 [01:57<00:00,  6.86it/s]
100%|██████████| 1084/1084 [02:44<00:00,  6.57it/s]
100%|██████████| 806/806 [02:00<00:00,  6.72it/s]
100%|██████████| 1084/1084 [02:31<00:00,  7.16it/s]
100%|██████████| 806/806 [01:59<00:00,  6.74it/s]
100%|██████████| 1084/1084 [02:26<00:00,  7.40it/s]
100%|██████████| 806/806 [01:55<00:00,  6.99it/s]
 80%|████████  | 8/10 [51:14<12:40, 380.23s/it]

몇 번째인지: 537
Delay: 117
Group 1 proportion: 0.175
Group 2 proportion: 0.375
t-statistic: -2.888
p-value: 0.004


100%|██████████| 1084/1084 [02:30<00:00,  7.20it/s]
100%|██████████| 806/806 [01:49<00:00,  7.35it/s]
 90%|█████████ | 9/10 [57:17<06:14, 374.77s/it]

몇 번째인지: 905
Delay: 311
Group 1 proportion: 0.225
Group 2 proportion: 0.438
t-statistic: -2.913
p-value: 0.004


100%|██████████| 1084/1084 [02:32<00:00,  7.10it/s]
100%|██████████| 806/806 [01:55<00:00,  6.99it/s]
100%|██████████| 10/10 [1:03:30<00:00, 381.06s/it]


In [84]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [377.1, 379.0, 398.1, 394.3, 382.2, 398.6, 380.8, 364.2, 362.8, 373.5]
평균 381.06000000000006


In [85]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: ['none', 'none', 'none', 507, 'none', 'none', 'none', 117, 311, 'none']


In [86]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.7208436724565757, 0.7704714640198511, 0.6600496277915633, 0.7965260545905707, 0.8039702233250621, 0.7642679900744417, 0.7481389578163772, 0.7741935483870968, 0.7617866004962779, 0.7555831265508685]
평균 0.7555831265508683


In [87]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.2204724409448819, 0.2616822429906542, 0.19873817034700317, 0.2658959537572254, 0.2754491017964072, 0.24401913875598086, 0.2540983606557377, 0.25125628140703515, 0.2511415525114155, 0.2409090909090909]
평균 0.24636623340754316


In [88]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.3323442136498516, 0.37710437710437705, 0.315, 0.359375, 0.36800000000000005, 0.3493150684931507, 0.3792048929663609, 0.35460992907801414, 0.36423841059602646, 0.3498349834983498]
평균 0.3549026875386131


### Tech

In [89]:
normal_dataset = df_business+df_sport+df_politics+df_entertain

In [90]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_tech):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1824/1824 [00:00<00:00, 4467.44it/s]
100%|██████████| 401/401 [00:00<00:00, 4760.12it/s]


In [91]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    with open(f'summary_tech_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'summary_tech_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'summary_tech_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'summary_tech_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/10 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Runnin

몇 번째인지: 1097
Delay: 406
Group 1 proportion: 0.188
Group 2 proportion: 0.388
t-statistic: -2.848
p-value: 0.005


100%|██████████| 1093/1093 [02:18<00:00,  7.91it/s]
100%|██████████| 810/810 [01:58<00:00,  6.84it/s]
 40%|████      | 4/10 [24:19<36:18, 363.01s/it]

몇 번째인지: 603
Delay: 159
Group 1 proportion: 0.050
Group 2 proportion: 0.200
t-statistic: -2.927
p-value: 0.004


100%|██████████| 1093/1093 [02:24<00:00,  7.54it/s]
100%|██████████| 810/810 [01:51<00:00,  7.24it/s]
 50%|█████     | 5/10 [30:17<30:06, 361.25s/it]

몇 번째인지: 633
Delay: 174
Group 1 proportion: 0.150
Group 2 proportion: 0.350
t-statistic: -2.984
p-value: 0.003


100%|██████████| 1093/1093 [02:20<00:00,  7.78it/s]
100%|██████████| 810/810 [01:58<00:00,  6.82it/s]
100%|██████████| 1093/1093 [02:25<00:00,  7.51it/s]
100%|██████████| 810/810 [01:52<00:00,  7.21it/s]
100%|██████████| 1093/1093 [02:29<00:00,  7.30it/s]
100%|██████████| 810/810 [01:47<00:00,  7.51it/s]
100%|██████████| 1093/1093 [02:27<00:00,  7.42it/s]
100%|██████████| 810/810 [01:51<00:00,  7.27it/s]
100%|██████████| 1093/1093 [02:26<00:00,  7.47it/s]
100%|██████████| 810/810 [01:51<00:00,  7.28it/s]
100%|██████████| 10/10 [1:00:17<00:00, 361.73s/it]


In [92]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

각10번 time: [368.9, 373.1, 358.2, 359.4, 358.1, 360.8, 359.1, 359.3, 361.1, 359.2]
평균 361.71999999999997


In [93]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

각10번 delay: ['none', 'none', 406, 159, 174, 'none', 'none', 'none', 'none', 'none']


In [94]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

각10번 acc: [0.8802469135802469, 0.8641975308641975, 0.7901234567901234, 0.8950617283950617, 0.7913580246913581, 0.817283950617284, 0.8074074074074075, 0.7765432098765432, 0.8024691358024691, 0.8814814814814815]
평균 0.8306172839506173


In [95]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

각10번 precision: [0.38961038961038963, 0.3584905660377358, 0.27722772277227725, 0.463768115942029, 0.28502415458937197, 0.2976190476190476, 0.27647058823529413, 0.2672811059907834, 0.25903614457831325, 0.40476190476190477]
평균 0.3279289740137147


In [96]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

각10번 f1: [0.3821656050955414, 0.4086021505376344, 0.3971631205673759, 0.4295302013422819, 0.41114982578397213, 0.4032258064516129, 0.376, 0.39057239057239057, 0.3495934959349593, 0.41463414634146345]
평균 0.3962636742627232


### Sport

In [97]:
normal_dataset = df_business+df_tech+df_politics+df_entertain

In [98]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_sport):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

100%|██████████| 1714/1714 [00:00<00:00, 7470.08it/s]
100%|██████████| 511/511 [00:00<00:00, 7184.31it/s]


In [99]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    with open(f'summary_sport_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'summary_sport_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'summary_sport_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'summary_sport_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1027/1027 [02:26<00:00,  7.03it/s]
100%|██████████| 788/788 [01:54<00:00,  6.89it/s]
 10%|█         | 1/10 [06:04<54:36, 364.07s/it]

몇 번째인지: 397
Delay: 66
Group 1 proportion: 0.037
Group 2 proportion: 0.175
t-statistic: -2.877
p-value: 0.005


100%|██████████| 1027/1027 [02:30<00:00,  6.84it/s]
100%|██████████| 788/788 [01:50<00:00,  7.10it/s]
100%|██████████| 1027/1027 [02:31<00:00,  6.80it/s]
100%|██████████| 788/788 [01:50<00:00,  7.15it/s]
 30%|███       | 3/10 [18:11<42:27, 363.96s/it]

몇 번째인지: 409
Delay: 72
Group 1 proportion: 0.013
Group 2 proportion: 0.125
t-statistic: -2.866
p-value: 0.005


100%|██████████| 1027/1027 [02:28<00:00,  6.92it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 788/788 [01:53<00:00,  6.96it/s]
 40%|████      | 4/10 [24:16<36:25, 364.26s/it]

몇 번째인지: 482
Delay: 109
Group 1 proportion: 0.025
Group 2 proportion: 0.150
t-statistic: -2.851
p-value: 0.005


100%|██████████| 1027/1027 [02:28<00:00,  6.90it/s]
100%|██████████| 788/788 [01:58<00:00,  6.65it/s]
 50%|█████     | 5/10 [30:31<30:40, 368.19s/it]

몇 번째인지: 341
Delay: 35
Group 1 proportion: 0.000
Group 2 proportion: 0.100
t-statistic: -2.963
p-value: 0.004




In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))

### Entertain

In [None]:
normal_dataset = df_business+df_tech+df_politics+df_sport

In [None]:
txt = []

for lst in tqdm(normal_dataset):
    file = open(lst, 'r')
    data = file.read()
    data = data.lower()
    data = data.strip()
    data = re.compile('<.*?>').sub('', data)
    data = re.sub('\s+', ' ', data)  
    data = ' '.join([contractions[t] if t in contractions else t for t in data.split(" ")]) # 약어 정규화
    data = re.sub(r"'s\b","",data) # 소유격 제거. Ex) roland's -> roland
    
    data = data.replace('\n\n', '\n')
    data = data.replace('\n', '. ')
    data = data.replace('..', '.')

    txt.append(data)

txt_2 = []

for abnormal in tqdm(df_entertain):
    file = open(abnormal, 'r')
    data_ = file.read()
    data_ = data_.lower()
    data_ = data_.strip()
    data_ = re.compile('<.*?>').sub('', data_)
    data_ = re.sub('\s+', ' ', data_)  
    data_ = ' '.join([contractions[t] if t in contractions else t for t in data_.split(" ")]) # 약어 정규화
    data_ = re.sub(r"'s\b","",data_) # 소유격 제거. Ex) roland's -> roland
    
    data_ = data_.replace('\n\n', '\n')
    data_ = data_.replace('\n', '. ')
    data_ = data_.replace('..', '.')

    txt_2.append(data_)

df_normal = pd.DataFrame(txt, columns=['origin'])
df_normal = df_normal.drop(df_normal.index[928])
df_normal = df_normal.reset_index(drop=True)
df_abnormal = pd.DataFrame(txt_2, columns=['origin'])

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
clu = [5,10,15,20,25,30,35,40,45,50]
threshold = np.arange(0,4.5,0.1)
final_acc = []
final_precision = []
final_recall = []
final_f1 = []
final_delay = []
epoch_time = []

for n in tqdm(range(1, 11)):
    start_time = time.time()

    train_dataset, test_dataset = make_dataset(df_normal, df_abnormal)
    
    train_sum = []
    for i in range(len(train_dataset)):
        summ1 = text_r(train_dataset.origin.iloc[i])
        train_sum.append(' '.join(summ1))
    train_dataset['summary'] = train_sum

    test_sum = []
    for i in range(len(test_dataset)):
        summ2 = text_r(test_dataset.origin.iloc[i])
        test_sum.append(' '.join(summ2))
    test_dataset['summary'] = test_sum

    
    train_docs_embedding = make_vector(train_dataset.summary)
    test_docs_embedding = make_vector(test_dataset.summary)


    with open(f'summary_entertain_{n}_test_docs_embedding.pickle', 'wb') as f:
        pickle.dump(test_docs_embedding, f)

    with open(f'summary_entertain_{n}_train_docs_embedding.pickle', 'wb') as f:
        pickle.dump(train_docs_embedding, f)

    with open(f'summary_entertain_{n}_test_ans_embedding.pickle', 'wb') as f:
        pickle.dump(test_dataset.category.values, f)

    with open(f'summary_entertain_{n}_train_ans_embedding.pickle', 'wb') as f:
        pickle.dump(train_dataset.category.values, f)


    best_score = 0
    i = len(train_dataset)//3
    
    for c in clu:
        
        kmeans1 = KMeans(n_clusters=c, random_state=42)
        kmeans1.fit(train_docs_embedding[:i])

        kmeans2 = KMeans(n_clusters=c, random_state=42)
        kmeans2.fit(train_docs_embedding[i:2*i])

        kmeans3 = KMeans(n_clusters=c, random_state=42)
        kmeans3.fit(train_docs_embedding[2*i:])

        # kmeans4 = KMeans(n_clusters=c, random_state=42)
        # kmeans4.fit(train_docs_embedding[3*i:4*i])

        # kmeans5 = KMeans(n_clusters=c, random_state=42)
        # kmeans5.fit(train_docs_embedding[4*i:])

        # kmeans6 = KMeans(n_clusters=c, random_state=42)
        # kmeans6.fit(train_docs_embedding[5*i:6*i])

        # kmeans7 = KMeans(n_clusters=c, random_state=42)
        # kmeans7.fit(train_docs_embedding[6*i:])

        distances1 = np.zeros(test_docs_embedding.shape[0])
        distances2 = np.zeros(test_docs_embedding.shape[0])
        distances3 = np.zeros(test_docs_embedding.shape[0])
        # distances4 = np.zeros(test_docs_embedding.shape[0])
        # distances5 = np.zeros(test_docs_embedding.shape[0])
        # distances6 = np.zeros(test_docs_embedding.shape[0])
        # distances7 = np.zeros(test_docs_embedding.shape[0])

        for t in threshold:
            predictions1 = kmeans1.predict(test_docs_embedding)
            predictions2 = kmeans2.predict(test_docs_embedding)
            predictions3 = kmeans3.predict(test_docs_embedding)
            # predictions4 = kmeans4.predict(test_docs_embedding)
            # predictions5 = kmeans5.predict(test_docs_embedding)
            # predictions6 = kmeans4.predict(test_docs_embedding)
            # predictions7 = kmeans5.predict(test_docs_embedding)
            
            for idx in range(test_docs_embedding.shape[0]):
                distances1[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans1.cluster_centers_[predictions1[idx]])
                distances2[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans2.cluster_centers_[predictions2[idx]])
                distances3[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans3.cluster_centers_[predictions3[idx]])
                # distances4[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances5[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
                # distances6[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans4.cluster_centers_[predictions4[idx]])
                # distances7[idx] = np.linalg.norm(test_docs_embedding[idx] - kmeans5.cluster_centers_[predictions5[idx]])
            
            predict_ensembel = (distances1>t) * 1 + (distances2>t) * 1 + (distances3>t) * 1 
            # + (distances4>t)* 1 +(distances5>t)* 1
            
            # +(distances6>t)* 1+(distances7>t)* 1
            predict = np.where(predict_ensembel>=2, 1, 0)
            
            acc_scores = accuracy_score(test_dataset.category, predict)
            f1_s = f1_score(test_dataset.category, predict)
            # if acc_scores>best_score:
            #     best_params = {acc_scores:[c,t]}
            #     best_score = acc_scores
            #     whole_window_ensemble = predict_ensembel
            #     whole_window = predict
            
            if f1_s>best_score:
                best_score = f1_s
                best_params = {best_score:[c,t]}
                whole_window_ensemble = predict_ensembel
                whole_window = predict

    
    kmeans1 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans1.fit(train_docs_embedding[:i])

    kmeans2 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans2.fit(train_docs_embedding[i:2*i])

    kmeans3 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    kmeans3.fit(train_docs_embedding[2*i:])

    # kmeans4 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans4.fit(train_docs_embedding[3*i:4*i])

    # kmeans5 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans5.fit(train_docs_embedding[4*i:])

    # kmeans6 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans6.fit(train_docs_embedding[5*i:6*i])

    # kmeans7 = KMeans(n_clusters=best_params[best_score][0], random_state=42)
    # kmeans7.fit(train_docs_embedding[6*i:])

    distances1 = np.zeros(test_docs_embedding.shape[0])
    distances2 = np.zeros(test_docs_embedding.shape[0])
    distances3 = np.zeros(test_docs_embedding.shape[0])
    # distances4 = np.zeros(test_docs_embedding.shape[0])
    # distances5 = np.zeros(test_docs_embedding.shape[0])
    # distances6 = np.zeros(test_docs_embedding.shape[0])
    # distances7 = np.zeros(test_docs_embedding.shape[0])

    predictions1 = kmeans1.predict(test_docs_embedding)
    predictions2 = kmeans2.predict(test_docs_embedding)
    predictions3 = kmeans3.predict(test_docs_embedding)
    # predictions4 = kmeans4.predict(test_docs_embedding)
    # predictions5 = kmeans5.predict(test_docs_embedding)
    # predictions6 = kmeans6.predict(test_docs_embedding)
    # predictions7 = kmeans7.predict(test_docs_embedding)

    for i in range(test_docs_embedding.shape[0]):
        distances1[i] = np.linalg.norm(test_docs_embedding[i] - kmeans1.cluster_centers_[predictions1[i]])
        distances2[i] = np.linalg.norm(test_docs_embedding[i] - kmeans2.cluster_centers_[predictions2[i]])
        distances3[i] = np.linalg.norm(test_docs_embedding[i] - kmeans3.cluster_centers_[predictions3[i]])
        # distances4[i] = np.linalg.norm(test_docs_embedding[i] - kmeans4.cluster_centers_[predictions4[i]])
        # distances5[i] = np.linalg.norm(test_docs_embedding[i] - kmeans5.cluster_centers_[predictions5[i]])
        # distances6[i] = np.linalg.norm(test_docs_embedding[i] - kmeans6.cluster_centers_[predictions6[i]])
        # distances7[i] = np.linalg.norm(test_docs_embedding[i] - kmeans7.cluster_centers_[predictions7[i]])

    predict_ensembel = (distances1>best_params[best_score][1]) * 1 + (distances2>best_params[best_score][1]) * 1 + (distances3>best_params[best_score][1]) * 1 
    # + (distances4>t)* 1 +(distances5>t)* 1
    # +(distances6>t)* 1+(distances7>t)* 1
    predict = np.where(predict_ensembel>=2, 1, 0)

    test_acc_scores = accuracy_score(test_dataset.category, predict)
    test_pre_scores = precision_score(test_dataset.category, predict)
    test_rec_scores = recall_score(test_dataset.category, predict)
    test_f1_scores = f1_score(test_dataset.category, predict)
    final_acc.append(test_acc_scores)
    final_precision.append(test_pre_scores)
    final_recall.append(test_rec_scores)
    final_f1.append(test_f1_scores)

    window_size = 80
    ref_window = predict[:window_size]
    ref_ratio = np.count_nonzero(ref_window) / len(ref_window)

    first_ab_idx = test_dataset[test_dataset.category==1].index[0]-window_size
    for delay in range(len(predict) - first_ab_idx):
        compare_window = predict[first_ab_idx:first_ab_idx+window_size]
        compare_ratio = np.count_nonzero(compare_window) / len(compare_window)
        first_ab_idx+=1
        t, p = ttest_ind(ref_window, compare_window)
        if p<=0.005:
            print('몇 번째인지:', delay+first_ab_idx)
            print('Delay:', delay)
            print(f"Group 1 proportion: {ref_ratio:.3f}")
            print(f"Group 2 proportion: {compare_ratio:.3f}")
            print(f"t-statistic: {t:.3f}")
            print(f"p-value: {p:.3f}")
            final_delay.append(delay)
            break
    if len(final_delay) != n:
        final_delay.append('none')
        
    epoch_time.append(round(time.time() - start_time, 1))

In [None]:
print('각10번 time:', epoch_time)
print('평균', np.mean(epoch_time))

In [None]:
print('각10번 delay:', final_delay)
# print('평균', np.mean(final_delay))

In [None]:
print('각10번 acc:', final_acc)
print('평균', np.mean(final_acc))

In [None]:
print('각10번 precision:', final_precision)
print('평균', np.mean(final_precision))

In [None]:
print('각10번 f1:', final_f1)
print('평균', np.mean(final_f1))