In [1]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import re
from scipy.stats import gmean, hmean
from tqdm import tqdm

import torch
from scipy.spatial.distance import cdist
from transformers import RobertaTokenizer
from transformers import RobertaModel

In [2]:
np.random.seed(42)
random.seed(42)

In [3]:
WINDOW_SIZE = 100
DEVICE = "cpu"

In [4]:
tokenizer = RobertaTokenizer.from_pretrained('./roberta_base')
model = RobertaModel.from_pretrained("./roberta_base").eval()
model.to(DEVICE)

def tokenize(text):
    return tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at ./roberta_base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
df = pd.read_csv("roft_duplicates_removed.csv")

In [6]:
'''
Get MLE for one text
Parameters:
        text  --- text
Returns:
    real number or NumPy.nan  --- Intrinsic dimension value of the text in the input data
                                                    estimated by Maximum Likelihood Estimation method.'''

def get_mle_single(text):
    inputs = tokenizer(text.replace('\n', ' '), truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outp = model(**inputs)
    mx_points = inputs['input_ids'].shape[1] - 2
        
    # Computations for shorter texts are unstable and we want to avoid them
    if mx_points < MINIMAL_STABLE_LENGTH:     
        return 0
            
    return MLE().fit_transform(outp[0][0].numpy()[1:-1])

'''
Get MLE for all texts in df[key] Pandas DataSeries (MLE method)
Parameters:
        df  --- Pandas DataFrame
        key --- Name of the column
        is_list --- Check if the elements of the df[key] are lists (appears in some data)
        
Returns:
    numpy.array of shape (number_of_texts, 1) --- Intrinsic dimension values for all texts in the input data
                                                    estimated by Maximum Likelihood Estimation method.
'''

def get_mle(df, key='text', is_list=False):
    dims = []
    for s in df[key]:
        if is_list:
            text = s[0]
        else:
            text = s
            
        dims.append(get_mle_single(text))
    return np.array(dims).reshape(-1, 1)

## Обработка датасета

In [7]:
df

Unnamed: 0,date,model,dataset,annotator,group,dec_strat_value,predicted_boundary_index,true_boundary_index,points,reason,...,prompt_body,generation,gen_body,recipe_familiarity,news_familiarity,stories_familiarity,gen_familiarity,native_speaker,read_guide,label
0,2021-08-31 17:11:39.095000+00:00,finetuned,Recipes,1666,A,0.4,0,2,0,['9123971792800820313'],...,HOW TO MAKE: Baby Shell Pasta Salad With Kalam...,22877,"Meanwhile, combine all dressing ingredients in...",2,3,5,2,Yes,,2
1,2021-09-06 21:54:48.912000+00:00,finetuned,Recipes,1666,A,0.4,8,8,5,['irrelevant'],...,HOW TO MAKE: Nest Cookies\nIngredients:\n1 12 ...,26444,Photograph by fans blistering bens down!_SEP_F...,2,3,5,2,Yes,,8
2,2021-09-06 21:55:07.069000+00:00,finetuned,Recipes,1666,A,0.4,0,7,0,['irrelevant'],...,HOW TO MAKE: Pink Lemonade Cupcakes\nIngredien...,26089,Fill prepared pans two-thirds full._SEP_Bake f...,2,3,5,2,Yes,,7
3,2021-09-06 21:58:44.944000+00:00,finetuned,Recipes,1666,A,0.4,1,7,0,['326860638652886185'],...,HOW TO MAKE: Beef Stroganaff\nIngredients:\n1 ...,25963,"I have added some green peppers, red peppers, ...",2,3,5,2,Yes,,7
4,2021-09-06 21:59:16.230000+00:00,finetuned,Recipes,1666,A,0.4,1,2,0,['repetition'],...,HOW TO MAKE: One-Pan Creamy Chicken and Veggie...,23225,Add frozen veggies and pasta._SEP_Pour in chic...,2,3,5,2,Yes,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8938,2022-06-14 06:01:13.813000+00:00,gpt2-xl,Short Stories,15114,C,0.4,5,6,0,['generic'],...,"Leraje, captain of thirty legions of demon war...",35197,"But this time, there would be no standing arou...",2,4,2,1,No,Yes,6
8939,2022-06-14 06:16:31.704000+00:00,davinci,Short Stories,15114,C,1.0,2,7,0,['generic'],...,The sky shined bright red._SEP_The sun blazed ...,34934,"""My only true love You'll be._SEP_I have searc...",2,4,2,1,No,Yes,7
8940,2022-06-14 06:22:54.638000+00:00,gpt2-xl,Short Stories,15114,C,0.4,5,4,4,['irrelevant'],...,People think I'm either eternally ill or just ...,35060,and I didn't even have my glasses on._SEP_I'm ...,2,4,2,1,No,Yes,4
8941,2022-06-14 06:26:21.071000+00:00,davinci,Short Stories,15114,C,0.0,5,7,0,['repetition'],...,"The writer sat hunched over his desk, his quil...",34231,"He felt... **important**._SEP_He stood up, and...",2,4,2,1,No,Yes,7


In [8]:
print(len(df[df["dataset"] == "Short Stories"]),
len(df[df["dataset"] == "Recipes"]), 
len(df[df["dataset"] == "New York Times"]))

2665 4257 1724


In [9]:
def clean_string(input_string):
    input_string_2 = re.sub(r"\n" , " ", input_string)
    input_string_2 = re.sub(r"[^A-Za-z0-9 !\"$%&\'()\*+,-./:;?@^_`~]" , "", input_string_2)
    input_string_2 = re.sub(r"[ ]+", " ", input_string_2)
    input_string_2 = input_string_2.strip()
    input_string_2
    return input_string_2

In [10]:
clean_string(df["prompt_body"][0]) + "_SEP_" + clean_string(df["gen_body"][0])

"HOW TO MAKE: Baby Shell Pasta Salad With Kalamata Olives and Roasted Fennel Ingredients: 1 lb small shell pasta, uncooked 1 12 cups fennel bulbs, cut in half lengthwise 2 medium onions or 1 large onion, chopped 1/2 inch thick 3 tablespoons olive oil or 3 tablespoons vegetable oil 1 cup fresh plum tomato, diced (do nto use canned for this recipe) 34-1 cup kalamata olive (more if desired) 12 cup assorted fresh herb, finely chopped (any kind you desire, I like to include basil with the herbs) 3 -4 garlic cloves, with the peel on (leave whole) 13 cup balsamic vinegar 13 cup olive oil 2 tablespoons barbecue sauce (any flavor desired) 2 tablespoons lime juice 14 cup romano cheese, grated or shredded or 1/4 cup parmesan cheese, grated or shredded dried chili pepper flakes (any amount for sprinkling)._SEP_Cook the pasta until eldente (don't over cook); drain, and rinse with cold water; set aside._SEP_Heat the oven to 375 degrees._SEP_Meanwhile, combine all dressing ingredients in a blender or

In [11]:
def tokenize(text):
    return tokenizer(text, max_length=512, truncation = True)

def get_example(i):
    human_written = clean_string(df['prompt_body'][i].replace("_SEP_", ' ')) + " "
    try:
        model_written = clean_string(df['gen_body'][i].replace("_SEP_", ' '))
    except:
        model_written = ""
    human_input_ids = tokenize(human_written).input_ids[:-1]
    model_written_ids = tokenize(model_written).input_ids[1:]
    input_ids = (human_input_ids + model_written_ids[:-1])[:511] + [model_written_ids[-1]]
    input_ids = torch.tensor([input_ids])
    tags = np.append(np.zeros(len(human_input_ids)),  np.ones(len(model_written_ids)))
    return input_ids, tags

def get_embeddings(input_ids):
    #device = "cuda:0"
    inputs = input_ids.to(DEVICE)
    with torch.no_grad():
        out = model(inputs)
    embeddings = out.last_hidden_state.detach().cpu().numpy()[0][1:-1]
    return embeddings

def process_embeddings(embeddings, tags):
    mles = []
    mle_tags = []

    center_point = 0

    for i in range(embeddings.shape[0] - WINDOW_SIZE + 1):
        sub_embedding = embeddings[i: i + WINDOW_SIZE]
        mle = MLE().fit_transform(sub_embedding)
        mles.append(mle)
        mle_tags.append(int(tags[center_point+i]))

    mle_tags = np.array(mle_tags)
    mles = np.array(mles)

    ids_human = np.argwhere(mle_tags == 0).squeeze()
    ids_gen = np.argwhere(mle_tags == 1).squeeze()
    return mles, ids_human, ids_gen

def plot_result(i):
    plt.figure(figsize=(6, 2))
    input_ids, tags = get_example(i)
    embeddings = get_embeddings(input_ids)
    mles, ids_human, ids_gen = process_embeddings(embeddings, tags)

    plt.plot(ids_human, mles[ids_human], label = 'human')
    plt.plot(ids_gen, mles[ids_gen], label = 'gen')
    plt.legend()
    print(tokenizer.decode(input_ids[0]))

In [12]:
def prim_tree(adj_matrix, power=1.0):
    infty = np.max(adj_matrix) + 1.0
    
    dst = np.ones(adj_matrix.shape[0]) * infty
    visited = np.zeros(adj_matrix.shape[0], dtype=bool)
    ancestor = -np.ones(adj_matrix.shape[0], dtype=int)

    v, s = 0, 0.0
    for i in range(adj_matrix.shape[0] - 1):
        visited[v] = 1
        ancestor[dst > adj_matrix[v]] = v
        dst = np.minimum(dst, adj_matrix[v])
        dst[visited] = infty
        
        v = np.argmin(dst)
        
        s += adj_matrix[v][ancestor[v]] ** power
    return s.item()

def sample_W(W, nSamples, isRandom=True):
    n = W.shape[0]
    random_indices = np.random.choice(n, size=nSamples, replace=False)
    return W[random_indices]

def calculate_ph_dim(W, min_points=40, max_points=510, point_jump=20, alpha=1.0, restarts=3, resamples=7):
    # Computations for shorter texts are unstable and we want to avoid them
    if W.shape[0] < MINIMAL_STABLE_LENGTH: 
        return np.nan
    
    m_candidates = []
    for i in range(restarts): 
        test_n = range(min_points, max_points, point_jump)
        lengths = []

        for n in test_n:
            reruns = np.ones(resamples)
            for i in range(resamples):
                tmp = sample_W(W, n)
                reruns[i] = prim_tree(cdist(tmp, tmp), power=alpha)
            lengths.append(np.median(reruns))

        lengths = np.array(lengths)
        x = np.log(np.array(list(test_n)))
        y = np.log(lengths)

        N = len(x)
        m_candidates.append((N * (x * y).sum() - x.sum() * y.sum()) / (N * (x ** 2).sum() - x.sum() ** 2))
    m = np.mean(m_candidates)
    return alpha / (1 - m)

In [13]:
'''
Get CLS-tokens for all texts in df[key] Pandas DataSeries (RoBERTa-CLS baseline)
Parameters:
        df  --- Pandas DataFrame
        key --- Name of the column
        is_list --- Check if the elements of the df[key] are lists (appears in some data)
        
Returns:
    numpy.array of shape (number_of_texts, size_of_embedding=768)
'''

def get_cls(df, key='text', is_list=False):
    dims = np.zeros((len(df[key]),768))
    cnt = 0
    for text in tqdm(df[key]):
        if is_list:
            s = text[0]
        else:
            s = text
        inputs = tokenizer(s.replace('\n', ' '), truncation=True, max_length=512, return_tensors="pt")
        with torch.no_grad():
            outp = model(**inputs)
        dims[cnt] = outp[0][0].numpy()[0]
        cnt += 1
    return dims

In [16]:
'''
Get PHD for one text
Parameters:
        text  --- text
        alpha --- Parameter alpha for PHD computattion

Returns:
    real number or NumPy.nan  --- Intrinsic dimension value of the text in the input data
                                                    estimated by Persistence Homology Dimension method.'''
def get_phd_single(text, alpha=1.0):
    inputs = tokenizer(text.replace('\n', ' '), truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outp = model(**inputs)
        
    mx_points = inputs['input_ids'].shape[1] - 2        
    mn_points = 40
    step = ( mx_points - mn_points ) // 7
        
    return calculate_ph_dim(outp[0][0].numpy()[1:-1],  min_points=mn_points, max_points=mx_points, \
                                     point_jump=step, alpha=alpha)

def process_text_phd_with_window(text, alpha=1.0):
    inputs = tokenizer(text.replace('\n', ' '), truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        outp = model(**inputs)
        
    embeddings = outp.last_hidden_state.detach().cpu().numpy()[0][1:-1]
        
    phds = []

    center_point = 0
    
    mx_points = inputs['input_ids'].shape[1] - 2        
    mn_points = 40
    step = ( mx_points - mn_points ) // 7
    
    for i in range(0, embeddings.shape[0] - WINDOW_SIZE + 1, 5):
        sub_embedding = embeddings[i: i + WINDOW_SIZE]
        phd = calculate_ph_dim(outp[0][0].numpy()[1:-1],  min_points=mn_points, max_points=mx_points, \
                               point_jump=step, alpha=alpha,
                               restarts=3, resamples=1)
        phds.append(phd)

    phds = np.array(phds)

    return phds

'''
Get PHD for all texts in df[key] Pandas DataSeries (PHD method)
Parameters:
        df  --- Pandas DataFrame
        key --- Name of the column
        is_list --- Check if the elements of the df[key] are lists (appears in some data)
        
        alpha --- Parameter alpha for PHD computattion

Returns:
    numpy.array of shape (number_of_texts, 1) --- Intrinsic dimension values for all texts in the input data
                                                    estimated by Persistence Homology Dimension method.
'''

def get_phd(df, key='text', is_list=False, alpha=1.0):
    dims = []
    for s in tqdm(df[key]):
        if is_list:
            text = s[0]
        else:
            text = s
        dims.append(get_phd_single(text, alpha=alpha))

    return np.array(dims).reshape(-1, 1)

#### Вычисление размерностей

In [17]:
MINIMAL_STABLE_LENGTH = 60

maxlim = len(df)
dims_arr = []

for i in tqdm(range(maxlim)):
    human = (" ".join(df['prompt_body'][i].split("_SEP_")[:10])).strip()
    human_written = clean_string(human.replace("_SEP_", ' ')) + " "
    try:
        model_written = clean_string(df['gen_body'][i].replace("_SEP_", ' '))
    except:
        model_written = ""
    text = human_written + model_written
    dims_arr.append(process_text_phd_with_window(text))

  0%|                                                                                                                                                                   | 3/8943 [02:16<113:04:13, 45.53s/it]

KeyboardInterrupt



In [None]:
len(dims_arr)

In [None]:
print(len(dims_arr))

In [None]:
plt.hist(list(map(len, dims_arr)))
plt.show()

In [None]:
def time_series_df_from_lists(lists):
    time_series_df = pd.DataFrame(columns=["id", "time", "dim"])

    for i in tqdm(range(len(lists))):
        for j in range(len(lists[i])):
            new_row = {"id": i, "time": j, "dim": lists[i][j]}
            time_series_df = time_series_df.append(new_row, ignore_index=True)
    return time_series_df

    # ????? super-slow

In [None]:
def time_series_dict_from_lists(lists):
    time_series_dict = {"id": [], "time": [], "dim": []}

    for i in tqdm(range(len(lists))):
        try:
            for j in range(len(lists[i])):
                time_series_dict["id"].append(i)
                time_series_dict["time"].append(j)
                time_series_dict["dim"].append(lists[i][j])
        except:
            time_series_dict["id"].append(i)
            time_series_dict["time"].append(-1)
            time_series_dict["dim"].append(-1)
    return time_series_dict

In [None]:
dims_dict = time_series_dict_from_lists(dims_arr)
dims_df = pd.DataFrame.from_dict(dims_dict)
dims_df

In [None]:
dims_df.to_csv("sliding_window_data/roft_dublicates_removed_dims_timeseries_PHD_" + str(WINDOW_SIZE) + ".csv", 
               index=False
              )

In [None]:
id_humans_dict = time_series_dict_from_lists(ids_humans)
id_humans_df = pd.DataFrame.from_dict(id_humans_dict)
id_humans_df.to_csv("sliding_window_data/chatgpt_roft_id_humans_timeseries_" + str(WINDOW_SIZE) + ".csv",
                    index=False
                   )

In [None]:
id_humans_df