In [1]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import re
from scipy.stats import gmean, hmean
from tqdm import tqdm
from time import time
import torch
import pickle as pk
from scipy.spatial.distance import cdist

from tslearn.utils import to_time_series_dataset
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC, TimeSeriesSVR

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

In [2]:
import sklearn

In [3]:
# Display the NumPy and scikit-learn versions of the running kernel.
np.__version__, sklearn.__version__

('1.24.4', '1.4.1.post1')

In [4]:
# Seed NumPy's global RNG and the stdlib `random` module.
# (train_test_split further down passes its own random_state=42.)
np.random.seed(42)
random.seed(42)

## Choosing dataset and domain

In [5]:
# Load the annotation table (one row per annotated text) and the long-format
# time series (columns: id, time, dim) extracted from those texts.
df = pd.read_csv("roft_duplicates_removed.csv")
# Sort the listing so the printed order is the same on every run; iterating a
# bare set of strings depends on per-process hash randomization.
# key=str keeps the sort valid even if the column holds non-string entries.
print(sorted(set(df["model"]), key=str))
timeseries_df_1 = pd.read_csv("sliding_window_data/roft_filtered_dims_timeseries_PHD_100.csv")
# Generator held out from training and used as the out-of-domain test set.
chosen_model = 'davinci'

['human', 'davinci', 'baseline', 'ctrl-nocode', 'gpt2-xl', 'ctrl-Politics', 'finetuned', 'gpt2']


In [6]:
# Number of rows in the annotation table, followed by a preview of the first rows.
print(len(df))
df.head()

8943


Unnamed: 0,date,model,dataset,annotator,group,dec_strat_value,predicted_boundary_index,true_boundary_index,points,reason,...,prompt_body,generation,gen_body,recipe_familiarity,news_familiarity,stories_familiarity,gen_familiarity,native_speaker,read_guide,label
0,2021-08-31 17:11:39.095000+00:00,finetuned,Recipes,1666,A,0.4,0,2,0,['9123971792800820313'],...,HOW TO MAKE: Baby Shell Pasta Salad With Kalam...,22877,"Meanwhile, combine all dressing ingredients in...",2,3,5,2,Yes,,2
1,2021-09-06 21:54:48.912000+00:00,finetuned,Recipes,1666,A,0.4,8,8,5,['irrelevant'],...,HOW TO MAKE: Nest Cookies\nIngredients:\n1 12 ...,26444,Photograph by fans blistering bens down!_SEP_F...,2,3,5,2,Yes,,8
2,2021-09-06 21:55:07.069000+00:00,finetuned,Recipes,1666,A,0.4,0,7,0,['irrelevant'],...,HOW TO MAKE: Pink Lemonade Cupcakes\nIngredien...,26089,Fill prepared pans two-thirds full._SEP_Bake f...,2,3,5,2,Yes,,7
3,2021-09-06 21:58:44.944000+00:00,finetuned,Recipes,1666,A,0.4,1,7,0,['326860638652886185'],...,HOW TO MAKE: Beef Stroganaff\nIngredients:\n1 ...,25963,"I have added some green peppers, red peppers, ...",2,3,5,2,Yes,,7
4,2021-09-06 21:59:16.230000+00:00,finetuned,Recipes,1666,A,0.4,1,2,0,['repetition'],...,HOW TO MAKE: One-Pan Creamy Chicken and Veggie...,23225,Add frozen veggies and pasta._SEP_Pour in chic...,2,3,5,2,Yes,,2


In [7]:
# Index label of the row sitting 90% of the way through the "human" rows.
human_index = df[df["model"] == "human"].index
human_index[int(len(human_index) * 0.9)]

7391

In [8]:
# Number of rows whose generator is "human".
df[df["model"] == "human"].shape[0]

1273

In [9]:
# Use the true boundary index as the prediction target ("label").
# Only the "column is absent" case is tolerated (then any "label" column already
# present in the CSV is kept); any other failure now surfaces instead of being
# silently swallowed by a bare `except`.
if "true_boundary_index" in df.columns:
    df["label"] = df["true_boundary_index"]

In [10]:
# Boolean row masks over `df`: rows generated by `chosen_model` and all other rows.
model_column = df["model"]
CONDITION = model_column == chosen_model
NOT_CONDITION = model_column != chosen_model
len(CONDITION), sum(CONDITION), sum(NOT_CONDITION)

(8943, 1830, 7113)

In [11]:
def timeseries_df_division(timeseries_df):
    """Split a long-format time-series frame by the module-level CONDITION mask.

    Reads the globals `df`, `CONDITION` and `NOT_CONDITION`. The position of each
    entry in CONDITION is matched against the "id" column of `timeseries_df`.
    NOTE(review): this assumes "id" holds row positions of `df` -- confirm against
    the code that produced the time-series CSV.

    Parameters
    ----------
    timeseries_df : pandas.DataFrame
        Frame with at least "id" and "time" columns (both are cast to int32).

    Returns
    -------
    tuple
        (cast frame, rows whose id is outside CONDITION, rows whose id is inside
        CONDITION, df["label"] for NOT_CONDITION rows, df["label"] for CONDITION rows)
    """
    timeseries_df = timeseries_df.astype({"id": "int32", "time": "int32"})

    # Positions where CONDITION holds go to the test side, the rest to train.
    held_out_flags = list(CONDITION)
    test_ids_nums = {pos for pos, flag in enumerate(held_out_flags) if flag}
    train_ids_nums = {pos for pos, flag in enumerate(held_out_flags) if not flag}

    train_meta = df[NOT_CONDITION]
    test_meta = df[CONDITION]

    id_column = timeseries_df["id"]
    timeseries_df_for_train = timeseries_df[id_column.isin(train_ids_nums)]
    timeseries_df_for_test = timeseries_df[id_column.isin(test_ids_nums)]
    print(len(timeseries_df_for_test), len(timeseries_df_for_train), len(timeseries_df))

    return (
        timeseries_df,
        timeseries_df_for_train,
        timeseries_df_for_test,
        train_meta["label"],
        test_meta["label"],
    )

In [12]:
# Split the series into the in-domain part (all generators except `chosen_model`)
# and the held-out part (`chosen_model` only).
# NOTE(review): `y_test` returned here (labels of the held-out rows) is overwritten
# by the train_test_split cell further down; `y` is not used again in this notebook.
timeseries_df, timeseries_df_for_train, timeseries_df_for_test, y, y_test = timeseries_df_division(timeseries_df_1)
#timeseries_df_2, timeseries_df_for_train_2, \
#timeseries_df_for_test_2, y_2, y_test_2 = timeseries_df_division(timeseries_df_2)

49908 310007 359915


In [13]:
# Inspect the in-domain (training-side) long-format series.
timeseries_df_for_train

Unnamed: 0,id,time,dim
0,0,0,11.549488
1,0,1,10.722549
2,0,2,9.650902
3,0,3,9.989564
4,0,4,9.427729
...,...,...,...
359887,8940,28,9.686847
359888,8940,29,10.426505
359889,8940,30,10.989002
359890,8940,31,9.022205


In [14]:
def lists_from_timeseries_df(timeseries_df, df):
    """Collect one variable-length series and one label per sample id.

    Parameters
    ----------
    timeseries_df : pandas.DataFrame
        Long-format frame; its "id" and "dim" columns are read.
    df : pandas.DataFrame
        Metadata frame; its "label" column is looked up with each id.

    Returns
    -------
    (X, y) : tuple of lists
        X[k] is the list of "dim" values (in row order) of the k-th id and
        y[k] is df["label"] looked up at that id. Ids are visited in the
        iteration order of set(timeseries_df["id"]).
    """
    # Group the rows once instead of re-filtering the whole frame for every id
    # (the per-id boolean mask made the loop quadratic in practice).
    values_by_id = {
        sample_id: list(values)
        for sample_id, values in timeseries_df.groupby("id", sort=False)["dim"]
    }
    labels = df["label"]

    X = []
    y = []
    # Iterate over the same set as before so the sample order is unchanged.
    for index in tqdm(set(timeseries_df["id"])):
        X.append(values_by_id[index])
        y.append(labels[index])

    return X, y

In [15]:
# In-domain samples: series and labels for every generator except `chosen_model`.
X_id, y_id = lists_from_timeseries_df(timeseries_df_for_train, df)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7090/7090 [00:03<00:00, 2237.25it/s]


In [16]:
# Out-of-domain samples: series and labels for `chosen_model` only.
X_ood, y_ood = lists_from_timeseries_df(timeseries_df_for_test, df)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1802/1802 [00:00<00:00, 3182.62it/s]


In [17]:
# Hold out 20% of the in-domain samples as an in-domain test set (fixed random_state).
# This rebinds `y_test`, replacing the value returned by timeseries_df_division above.
X_train, X_test, y_train, y_test = train_test_split(X_id, y_id, test_size=.2, random_state=42)
#X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=.33, random_state=42)
len(X_train), len(X_test) #len(X_valid)

(5672, 1418)

In [18]:
# Convert each split from a list of variable-length lists into a tslearn dataset.
# All three conversions finish before any of the names is rebound.
X_train, X_test, X_ood = (
    to_time_series_dataset(split) for split in (X_train, X_test, X_ood)
)

len(X_train), len(X_test), len(X_ood)

(5672, 1418, 1802)

In [19]:
# Fit a time-series SVM classifier with the "gak" kernel on the in-domain
# training split and report the wall-clock fit time in whole seconds.
t1 = time()
# gamma = 32768 = 2**15 -- NOTE(review): how this value was selected is not shown here.
clf = TimeSeriesSVC(kernel="gak", gamma=32768)
clf.fit(X_train, y_train)
t2 = time()
print(int(t2-t1), "seconds")
# Timing notes from earlier runs -- presumably <number of training samples> - <fit time>;
# TODO confirm what the first column means.
# 100  -   4 sec
# 300  -  40 sec
# 400  -  66 sec
# 500  - 113 sec
# 1000 - 403 sec

7602 seconds


In [20]:
# Evaluate the fitted classifier on the in-domain test split and on the
# out-of-domain split; prints accuracy then MSE for each, then the elapsed time.
t1 = time()

prediction_result_test = clf.predict(X_test)
# Round/cast once and reuse (the original repeated this expression per metric).
rounded_test = [int(round(p)) for p in prediction_result_test]
# sklearn metrics take (y_true, y_pred); both metrics used here are symmetric,
# so the printed numbers are the same as with the swapped order used before.
print(accuracy_score(y_test, rounded_test))
print(mean_squared_error(y_test, rounded_test))

prediction_result_ood = clf.predict(X_ood)
rounded_ood = [int(round(p)) for p in prediction_result_ood]
print(accuracy_score(y_ood, rounded_ood))
print(mean_squared_error(y_ood, rounded_ood))

t2 = time()
print(int(t2-t1), "seconds")

# Timing notes from earlier runs -- presumably <train size> <test size> - <predict time>;
# TODO confirm what the first two columns mean.
#  100 100 -  40 sec
#  300 100 -  50 sec
#  400 100 -  65 sec
#  500 100 -  92 sec
# 1000 100 -  82 sec
# 2000 100 - 169 sec

0.3138222849083216
11.82299012693935
0.020532741398446172
27.030521642619313
9152 seconds


In [21]:
def weak_accuracy_score(y_pred, y):
    """Fraction of predictions that hit the target exactly or miss it by one.

    Predictions and targets are paired by position, so `y` may be a list, an
    array or a pandas Series with any index (the previous `y[i]` lookup was
    label-based for a Series and failed when its index was not 0..n-1).

    Parameters
    ----------
    y_pred : sized iterable of numbers
        Predicted values.
    y : sized iterable of numbers
        Target values, same length as `y_pred`.

    Returns
    -------
    float
        Share of positions where y_pred == y, y_pred + 1 == y or y_pred - 1 == y.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    ZeroDivisionError
        If the inputs are empty.
    """
    assert len(y_pred) == len(y)
    hits = 0
    for predicted, target in zip(y_pred, y):
        if predicted == target or predicted + 1 == target or predicted - 1 == target:
            hits += 1
    return hits / len(y_pred)

# Weak (off-by-one tolerant) accuracy on the in-domain test split, then on the OOD split.
for predictions, targets in ((prediction_result_test, y_test), (prediction_result_ood, y_ood)):
    print(weak_accuracy_score([int(round(p)) for p in predictions], targets))

0.5331452750352609
0.1381798002219756


In [22]:
# Predictions on the training split; they are written to disk in the last cell.
prediction_result_train = clf.predict(X_train)

In [23]:
# Persist the fitted classifier. The context manager closes the file even if
# pickling fails part-way (the explicit open/close pair leaked the handle then).
with open(chosen_model+"_SVM.pk","wb") as filehandler:
    pk.dump(clf,filehandler)

In [24]:
# Evaluate on all in-domain samples (X_id is the set that the train and test
# splits were drawn from); prints accuracy, weak accuracy, MSE and elapsed time.
t1 = time()
prediction_result_id = clf.predict(to_time_series_dataset(X_id))
# Round/cast once and reuse for all three metrics.
rounded_id = [int(round(p)) for p in prediction_result_id]
# sklearn metrics take (y_true, y_pred); both are symmetric, so values are unchanged.
print(accuracy_score(y_id, rounded_id))
print(weak_accuracy_score(rounded_id, y_id))
print(mean_squared_error(y_id, rounded_id))
t2 = time()
print(int(t2-t1), "seconds")

0.31396332863187587
0.5375176304654443
11.651198871650212
18797 seconds


In [26]:
# Write the raw predictions of every split to its own .npy file.
predictions_by_split = {
    "id": prediction_result_id,
    "ood": prediction_result_ood,
    "test": prediction_result_test,
    "train": prediction_result_train,
}
for split_name, split_predictions in predictions_by_split.items():
    np.save("X_" + split_name + "_TimeSeriesSVC_on_" + chosen_model + "_PHD.npy", split_predictions)