In [1]:
import pandas as pd
import random
import matplotlib.pyplot as plt

In [6]:
# process raw data into single uniform file
base = "output/rawdata/"
allfiles = [
    "bigdsetp1",
    "bigdsetp2",
    "bigdsetp3",
    "bigdsetp4",
    "bigdsetp5",
    "biggerdset",
    "biggerdset2"
]
fulldset = pd.concat([pd.read_json(base+f+".jsonl", orient='records', lines=True) for f in allfiles])
fulldset = fulldset[fulldset['ver']=='first'].drop(columns=['stats', 'ver', 'pref', 'prefix']).reset_index(drop=True)

In [19]:
def random_prefix_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    new_rows = []
    REPEATS = 2
    for index, row in df.iterrows():
        input_str = row['inp']
        # batches of 3 for consistency
        hyps = row['hyps'][:3]
        scos = row['scos'][:3]
        # get more than 1 prefix per example for robustness
        
        for hyp, sco in zip(hyps, scos):
            words = hyp.split(' ')
            if len(words)<=3:
                continue
            for i in range(REPEATS):
                prefix_length = random.randint(3, len(words))
                prefix = ' '.join(words[:prefix_length])

                new_row = {
                    'inp': input_str,
                    'hyp': prefix,
                    'pflen':prefix_length,
                    'sco': sco
                }
                new_rows.append(new_row)

    new_df = pd.DataFrame(new_rows)
    return new_df

In [28]:
prefdataset = random_prefix_dataframe(fulldset)
prefdataset['label']=(prefdataset.sco>0.85).astype(int)

In [29]:
prefdataset.to_json("output/prefmetricdataset.jsonl", orient="records", lines=True)

Unnamed: 0,inp,hyp,pflen,sco,label
0,Why are different tiers (regular < mid < premi...,Is this a practical or theoretical issue? I ha...,14,0.325651,0
1,Why are different tiers (regular < mid < premi...,Is this a practical or theoretical issue?,7,0.325651,0
2,Why are different tiers (regular < mid < premi...,"The fact that the gas is a tier, and that the ...",30,0.930721,1
3,Why are different tiers (regular < mid < premi...,"The fact that the gas is a tier, and that the ...",55,0.930721,1
4,Why are different tiers (regular < mid < premi...,Essentially it's the difference between the co...,21,0.765635,0
...,...,...,...,...,...
122891,Why do sunburns continue to stay hot for so lo...,"The hotter the burn, the longer the pain will ...",43,0.558900,0
122892,Why do sunburns continue to stay hot for so lo...,"After a burn,",3,0.872084,1
122893,Why do sunburns continue to stay hot for so lo...,"After a burn, the skin releases histamine, whi...",13,0.872084,1
122894,Why do sunburns continue to stay hot for so lo...,Many substances are released during the bliste...,14,0.889495,1


In [2]:
prefdataset = pd.read_json("output/prefmetricdataset.jsonl", orient="records", lines=True)
test_exs = pd.read_json("pftest.jsonl", orient="records", lines=True)

In [5]:
test_exs

Unnamed: 0,inp,hyp,pflen,sco,label
0,What are good and bad sides of manual and auto...,The driver has more control over the gearshift...,28,0.868728,1
1,What are good and bad sides of manual and auto...,The driver has more control over the gearshift...,14,0.868728,1
2,What are good and bad sides of manual and auto...,The advantage of a manual gearbox is it allows...,34,0.889394,1
3,What are good and bad sides of manual and auto...,The advantage of a manual gearbox is it allows...,32,0.889394,1
4,What are good and bad sides of manual and auto...,You have to be very good,6,0.553276,0
...,...,...,...,...,...
12289,Why do you get the spins when your drunk?Why d...,Alcohol causes the deterioration of the brain'...,11,0.788086,0
12290,Why do you get the spins when your drunk?Why d...,Drunkenness (intoxication) involves the loss o...,7,0.809169,0
12291,Why do you get the spins when your drunk?Why d...,Drunkenness (intoxication) involves the loss o...,17,0.809169,0
12292,Why do you get the spins when your drunk?Why d...,The spins are caused by,5,0.693356,0


In [9]:
merged_df = prefdataset.merge(test_exs, indicator=True, how='outer')
train_df = merged_df[merged_df['_merge'] == 'left_only'].reset_index(drop=True)

In [11]:
train_df.to_json("pftrain.jsonl", orient='records', lines=True)

In [14]:
train_df

Unnamed: 0,inp,hyp,pflen,sco,label,_merge
0,Why are different tiers (regular < mid < premi...,Is this a practical or theoretical issue? I ha...,14,0.325651,0,left_only
1,Why are different tiers (regular < mid < premi...,Is this a practical or theoretical issue?,7,0.325651,0,left_only
2,Why are different tiers (regular < mid < premi...,"The fact that the gas is a tier, and that the ...",30,0.930721,1,left_only
3,Why are different tiers (regular < mid < premi...,"The fact that the gas is a tier, and that the ...",55,0.930721,1,left_only
4,Why are different tiers (regular < mid < premi...,Essentially it's the difference between the co...,21,0.765635,0,left_only
...,...,...,...,...,...,...
110597,Why do sunburns continue to stay hot for so lo...,"The hotter the burn, the longer the pain will ...",43,0.558900,0,left_only
110598,Why do sunburns continue to stay hot for so lo...,"After a burn,",3,0.872084,1,left_only
110599,Why do sunburns continue to stay hot for so lo...,"After a burn, the skin releases histamine, whi...",13,0.872084,1,left_only
110600,Why do sunburns continue to stay hot for so lo...,Many substances are released during the bliste...,14,0.889495,1,left_only
