# Calculate PPL

In this notebook we show how to retrieve the cross entropy and perplexity, use perplexity to determine the authorship of questioned documents, and log the benchmark results against the true labels.

Install dependencies.

In [None]:
!pip install -U attrs setuptools wheel
!pip install cupy-cuda11x
!pip install -U spacy
!python -m spacy download en_core_web_trf
!pip3 install tqdm torch transformers datasets torchvision torchaudio 

Load packages.

In [None]:
import os,shutil,glob,math,ast,copy,io,zipfile,csv

from tqdm import tqdm

import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from datasets import Dataset
from transformers import GPT2LMHeadModel,GPT2TokenizerFast,GPT2Tokenizer,AutoTokenizer

import pandas as pd
from pandarallel import pandarallel 
pandarallel.initialize(nb_workers=6)

import spacy
spacy.require_gpu()

import numpy as np

from multiprocess import Pool
proc_num=8

from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score

import warnings
warnings.filterwarnings(action="ignore")


Set up variables here.

test_data_path: the path to the test.csv for testing.

model_path: the path containing the fine-tuned authorial GPT-2s.

ce_log_home: the directory to hold logs for cross entropy

device: the device used to run model inference. "cuda" by default.

key_var_tag: the name of the column in test.csv that holds the author label. For GEFA, use author_tag.

ppl_dfs_buffer_home: the directory to hold ppl calculation results.

test_text_limits: the test text lengths (in tokens) used for evaluation.

pred_df_buffer_home: the directory to hold text by text test results against true labels. 

benchmark_results_df_home: the directory to hold evaluation metrics.

In [6]:
# --- Inputs -----------------------------------------------------------------
test_data_path = "corpus/GEFA-full/test.csv"   # CSV with the test texts
model_path = "model"                            # holds one fine-tuned model directory per author
key_var_tag = "author_tag"                      # column of the test CSV used as author label
device = "cuda"                                 # device the language models are moved to

# --- Evaluation settings ----------------------------------------------------
# Each limit truncates every test text to that many tokens before scoring.
test_text_limits = [
    100, 200, 300, 400,
    50, 150, 250, 350, 450,
    10, 20, 30, 40, 60, 70, 80, 90,
]

# --- Outputs ----------------------------------------------------------------
result_path = "ppl_result.csv"                  # one line per finished (model, text author) pair
ce_log_home = "ce_log"                          # per-token cross-entropy logs
ppl_dfs_buffer_home = "ppl_dfs_buffer"          # per-text perplexity tables
pred_df_buffer_home = "pred_df_buffer"          # per-text predictions vs. true labels
benchmark_results_df_home = "benchmark_results_df_home"  # evaluation metrics


Load working environments & datasets.

In [None]:
# Candidate authors: every sub-directory of model_path is treated as one
# fine-tuned model. Plain files and hidden entries (e.g. ".ipynb_checkpoints")
# are skipped because they cannot be loaded as models later on.
model_author_tags=sorted(
    entry for entry in os.listdir(model_path)
    if os.path.isdir(os.path.join(model_path,entry)) and not entry.startswith(".")
)

total_dataset=Dataset.from_csv(test_data_path)
# Sorted by string form so the processing order is the same on every run
# (a bare set has no stable order).
text_author_tags=sorted(set(total_dataset[key_var_tag]),key=str)

# Make sure the result log exists; later cells read it and append to it.
if(not(os.path.isfile(result_path))):
    with open(result_path,"w",-1,"utf-8") as f:
        f.write("")

# One test subset per true author, keyed by the author tag as a string.
test_sets={}
for text_author_tag in text_author_tags:
    test_sets[str(text_author_tag)]=total_dataset.filter(lambda daton:str(daton[key_var_tag])==str(text_author_tag))

# makedirs also creates missing parent directories and is a no-op when the
# directory already exists.
for output_dir in (ce_log_home,ppl_dfs_buffer_home,pred_df_buffer_home,benchmark_results_df_home):
    os.makedirs(output_dir,exist_ok=True)

Initialize spaCy for on-the-fly tagging.

In [None]:
# Request GPU execution for spaCy before the pipeline is loaded.
spacy.require_gpu()
# Load the "en_core_web_trf" pipeline (downloaded in the install cell).
# `nlp` is used in the cross-entropy cell to tag the GPT-2 tokens of each text.
nlp=spacy.load("en_core_web_trf")

Start calculating cross entropy and on the fly tagging.

In [None]:
def _read_logged_pairs(path):
    """Return the set of (model tag, text tag) pairs already present in the result log."""
    with open(path,"r",-1,"utf-8") as f:
        # Blank or truncated rows are ignored instead of raising IndexError.
        return {(row[0],row[1]) for row in csv.reader(f) if len(row)>=2}


# Step of the sliding evaluation window. Defined once up front because it is
# also written to the result log, even when a test set turns out to be empty.
stride=128
# reduction="none" returns one loss per token from a single call, instead of
# one GPU round trip per token.
token_loss_fct=CrossEntropyLoss(reduction="none")

for model_author_tag in model_author_tags:

    # Work out which text authors still need scoring BEFORE loading the model,
    # so a fully logged model is skipped without touching the GPU.
    logged_pairs=_read_logged_pairs(result_path)
    pending_text_author_tags=[]
    for text_author_tag in text_author_tags:
        if((str(model_author_tag),str(text_author_tag)) in logged_pairs):
            print(f"skipped logged pair {str(model_author_tag)},{str(text_author_tag)}")
        else:
            pending_text_author_tags.append(text_author_tag)
    if(not pending_text_author_tags):
        continue

    model_id=os.path.join(model_path,model_author_tag)

    model=GPT2LMHeadModel.from_pretrained(model_id).to(device)
    model.eval()  # scoring only: make sure dropout is disabled
    tokenizer=AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    max_length=model.config.n_positions

    for text_author_tag in pending_text_author_tags:

        print(f"start processing pair {str(model_author_tag)},{str(text_author_tag)}")

        # set up test set

        test_set=test_sets[str(text_author_tag)]
        records=[]
        nlls=[]  # one mean negative log-likelihood per evaluated window, over all texts
        for test_text_sample in tqdm(test_set["text"]):
            test_text=test_text_sample.replace("<BOS>","").replace("<EOS>","")
            encodings=tokenizer(test_text,return_tensors="pt")# tokenize

            # On the fly pos-tagging & dep & ner, using the tokens from gpt2 tokenizer

            words=tokenizer.batch_decode(encodings.input_ids[0][:]) # decode each token id separately for the spacy tagger
            tokens=list(words)
            lemmas=[]
            poss=[]
            tags=[]
            shapes=[]
            alphas=[]
            stops=[]
            morphs=[]
            deps=[]
            ent_types=[]
            # Build the Doc from the GPT-2 tokens so spaCy annotations line up
            # one-to-one with the tokens that are scored below.
            raw_doc=spacy.tokens.Doc(nlp.vocab,words)
            doc=nlp(raw_doc)
            for token in doc:
                lemmas.append(token.lemma_)
                poss.append(token.pos_)
                tags.append(token.tag_)
                shapes.append(token.shape_)
                alphas.append(token.is_alpha)
                stops.append(token.is_stop)
                morphs.append(str(token.morph))
                deps.append(token.dep_)
                ent_types.append(token.ent_type_)

            # Calculate and record cross entropy loss
            losses=[]
            seq_len=encodings.input_ids.size(1)
            prev_end_loc = 0
            for begin_loc in range(0, seq_len, stride):
                end_loc = min(begin_loc + max_length, seq_len)
                trg_len = end_loc - prev_end_loc  # tokens not scored by an earlier window
                input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
                target_ids=input_ids.clone()
                # Tokens already scored by a previous window only serve as context.
                target_ids[:, :-trg_len]=-100
                with torch.no_grad():
                    outputs=model(input_ids, labels=target_ids)
                    shift_labels=input_ids[..., 1:].contiguous()
                    shift_logits=outputs.logits[..., :-1, :].contiguous()
                    if(shift_labels.numel()>0):
                        window_losses=token_loss_fct(
                            shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1),
                        )
                        # Keep only losses of tokens that are new in this window.
                        # Without this, the overlapping context of every window
                        # after the first is logged again and `losses` no longer
                        # lines up with `tokens`.
                        first_new=max(input_ids.size(1)-trg_len-1,0)
                        losses.extend(window_losses[first_new:].tolist())
                neg_log_likelihood=outputs.loss
                # A one-token window has no prediction target, which makes the
                # mean loss NaN; keep such values out of the perplexity.
                if(torch.isfinite(neg_log_likelihood)):
                    nlls.append(neg_log_likelihood)
                prev_end_loc = end_loc
                if end_loc == seq_len:
                    break
            record=[tokens,losses,lemmas,poss,tags,shapes,alphas,stops,morphs,deps,ent_types]
            records.append(record)

        # Write text_by_text cross entropy data.
        # NOTE: despite the ".7z" suffix this is a ZIP archive using LZMA
        # compression; the perplexity cell reads it back as a zip file.

        ce_log_fn="-".join([str(model_author_tag),str(text_author_tag)])+".csv"
        ce_log_fn_zip=ce_log_fn+".7z"
        with zipfile.ZipFile(os.path.join(ce_log_home,ce_log_fn_zip),"w",compression=zipfile.ZIP_LZMA) as z:
            with z.open(ce_log_fn,"w") as f:
                # newline="" lets the csv writer control line endings itself.
                text_f=io.TextIOWrapper(f,"utf-8",newline="")
                csvw=csv.writer(text_f)
                csvw.writerows(records)
                text_f.flush()

        # Write result log to the csv at result_path

        # calculate per author perplexity -- this is for earlier stage research
        if(nlls):
            ppl=float(torch.exp(torch.stack(nlls).mean()))
        else:
            # No window could be scored (empty test set or only empty texts).
            ppl=float("nan")
        result_record=[str(model_author_tag),str(text_author_tag),str(float(stride)),str(ppl)]
        record_line=",".join(result_record)
        print(record_line)

        with open(result_path,"a",-1,"utf-8") as f:
            f.write(record_line+"\n")


Start calculating perplexity per text.

In [None]:
# Output file pattern; "[TEST_TEXT_LIMITS]" is replaced by each test text limit.
output_path_base=os.path.join(
    ppl_dfs_buffer_home,
    "ppl_dfs_buffer-full-[TEST_TEXT_LIMITS].csv",
)

# Column names, in order, of every row stored in a cross-entropy log.
template_feature_categories=[
    "tokens",
    "losses",
    "lemmas",
    "poss",
    "tags",
    "shapes",
    "alphas",
    "stops",
    "morphs",
    "deps",
    "ent_types",
]

# Loss columns for which a per-text perplexity is computed.
selected_all=[
    "losses_shifted",
    "losses",
]

def resstr2list(item):
    """Parse the stringified list columns of one log row and align the losses.

    Reads the module-level `feature_categories` (columns to parse) and
    `test_text_limit` (when it is an int, every parsed list is cut to that
    many leading elements).

    For each column the parsed list and a "<column>_count" entry are stored
    back on `item`. The losses are then cut to at most one fewer than the
    token count, padded with a trailing 0.0, and a copy shifted right by one
    position is stored as "losses_shifted". "uneq_alarm" is True when every
    recorded count is identical.

    Returns the same (mutated) `item`.
    """
    zero=float(0)
    seen_counts=[]

    for column in feature_categories:
        parsed=ast.literal_eval(str(item[column]))
        if type(test_text_limit) is int:
            parsed=parsed[0:test_text_limit]
        item[column]=parsed
        item[column+"_count"]=len(item[column])
        seen_counts.append(item[column+"_count"])

    # There is one loss fewer than tokens (the first token has no prediction);
    # anything else is cut back to that length.
    expected_losses=item["tokens_count"]-1
    if item["losses_count"]!=expected_losses:
        item["losses"]=item["losses"][0:expected_losses]

    # Pad at the end so the losses have one entry per token.
    item["losses"]=item["losses"]+[zero]
    item["losses_count"]=len(item["losses"])
    seen_counts.append(item["losses_count"])

    # Same values moved one position to the right, padded at the front.
    item["losses_shifted"]=[zero]+item["losses"][:-1]
    item["losses_shifted_count"]=len(item["losses_shifted"])
    seen_counts.append(item["losses_shifted_count"])

    # NOTE(review): despite its name this flag is True when all counts agree.
    item["uneq_alarm"]=len(set(seen_counts))==1
    return item

def corpusrow2textdf(item):
    """Attach a per-token table to one corpus row.

    The columns named in the module-level `feature_categories` are placed in
    a one-row frame and exploded together, giving one row per list element.
    The result is stored under "text_df" and the row is returned.
    """
    single_row_frame=pd.DataFrame([item[feature_categories]])
    item["text_df"]=single_row_frame.explode(feature_categories)
    return item

def cal_ppl(losses):
    """Return exp(mean(losses)), or 0 when `losses` is empty."""
    count=len(losses)
    if count==0:
        return 0
    return math.exp(sum(losses)/count)

def cal_ppl_global(item):
    """Add one whole-text perplexity per loss column to `item`.

    For every column named in the module-level `selected_all`, the values of
    that column in item["text_df"] are passed to `cal_ppl` and the result is
    stored under "global-ppl:(<column>)". Returns the same `item`.
    """
    for loss_column in selected_all:
        column_losses=list(item["text_df"][loss_column])
        item[f"global-ppl:({loss_column})"]=cal_ppl(column_losses)
    return item

# Every cross-entropy log is named "<model tag>-<text tag>.csv.7z"; recover
# both tags from the file names found in ce_log_home.
fps=glob.glob(os.path.join(ce_log_home,"*.csv.7z"),recursive=True)
tag_pairs=[os.path.basename(log_path).split(".csv.7z")[0].split("-") for log_path in fps]

# De-duplicated tag lists.
model_tags=list({pair[0] for pair in tag_pairs})
text_tags=list({pair[1] for pair in tag_pairs})

# Every (model, text author) combination is one task.
tasks=[
    (model_tag,text_tag)
    for model_tag in model_tags
    for text_tag in text_tags
]

# NOTE: `test_text_limit` and `feature_categories` must stay module-level
# names: resstr2list and corpusrow2textdf read them as globals.
for test_text_limit in test_text_limits:

    output_path=output_path_base.replace("[TEST_TEXT_LIMITS]",str(test_text_limit))

    # Collected per task and concatenated once at the end.
    ppl_df_parts=[]

    for task in tqdm(tasks):

        feature_categories=copy.deepcopy(template_feature_categories)

        fp=os.path.join(ce_log_home,f"{task[0]}-{task[1]}.csv.7z")

        # The log is a zip archive despite its ".7z" suffix.
        corpus_df=pd.read_csv(fp,names=feature_categories,compression="zip")

        # Parse the stringified lists and truncate them to the current limit.
        corpus_df=corpus_df.parallel_apply(resstr2list,axis=1)

        # resstr2list added this column; include it in the per-token tables.
        feature_categories=feature_categories+["losses_shifted"]

        # Single-column frame ("text_df") holding one per-token table per text.
        text_dfs=pd.DataFrame([corpus_df.parallel_apply(corpusrow2textdf,axis=1)["text_df"]]).transpose()

        # The perplexities are computed from the per-text tables built above.
        # (This line previously read `ppl_df` before it was ever assigned.)
        ppl_df=text_dfs.parallel_apply(cal_ppl_global,axis=1)
        ppl_df=ppl_df.filter(regex="ppl:",axis=1)

        ppl_df["candidate_tag"]=task[0]
        ppl_df["true_tag"]=task[1]

        ppl_df=ppl_df.reset_index().rename(columns={"index":"text_num"}).set_index(["true_tag","candidate_tag","text_num"])

        ppl_df_parts.append(ppl_df)

    # An empty task list still produces an (empty) output file, as before.
    ppl_dfs=pd.concat(ppl_df_parts) if ppl_df_parts else pd.DataFrame()

    ppl_dfs.to_csv(output_path)

Use perplexity per text to determine author, benchmark result against true labels, and then generate evaluation metrics.

In [None]:
# Glob pattern matching every per-text perplexity table. The wildcard is
# required: a bare ".csv" only matches a file literally named ".csv", so no
# perplexity table would ever be picked up.
ppl_dfs_buffer_template=os.path.join(ppl_dfs_buffer_home,"*.csv")
# Output path pattern for the prediction tables; both placeholders are filled
# in from the name of the perplexity table being processed.
pred_df_buffer_template=os.path.join(pred_df_buffer_home,"pred_df_buffer-[TRAIN_TEXT_LIMIT]-[TEST_TEXT_LIMIT].csv")

def ppl_dfs_2_pred_df(ppl_dfs_buffer_paths):
    """Turn per-text perplexity tables into per-text author predictions.

    For every path in `ppl_dfs_buffer_paths` the table is read and, for each
    "ppl:" column, true author and text number, the candidate author with the
    lowest perplexity is recorded as the prediction. One CSV per input file
    is written to the location derived from the module-level
    `pred_df_buffer_template`.

    Args:
        ppl_dfs_buffer_paths: iterable of paths to perplexity tables whose
            base names look like "<prefix>-<train limit>-<test limit>.csv".

    Returns:
        None. Results are written to disk.
    """
    for ppl_dfs_buffer_path in tqdm(ppl_dfs_buffer_paths):

        # "<prefix>-<train limit>-<test limit>.csv" -> the two limits.
        name_parts=os.path.basename(ppl_dfs_buffer_path).split(".csv")[0].split("-")
        train_text_limit=name_parts[1]
        test_text_limit=name_parts[2]
        pred_df_buffer_path=pred_df_buffer_template.replace("[TEST_TEXT_LIMIT]",test_text_limit).replace("[TRAIN_TEXT_LIMIT]",train_text_limit)

        ppl_dfs=pd.read_csv(ppl_dfs_buffer_path)
        true_tags=list(set(ppl_dfs["true_tag"]))
        # Indexing by candidate lets idxmin() return the candidate tag directly.
        ppl_dfs=ppl_dfs.set_index("candidate_tag")

        by_features=[column for column in ppl_dfs.columns if "ppl:" in column]

        # One frame per (feature, true author); concatenated once at the end
        # rather than re-copying the growing result on every iteration.
        pred_frames=[]

        for by_feature in by_features:

            for true_tag in true_tags:

                ppl_dfs_batch=ppl_dfs[ppl_dfs["true_tag"]==true_tag]

                text_nums=list(set(ppl_dfs_batch["text_num"]))

                pred_logs=[]

                for text_num in text_nums:

                    # Rows of this text: one per candidate author.
                    text_res=ppl_dfs_batch[ppl_dfs_batch["text_num"]==text_num]
                    # The candidate with the lowest perplexity is the prediction.
                    pred_tag=text_res[by_feature].idxmin()

                    pred_logs.append({"by":by_feature,"true_tag":true_tag,"text_num":text_num,"pred_tag":pred_tag})

                pred_frames.append(pd.DataFrame(pred_logs))

        # An input without any "ppl:" column still yields an (empty) file.
        pred_df=pd.concat(pred_frames) if pred_frames else pd.DataFrame()

        pred_df.to_csv(pred_df_buffer_path)

# Perplexity tables available on disk.
ppl_dfs_buffer_paths=glob.glob(ppl_dfs_buffer_template,recursive=True)

# Prediction tables that already exist; their inputs are not processed again.
existing_pred_glob=pred_df_buffer_template.replace("pred_df_buffer-[TRAIN_TEXT_LIMIT]-[TEST_TEXT_LIMIT].csv","*.csv")
existing_pred_df_buffer_names=[
    os.path.basename(existing_path)
    for existing_path in glob.glob(existing_pred_glob,recursive=True)
]
ppl_dfs_buffer_paths=[
    candidate_path
    for candidate_path in ppl_dfs_buffer_paths
    if os.path.basename(candidate_path).replace("ppl_dfs_buffer-","pred_df_buffer-") not in existing_pred_df_buffer_names
]

# Split the remaining work into one batch per worker process.
ppl_dfs_buffer_paths_batches=[
    list(batch)
    for batch in np.array_split(ppl_dfs_buffer_paths,proc_num)
]

with Pool(proc_num) as p:
    p.map(ppl_dfs_2_pred_df,ppl_dfs_buffer_paths_batches)

# From here on `pred_df_buffer_template` is a glob pattern matching every
# prediction table. NOTE: this rebinds the name that ppl_dfs_2_pred_df reads
# as its output path pattern, so that function must not be called again
# after this line without re-running the assignment at the top of the cell.
pred_df_buffer_template=os.path.join(pred_df_buffer_home,"*.csv")
# Output path pattern for the metric tables; both placeholders are filled in
# from the name of the prediction table being processed.
benchmark_results_df_buffer_template=os.path.join(benchmark_results_df_home,"benchmark_results_df_buffer-[TRAIN_TEXT_LIMIT]-[TEST_TEXT_LIMIT].csv")

def _benchmark_metrics(y_true,y_pred):
    """Return macro F1, macro precision, macro recall and accuracy as a dict."""
    return {
        "fscore":f1_score(y_true,y_pred,average="macro"),
        "precision":precision_score(y_true,y_pred,average="macro"),
        "recall":recall_score(y_true,y_pred,average="macro"),
        "accuracy":accuracy_score(y_true,y_pred),
    }


def pred_df_2_benchmark_results_df(pred_df_buffer_paths):
    """Score prediction tables against the true labels.

    For every path in `pred_df_buffer_paths` the prediction table is read
    and, for each value of its "by" column, the metrics are computed once
    over all texts (row with true_tag "GLOBAL") and once per true author.
    One CSV per input file is written to the location derived from the
    module-level `benchmark_results_df_buffer_template`.

    Args:
        pred_df_buffer_paths: iterable of paths to prediction tables whose
            base names look like "<prefix>-<train limit>-<test limit>.csv".

    Returns:
        None. Results are written to disk.
    """
    for pred_df_buffer_path in tqdm(pred_df_buffer_paths):

        # "<prefix>-<train limit>-<test limit>.csv" -> the two limits.
        name_parts=os.path.basename(pred_df_buffer_path).split(".csv")[0].split("-")
        train_text_limit=name_parts[1]
        test_text_limit=name_parts[2]
        benchmark_results_df_buffer_path=benchmark_results_df_buffer_template.replace("[TEST_TEXT_LIMIT]",test_text_limit).replace("[TRAIN_TEXT_LIMIT]",train_text_limit)

        pred_df=pd.read_csv(pred_df_buffer_path)
        # "Unnamed: 0" is the index column written by to_csv; it carries no information.
        pred_df=pred_df.drop(columns=["Unnamed: 0"]).set_index("by")

        benchmark_results=[]

        features=list(set(list(pred_df.index)))
        true_tags=list(set(pred_df["true_tag"]))

        for feature in features:

            # A list of labels always selects a DataFrame. A bare label would
            # collapse to scalars when exactly one row matches, which the
            # metric functions cannot handle.
            feature_df=pred_df.loc[[feature]]

            benchmark_results.append({
                "feature":feature,
                "true_tag":"GLOBAL",
                **_benchmark_metrics(feature_df["true_tag"],feature_df["pred_tag"]),
            })

            for true_tag in true_tags:

                selected_pred_df=feature_df[feature_df["true_tag"]==true_tag]
                if(selected_pred_df.empty):
                    # Nothing to score for this author under this feature.
                    continue

                benchmark_results.append({
                    "feature":feature,
                    "true_tag":true_tag,
                    **_benchmark_metrics(selected_pred_df["true_tag"],selected_pred_df["pred_tag"]),
                })

        # Built in one go instead of concatenating row by row. The all-zero
        # index reproduces the index column earlier runs wrote, so existing
        # output files stay comparable.
        benchmark_results_df=pd.DataFrame(benchmark_results,index=[0]*len(benchmark_results))

        benchmark_results_df.to_csv(benchmark_results_df_buffer_path)

# Prediction tables available on disk.
pred_df_buffer_paths=glob.glob(pred_df_buffer_template,recursive=True)

# Metric tables that already exist; their inputs are not processed again.
existing_benchmark_glob=benchmark_results_df_buffer_template.replace("benchmark_results_df_buffer-[TRAIN_TEXT_LIMIT]-[TEST_TEXT_LIMIT].csv","*.csv")
existing_benchmark_results_df_buffer_names=[
    os.path.basename(existing_path)
    for existing_path in glob.glob(existing_benchmark_glob,recursive=True)
]
pred_df_buffer_paths=[
    candidate_path
    for candidate_path in pred_df_buffer_paths
    if os.path.basename(candidate_path).replace("pred_df_buffer-","benchmark_results_df_buffer-") not in existing_benchmark_results_df_buffer_names
]

# Split the remaining work into one batch per worker process.
pred_df_buffer_paths_batches=[
    list(batch)
    for batch in np.array_split(pred_df_buffer_paths,proc_num)
]

with Pool(proc_num) as p:
    p.map(pred_df_2_benchmark_results_df,pred_df_buffer_paths_batches)