# Imports

In [41]:
import matplotlib.pyplot as plt
from matplotlib.figure import figaspect
import sys
import datetime
import time
import numpy as np
from typing import Any

#To import modules
sys.path.append('../')
from notebooks.dataset_helper import DatasetHelper
from codetector.src.features.shared.data.models.code_detection_sample_model import CodeDetectionSampleModel

# Temporary dataset wrapper (remove later)

In [2]:
from codetector.src.features.shared.data.models.dataset.parquet_dataset import ParquetDataset
class TestDetectionParquetDataset(ParquetDataset):
    """Minimal ``ParquetDataset`` subclass used by this notebook.

    Rows are typed as ``CodeDetectionSampleModel`` and no pre-processing is
    applied.
    """

    def getTag(self):
        """Return the fixed tag string for this dataset."""
        return 'test_detection_parquet'

    def getContentType(self):
        """Return the sample model class stored in this dataset."""
        return CodeDetectionSampleModel

    def preProcess(self):
        """Intentionally a no-op: the data is used as stored."""
        return None

# Main

In [3]:
# Load the detection results from the parquet directory (path is relative to
# the notebook's working directory).
parq = TestDetectionParquetDataset('../data/detection_parquet')
parq.loadDataset()

print('Loaded dataset')

# Dataframe view of the dataset; later cells filter it by column values.
df = parq.toDataframe()


print('Converted to dataframe')

# Helper object that provides calculateAUROCScores (used in the next cell).
helper = DatasetHelper()

Loaded dataset
Converted to dataframe


In [4]:
# Dataset tags included in the analysis.
# Other tags, currently excluded: 'stackoverflow-pre', 'hf_apps', 'hf_codesearchnet-python'
datasets = ['stackoverflow-post', 'leetcode-post']

# Keep rows that are either generated with TopP == 0.95 and Temperature == 0.97
# or have Generator == 'human', restricted to the datasets selected above.
filtered = df.loc[
    (
        (df['TopP'].eq(0.95) & df['Temperature'].eq(0.97))
        | df['Generator'].eq('human')
    )
    & df['Dataset'].isin(datasets)
]

# NOTE(review): flipList presumably names detectors whose score direction is
# inverted before computing AUROC -- confirm in DatasetHelper.
auroc: dict[str, Any] = helper.calculateAUROCScores(
    parq,
    df=filtered,
    flipList=['binoculars', 'detectcodegpt', 'rank'],
    loadingBar=True,
)
print(auroc)

Calculating AUROC: 100%|█████████████████████████████████████████████████████████████████████████| 438887/438887 [00:07<00:00, 57401.35it/s]

{'fastdetectgpt': {'codellama-instruct-13b': 0.7303432913655665, 'codellama-7b': 0.902500927254344, 'codellama-13b': 0.8202821208264957, 'codellama-instruct-7b': 0.8229034135681444, 'llama3-8b': 0.830007172937202, 'llama3-instruct-8b': 0.7673870679189828, 'starcoder2-3b': 0.8927916113933511, 'starcoder2-7b': 0.8527887135941847, 'incoder-1b': 0.9734039602795174, 'incoder-6b': 0.8663277042713474, 'phi3mini4k-instruct-4b': 0.7378783654105234, 'wavecoderultra-7b': 0.9273340688437774, 'codegen2_5-7b': 0.926412603305785, 'codegeex2-6b': 0.9216723510686676, 'phi-1b': 0.9633760570824523, 'codegemma-instruct-7b': 0.40508492208398794}, 'binoculars': {'codellama-instruct-13b': 0.7313060962736336, 'codellama-7b': 0.9000593923271274, 'codellama-13b': 0.8088300979204173, 'codellama-instruct-7b': 0.8376945091940659, 'llama3-8b': 0.8190973261010833, 'llama3-instruct-8b': 0.7541281104161043, 'starcoder2-3b': 0.8522316581093597, 'starcoder2-7b': 0.8050357826345669, 'incoder-1b': 0.9596680892058089, 'inc




In [42]:



def scatterPlot(mapping:dict[str,Any], auroc:dict[str,float], xLabel:str=None, title:str=None, suffix:str='', xLim:tuple=None, yLim:tuple=None, fit:bool=False, **kargs) -> None:
    """Scatter per-model AUROC against a per-model property and save the figure.

    Every model that appears in ``auroc`` and has a non-``None`` entry in
    ``mapping`` contributes one point ``(mapping[model], auroc[model])``.
    The figure is written to ``./figures/auroc_vs/{title}_{suffix}.png`` and
    ``./figures/auroc_vs/{title}_{suffix}.svg`` and is then closed; nothing is
    displayed inline.

    Args:
        mapping: Model name -> x value. Models that are missing from the
            mapping, or mapped to ``None``, are skipped.
        auroc: Model name -> AUROC score (the y value).
        xLabel: Optional x-axis label.
        title: Optional plot title; also used in the output file names.
        suffix: Appended to the output file names after the title.
        xLim: Optional ``(min, max)`` limits for the x axis.
        yLim: Optional ``(min, max)`` limits for the y axis.
        fit: When true, overlay a least-squares straight line. The line is
            skipped when there are fewer than two distinct x values, because
            a slope cannot be determined from them.
        **kargs: Forwarded unchanged to the scatter call (e.g. ``s``, ``c``).
    """
    # Compare against None instead of relying on truthiness so that a
    # legitimate x value of 0 is still plotted.
    models = [model for model in auroc if mapping.get(model) is not None]
    xs = [mapping[model] for model in models]
    ys = [auroc[model] for model in models]

    w, h = figaspect(1)
    fig, ax = plt.subplots(figsize=(w, h))
    try:
        if title:
            ax.set_title(title)

        # xLim must go to the x axis (it was previously applied to the y axis
        # and then overwritten by yLim).
        if xLim:
            ax.set_xlim(xLim[0], xLim[1])
        if yLim:
            ax.set_ylim(yLim[0], yLim[1])

        ax.scatter(xs, ys, **kargs)
        ax.set_ylabel('AUROC')
        if xLabel:
            ax.set_xlabel(xLabel)

        if fit:
            xArr = np.asarray(xs, dtype=float)
            # A degree-1 fit needs at least two distinct x positions.
            if np.unique(xArr).size >= 2:
                slope, intercept = np.polyfit(xArr, ys, 1)
                ax.plot(xArr, xArr * slope + intercept, color='red', linestyle='--')

        # Global matplotlib setting, applied before the SVG is written.
        plt.rcParams['svg.fonttype'] = 'none'

        fig.savefig(f'./figures/auroc_vs/{title}_{suffix}.png')
        fig.savefig(f'./figures/auroc_vs/{title}_{suffix}.svg')
    finally:
        # Always release the figure, even if plotting or saving failed, so
        # repeated calls in a loop do not accumulate open figures.
        plt.close(fig)

## AUROC vs Training Date

In [43]:


# Model name -> training date as an integer Unix timestamp for the first day
# of the given (year, month).
# Note: time.mktime interprets the date in the machine's local timezone, so
# the absolute values depend on where the notebook is run.
# NOTE(review): the llama3 entries use the same date as the codellama ones --
# confirm this is intended.
trainingDates = {
    model: int(time.mktime(datetime.date(year, month, 1).timetuple()))
    for model, (year, month) in {
        'codellama-13b': (2023, 7),
        'codellama-instruct-13b': (2023, 7),
        'llama3-8b': (2023, 7),
        'llama3-instruct-8b': (2023, 7),
        'codellama-7b': (2023, 7),
        'codellama-instruct-7b': (2023, 7),
        'codegen2_5-7b': (2023, 7),
        'codegeex2-6b': (2023, 7),
        'starcoder2-7b': (2024, 2),
        'codegemma-instruct-7b': (2024, 4),
        'wavecoderultra-7b': (2024, 4),
        'incoder-6b': (2022, 4),
        'phi3mini4k-instruct-4b': (2024, 4),
        'starcoder2-3b': (2024, 2),
        'phi-1b': (2023, 6),
        'incoder-1b': (2022, 4),
        'openaio1-mini': (2023, 10),
    }.items()
}

# One figure per detector: AUROC against training date, with a linear fit.
for detector in auroc:
    scatterPlot(
        trainingDates,
        auroc[detector],
        xLabel='Training Date',
        suffix='training_date',
        yLim=(0, 1),
        fit=True,
        title=detector.capitalize(),
        s=10,
        c='black',
    )

## AUROC vs HumanEval

In [44]:
# Model name -> HumanEval score as a fraction in [0, 1].
# Entries flagged "unverified" were marked with '?' by the author; each one
# carries the same value as its base (non-instruct) model.
humanEval = {
    'codellama-13b':          0.427,
    'codellama-instruct-13b': 0.427,   # unverified: same value as codellama-13b
    'llama3-8b':              0.622,
    'llama3-instruct-8b':     0.622,   # unverified: same value as llama3-8b
    'codellama-7b':           0.348,
    'codellama-instruct-7b':  0.348,   # unverified: same value as codellama-7b
    'codegen2_5-7b':          0.2836,
    'codegeex2-6b':           0.359,
    'starcoder2-7b':          0.354,
    'codegemma-instruct-7b':  0.561,
    'wavecoderultra-7b':      0.799,
    'incoder-6b':             0.15,
    'phi3mini4k-instruct-4b': 0.585,
    'starcoder2-3b':          0.317,
    'phi-1b':                 0.45,
    'incoder-1b':             0.08,
    'openaio1-mini':          0.924,   # NOTE(review): author's comment here was "None" -- confirm value
}

# One figure per detector: AUROC against HumanEval score, with a linear fit.
for detector in auroc:
    scatterPlot(
        humanEval,
        auroc[detector],
        xLabel='Human Eval',
        suffix='human_eval',
        xLim=(0, 1),
        yLim=(0, 1),
        fit=True,
        title=detector.capitalize(),
        s=10,
        c='black',
    )

## AUROC vs Model Size

In [45]:
# Model name -> parameter count. Each value follows the size suffix in the
# model name (e.g. '-7b' -> 7_000_000_000).
modelSizes = {
    'codellama-13b': 13_000_000_000,
    'codellama-instruct-13b': 13_000_000_000,
    'llama3-8b': 8_000_000_000,
    # Fixed: was 13_000_000_000, which contradicts the '-8b' suffix and the
    # base 'llama3-8b' entry above.
    'llama3-instruct-8b': 8_000_000_000,
    'codellama-7b': 7_000_000_000,
    'codellama-instruct-7b': 7_000_000_000,
    'codegen2_5-7b': 7_000_000_000,
    'codegeex2-6b': 6_000_000_000,
    'starcoder2-7b': 7_000_000_000,
    'codegemma-instruct-7b': 7_000_000_000,
    'wavecoderultra-7b': 7_000_000_000,
    'incoder-6b': 6_000_000_000,
    'phi3mini4k-instruct-4b': 4_000_000_000,
    'starcoder2-3b': 3_000_000_000,
    'phi-1b': 1_000_000_000,
    'incoder-1b': 1_000_000_000,
    'openaio1-mini': 100_000_000_000,  # Not sure (author's estimate; size is not grounded here)
}

# One figure per detector: AUROC against model size, with a linear fit.
for detector in auroc:
    scatterPlot(
        modelSizes,
        auroc[detector],
        xLabel='Model Size',
        suffix='model_size',
        yLim=(0, 1),
        fit=True,
        title=detector.capitalize(),
        s=10,
        c='black',
    )