# Datasets

This notebook compares the initial datasets (Secbench, Pontas et al, Big Vul and CVE Details) with the final dataset.

In [2]:
import pandas as pd

In [3]:
# Secbench
df = pd.read_csv('../sources/secbench_secse17.csv')
print('number of vulnerabilities:', len(df), '(= number of commits)')
projects = set(['{}/{}'.format(row['owner'], row['project']) for _, row in df[['owner', 'project']].iterrows()])
print('number of projects:', len(projects))
print('number of CVEs:', len(df['cve_id'].unique()))
print('number of vulnerability classes (CWEs):', len(df['cwe_id'].unique()))
print('number of languages:', len(df['language'].unique()))

number of vulnerabilities: 676 (= number of commits)
number of projects: 113
number of CVEs: 189
number of vulnerability classes (CWEs): 51
number of languages: 18


In [4]:
# CVE Details 
df = pd.read_csv('../sources//cve_details.csv')

commits, projects = set(), set()
for row in df['github']:
    link = eval(row)
    commits = commits | link
    link_info = list(link)[0].split('/')
    projects = projects | set("{}/{}".format(link_info[3], link_info[4]))

print('number of commits:', len(commits))
print('number of projects:', len(projects))
print('number of CVEs:', len(df['cve_id'].unique()), '(= number of vulnerabilities)')
print('number of vulnerability classes (CWEs):', len(df['cwe_id'].unique()))

number of commits: 4529
number of projects: 66
number of CVEs: 4183 (= number of vulnerabilities)
number of vulnerability classes (CWEs): 119


In [5]:
# Pontas et al.
df = pd.read_csv('../sources//pontas_msr19.csv')
print('number of commits:', len(df))
print('number of projects:', len(df['project'].unique()))
print('number of CVEs:', len(df['cve_id'].unique()))

number of commits: 1282
number of projects: 205
number of CVEs: 624


In [6]:
# Big-Vul 
df = pd.read_csv('../sources//big_vul_msr20.csv')

print('number of CVEs:', len(df['cve_id'].unique()))
print('number of commits:', len(commits))
print('number of projects:', len(projects))
print('number of vulnerability classes (CWEs):', len(df['cwe_id'].unique()))

number of CVEs: 3755
number of commits: 4529
number of projects: 66
number of vulnerability classes (CWEs): 92


In [8]:
df = pd.read_csv('../dataset/positive.csv')

print('Secbench')
print('number of commits and patches:', len(df[df['dataset'] == 'SECBENCH']))

print('\nPontas et al.')
print('number of commits:', len(df[df['dataset'] == 'PONTAS']))
print('number of patches:', len(df[df['dataset'] == 'PONTAS']['cve_id'].unique()))

print('\nBig Vul')
print('number of commits:', len(df[df['dataset'] == 'BIGVUL']))
print('number of patches:', len(df[df['dataset'] == 'BIGVUL']['cve_id'].unique()))

print('\nCVE Details')
print('number of commits:', len(df[df['dataset'] == 'CVEDETAILS']))
print('number of patches:', len(df[df['dataset'] == 'CVEDETAILS']['cve_id'].unique()))

Secbench
number of commits and patches: 659

Pontas et al.
number of commits: 1127
number of patches: 565

Big Vul
number of commits: 4047
number of patches: 3433

CVE Details
number of commits: 2224
number of patches: 1816
