In [27]:
import collections
import json
import pandas as pd

from pathlib import Path

In [28]:
def get_project_name(bug_entry):
    github_url = bug_entry['other']['github_commit_url']
    return '/'.join(github_url.split('/')[3:5])

def get_most_common_extension(bug_entry):
    all_filename_extensions = []
    for filename in bug_entry['filenames']:
        all_filename_extensions.append(Path(filename).suffix)
    if not all_filename_extensions:
        return ''
    return collections.Counter(all_filename_extensions).most_common(1)[0][0]

In [43]:
# Read all data into Pandas table

output_data_dir = Path('../output_data')

list_of_bug_entry = []
for bug_entry_file in output_data_dir.rglob('BugEntry.json'):
    with bug_entry_file.open() as f:
        list_of_bug_entry.append(json.load(f))
        
        vul_data = pd.DataFrame.from_records(list_of_bug_entry)

In [54]:
vul_data.head()

Unnamed: 0,buggy_code,fixing_code,filenames,buggy_code_start_loc,buggy_code_end_loc,fixing_code_start_loc,fixing_code_end_loc,type,message,other
0,[/*\n * . .o...,[/*\n * . .o...,"[src/helpers/utils/index.js, src/models/chat/m...","[21, 15]","[28, 84]","[22, 15]","[34, 91]",CWE-190,Integer Overflow or Wraparound in GitHub repos...,"{'cve': {'id': 'CVE-2022-1926', 'sourceIdentif..."
1,"[import {\n Entity,\n Column,\n PrimaryGene...","[import {\n Entity,\n Column,\n PrimaryGene...","[server/src/entities/comment.entity.ts, server...","[47, 9]","[48, 106]","[47, 9]","[48, 114]",CWE-639,The forgot password token basically just makes...,"{'cve': {'id': 'CVE-2022-3019', 'sourceIdentif..."
2,['use strict'\n\nconst crypto = require('crypt...,['use strict'\n\nconst crypto = require('crypt...,"[plugin.js, test/decorate.test.js]","[19, 22]","[102, 22]","[20, 23]","[112, 31]",CWE-203,@fastify/bearer-auth is a Fastify plugin to re...,"{'cve': {'id': 'CVE-2022-31142', 'sourceIdenti..."
3,[/*\n * card-cac.c: Support for CAC from NIST ...,[/*\n * card-cac.c: Support for CAC from NIST ...,"[src/libopensc/card-cac.c, src/libopensc/card-...","[797, 954, 521, 411, 82, 211, 840, 631, 23, 15...","[798, 979, 531, 437, 83, 212, 846, 632, 386, 1...","[797, 954, 521, 411, 82, 211, 840, 631, 24, 15...","[798, 981, 534, 437, 83, 212, 848, 632, 387, 1...",CWE-119,Several buffer overflows when handling respons...,"{'cve': {'id': 'CVE-2018-16421', 'sourceIdenti..."
4,[/*\n #\n # File : CImg.h\n # ...,[/*\n #\n # File : CImg.h\n # ...,[CImg.h],[53186],[55051],[53187],[55065],CWE-770,"A flaw was found in Clmg, where with the help ...","{'cve': {'id': 'CVE-2022-1325', 'sourceIdentif..."


0
1
2
3
4
...
8482
8483
8484
8485
8486


In [55]:
# Add more columns

vul_data['project_name'] = vul_data.apply(lambda row: get_project_name(row), axis=1)
vul_data['extension'] = vul_data.apply(lambda row: get_most_common_extension(row), axis=1)

In [56]:
vul_data['type']

0              CWE-190
1              CWE-639
2              CWE-203
3              CWE-119
4              CWE-770
             ...      
8482    NVD-CWE-noinfo
8483            CWE-79
8484            CWE-89
8485           CWE-369
8486            CWE-79
Name: type, Length: 8487, dtype: object

In [75]:
projectext = vul_data.filter(items = ['project_name','extension'], axis = 1)
projectext.value_counts()
projectext.to_csv('s.csv')

In [83]:
sc = pd.read_csv('s.csv')
a = sc.groupby('project_name').value_counts()
a.head()

project_name                         Unnamed: 0  extension
01-Scripts/01-Artikelsystem          3327        .php         1
01-Scripts/01ACP                     6444        .php         1
01org/opa-ff                         2793        .sh          1
01org/tpm2.0-tools                   2447        .c           1
10gen-archive/mongo-c-driver-legacy  2279        .c           1
Name: count, dtype: int64

In [76]:
a = projectext.groupby('project_name').value_counts()
a.head()


project_name                         extension
01-Scripts/01-Artikelsystem          .php         1
01-Scripts/01ACP                     .php         1
01org/opa-ff                         .sh          1
01org/tpm2.0-tools                   .c           1
10gen-archive/mongo-c-driver-legacy  .c           1
Name: count, dtype: int64

In [35]:
vul_data['type'].value_counts()[:20]

type
CWE-79            1455
CWE-125            435
CWE-787            336
CWE-476            325
NVD-CWE-noinfo     323
CWE-20             322
CWE-89             314
CWE-119            314
CWE-22             245
CWE-416            201
CWE-200            199
NVD-CWE-Other      186
CWE-190            177
CWE-352            175
CWE-287            109
CWE-400            106
CWE-78              99
CWE-863             95
CWE-362             91
CWE-401             90
Name: count, dtype: int64

In [60]:
projectName = vul_data['project_name'].value_counts()
projectName.to_csv('project.csv')

In [37]:
vul_data['extension'].value_counts()[:20]

extension
.c       2559
.php     1846
.js       642
.py       468
.cc       408
.go       323
.java     309
          257
.rb       219
.cpp      209
.h        148
.md       144
.ts       118
.cs        55
.json      54
.txt       48
.html      44
.rs        38
.xml       38
.pm        30
Name: count, dtype: int64

In [41]:
print (vul_data['extension']=='.java')


0       False
1       False
2       False
3       False
4       False
        ...  
8482    False
8483    False
8484    False
8485    False
8486    False
Name: extension, Length: 8487, dtype: bool
