# Statistics on the output data. 

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import lizard
import subprocess as sub
from pylibsrcml import srcml
import os
import xml.etree.ElementTree as et 

df = pd.read_csv('../data/output-tunslip.csv')

if not os.path.exists("figure"):
    os.mkdir("figure")

df.columns

Index(['File', 'Line', 'Column', 'DefaultLevel', 'Level', 'Category', 'Name',
       'ToolVersion', 'RuleId', 'HelpUri'],
      dtype='object')

In [20]:
dfp = df[['Category', 'Name', 'CWEs']].groupby(['Category', 'Name', 'CWEs'], group_keys=False).size().reset_index(name='Count')
print(dfp)
fig = px.sunburst(dfp, path=['Category', 'Name', 'CWEs'], values='Count')
# fig.write_image("figure/vul_statistics.pdf")
fig.show()
df.head(3)

    Category       Name              CWEs  Count
0     buffer       char  CWE-119!/CWE-120      6
1     buffer     getopt   CWE-120, CWE-20      1
2     buffer     memcpy           CWE-120     11
3     buffer       read   CWE-120, CWE-20      1
4     buffer     strcat           CWE-120      1
5     buffer     strcpy           CWE-120      4
6     buffer    strncpy           CWE-120      1
7     format  vsnprintf           CWE-134      1
8    integer       atoi           CWE-190      2
9       misc       open           CWE-362      2
10  obsolete     usleep           CWE-676      1
11     shell     system            CWE-78      1


Unnamed: 0,File,Line,Column,DefaultLevel,Level,Category,Name,Warning,Suggestion,Note,CWEs,Context,Fingerprint,ToolVersion,RuleId,HelpUri
0,data/projects/contiki-2.4/tools/tunslip.c,438,3,4,4,format,vsnprintf,If format strings can be influenced by an atta...,Use a constant for the format specification.,,CWE-134,"vsnprintf(cmd, sizeof(cmd), fmt, ap);",5a6ebc53325719b0f59e24f31e308667e9337289bfbc55...,2.0.19,FF1019,https://cwe.mitre.org/data/definitions/134.html
1,data/projects/contiki-2.4/tools/tunslip.c,442,10,4,4,shell,system,This causes a new program to execute and is di...,try using a library call that implements the s...,,CWE-78,return system(cmd);,f3f1fc4af80877aebfc643d7ec0da9b9e96eccff3cad56...,2.0.19,FF1044,https://cwe.mitre.org/data/definitions/78.html
2,data/projects/contiki-2.4/tools/tunslip.c,753,3,4,4,buffer,strcat,Does not check for buffer overflows when conca...,"Consider using strcat_s, strncat, strlcat, or ...",,CWE-120,"strcat(t, dev);",901ebb10ee39a0103c1035a30e7a1a1f2db2bf8d9bf07e...,2.0.19,FF1005,https://cwe.mitre.org/data/definitions/120.html


## Grepping functions from the vulnerability context of the file.

In [21]:
# Function Under Construction:
def srcML_funs(file):
    """ finds function blocks of the given file
    """ 
    fun_ptn = "string(//src:function)"
    funblk_ptn = "string((//src:function/src:name))"
    # file_ptn = "string(//src:unit/@filename)" 

    # cmd = sub.Popen(["srcml", "--xpath", fun_ptn, file], stderr=sub.STDOUT)
    # out, err = cmd.communicate()
    cmd = ["srcml", "--xpath", funblk_ptn, xml_file]
    process = sub.Popen(cmd, stderr=sub.STDOUT)
    return process

file = '../' + df.File[0]
xml_file = "a.cpp.xml"

file_str = ''
with open(file) as f:
    file_str = f.read()

## Find the metrics of the given file

In [22]:
import itertools

# Without checking at CSV output of flawfinder. 
# def find_metrics(source_file):
#     """ split the given file into a list of function blocks and return their metrics into a dataframe
#     """ 
#     df = pd.DataFrame()
#     with open(source_file, "r") as fp:  
#         liz_file = lizard.analyze_file.analyze_source_code(source_file,  fp.read())
        
#         for x in range(len(liz_file.function_list)):
#             fun_metrics = liz_file.function_list[x].__dict__
#             start=fun_metrics['start_line']
#             end=fun_metrics['end_line']
            
#             fp.seek(0) # move header to the initial point of the file
#             df_fun = pd.DataFrame.from_dict(fun_metrics)
#             lines = [line for line in itertools.islice(fp, start, end)]
#             df_fun['code'] =  fun_metrics['long_name'] + ''.join(lines) 
#             df = pd.concat([df, df_fun])
            
#     # <guru> I think there is a problem in lizard detecting the correct full_parameters 
#     # either we have to concatenate two lines of full_parameters or ignore it and take it from long_name if needed. 
#     # drop['full_parameters', 'fan_in', 'fan_out', 'general_fan_out'] because lizard has not properly implemented these parameters yet.
#     cols_filter = ['full_parameters', 'fan_in', 'fan_out', 'general_fan_out']
#     df = df.drop(cols_filter, axis=1).drop_duplicates().reset_index(drop=True)
#     print('Shape of the dataframe: ', df.shape)
#     return df

# df_met = find_metrics(file)
# df_met.head(5)

# Fetching the functions which have given line context/statement.

In [23]:
import itertools
    
    
def file2metrics(source_file, df_flaw):
    """ split the given file into a list of function blocks and return their metrics into a dataframe
    """ 
    lines = list(set(list(df_flaw.Line)))
    df = pd.DataFrame()
    with open(source_file, "r") as fp:  
        liz_file = lizard.analyze_file.analyze_source_code(source_file,  fp.read())
        
        for x in range(len(liz_file.function_list)):
            
            fun_metrics = liz_file.function_list[x].__dict__
            df_fun = pd.DataFrame()
            df_fun = pd.DataFrame.from_dict(fun_metrics)
            
            start = int(fun_metrics['start_line'])
            end = int(fun_metrics['end_line']) 
            fp.seek(0) # move header to the initial point of the file

            vul_statement, cwe, vul_bool = '', [], False
            
            for l in lines:      
                code_lines = [line for line in itertools.islice(fp, start, end)]
                df_fun['code'] =  fun_metrics['long_name'] + ''.join(code_lines) 
                
                # check if the vulnerability content/statement appear in the function block or not.
                if start <= l <= end:
                    vul_bool =  True 
                    # vul_statement = vul_statement + ' \n ' +  df_flaw[df_flaw.Line==l]['Context'].values[0]
                    vul_statement = df_flaw[df_flaw.Line==l]['Context'].values[0]
                    vul_type = df_flaw[df_flaw.Line==l]['CWEs'].values[0]
                    cwe.append((vul_type, vul_statement))

                df_fun['CWEs'] = str(cwe)
                # df_fun['vul_statements'] = vul_statement
                
            df_fun['is_vul'] = vul_bool  
            df = pd.concat([df, df_fun])
            
    # <guru> I think there is a problem in lizard detecting the correct full_parameters 
    # either we have to concatenate two lines of full_parameters or ignore it and take it from long_name if needed. 
    # drop['full_parameters', 'fan_in', 'fan_out', 'general_fan_out'] because lizard has not properly 
    # implemented these parameters yet.
    
    cols_filter = ['full_parameters', 'fan_in', 'fan_out', 'general_fan_out']
    df = df.drop(cols_filter, axis=1).drop_duplicates().reset_index(drop=True)
    print('Shape of the dataframe: ', df.shape)
    return df


df_met = file2metrics(file, df)
df_met.head(3)

Shape of the dataframe:  (20, 12)


Unnamed: 0,cyclomatic_complexity,nloc,token_count,name,long_name,start_line,end_line,filename,top_nesting_level,code,CWEs,is_vul
0,3,43,336,relay_dhcp_to_server,"relay_dhcp_to_server( struct ip * ip , int len)",162,224,../data/projects/contiki-2.4/tools/tunslip.c,0,"relay_dhcp_to_server( struct ip * ip , int len)","[('CWE-120', ' memcpy(&m, inm, DHCP_BASE_LEN)...",True
1,14,91,725,relay_dhcp_to_client,relay_dhcp_to_client( int slipfd),229,338,../data/projects/contiki-2.4/tools/tunslip.c,0,relay_dhcp_to_client( int slipfd),"[('CWE-120', ' memcpy(pkt.m.options, inm.opti...",True
2,5,18,118,ip4sum,"ip4sum( u_int16_t sum , const void * _p , u_in...",344,362,../data/projects/contiki-2.4/tools/tunslip.c,0,"ip4sum( u_int16_t sum , const void * _p , u_in...",[],False


# Parsing CppCheck output:

In [24]:
import csv
from lxml import etree
import pandas as pd
import subprocess as sub


def xml2df(xml):
    """ convert xml file generated by the CppCheck tool to dataframe
    """
    df_loc = pd.read_xml(xml, encoding='utf-8', xpath='./errors/error/location')
    df_err = pd.read_xml(xml, encoding='utf-8', xpath='./errors/error')

    assert df_err['file0'].equals(df_loc['file']), 'Error! dataframe and nested location dataframe are not equals.'
    # add location metrics to the table
    df_err[['file', 'line', 'column', 'info']] = df_loc
    df_err = df_err.drop('file0', axis=1).drop_duplicates().reset_index(drop=True)
    return df_err


def cppcheck_flaws(file_or_dir):
    """ find flaws ini the file using CppCheck tool
    """
    cmd = 'cppcheck --xml ' + file_or_dir  
    process = sub.Popen(cmd,  shell=True, stdout=sub.PIPE)
    output = process.stdout.read()
    return output # xml2df(output)

# !cppcheck --template=gcc ../data/projects/contiki-2.4/apps/ 2> err.txt
# !cppcheck --template="{file}; {line}; {severity}; {message}; {code}" 
# --template-location=" {file};{line}; {info};{code}\n" ../data/projects/contiki-2.4/apps/ 2> err.txt


chk_dir = '../data/projects/contiki-2.4/apps/'
df = cppcheck_flaws(chk_dir)

chk_dir = '../data/projects/contiki-2.4/apps/'
xml = cppcheck_flaws(chk_dir)
# xtree = et.parse("students.xml")
# xroot = xtree.getroot()


<?xml version="1.0" encoding="UTF-8"?>
<results version="2">
    <cppcheck version="2.9"/>
    <errors>
        <error id="wrongPrintfScanfArgNum" severity="error" msg="sprintf format string requires 6 parameters but only 3 are given." verbose="sprintf format string requires 6 parameters but only 3 are given." cwe="685" file0="../data/projects/contiki-2.4/apps/ftp/ftpc.c">
            <location file="../data/projects/contiki-2.4/apps/ftp/ftpc.c" line="288" column="11"/>
        </error>
        <error id="overlappingWriteFunction" severity="error" msg="Overlapping read/write in memcpy() is undefined behavior" verbose="Overlapping read/write in memcpy() is undefined behavior" file0="../data/projects/contiki-2.4/apps/irc/irc.c">
            <location file="../data/projects/contiki-2.4/apps/irc/irc.c" line="129" column="3"/>
        </error>
        <error id="overlappingWriteFunction" severity="error" msg="Overlapping read/write in memcpy() is undefined behavior" verbose="Overlapping rea

In [25]:
def fetch_location(err):
    """ get locations of all the error list generated by CppCeck
    """ 
    dt_loc = {'file': [], 'line':[], 'column':[], 'info':[]}
    for loc in err.findall('location'): 
        for key, val in (loc.attrib).items():
            dt_loc[key].append(val)
    return dt_loc 
            
def xml2df(file):
    xtree = et.fromstring(open('err.xml').read())
    df = pd.DataFrame()
    for errors in xtree.findall('.//errors'):
        for err in errors.findall('error'):
            dt_err = err.attrib
            dt_err.update(fetch_location(err))
            df = pd.concat([df, pd.DataFrame([dt_err])], ignore_index=True).drop(columns=['file'], axis=1)
    return df.rename(columns={'file0':'file'})

df_flaw = xml2df('err.xml')
df_flaw.to_csv('contiki24_cppcheck.csv')

In [28]:
import itertools
    
    
def file2metrics(source_file, df_flaw, tool='cppcheck'):
    """ split the given file into a list of function blocks and return their metrics into a dataframe
    """ 

    if tool=='flawfinder':
        lines = list(set(list(df_flaw.Line)))
    elif tool== 'cppcheck':
        df_flaw = df_flaw.rename(columns={'cwe':'CWEs', 'line': 'Line'})
        lines = list(set(list(df_flaw.Line[0])))
    else:
        print('Please select a valid tool!')
        exit(1)
    
    lines = [eval(l) for l in lines] # python default casting to integer
    df = pd.DataFrame()
    
    with open(source_file, "r") as fp:  
        liz_file = lizard.analyze_file.analyze_source_code(source_file,  fp.read())
        
        for ifun in range(len(liz_file.function_list)):
            # print(liz_file.function_list[ifun].__dict__)
            fun_metrics = liz_file.function_list[ifun].__dict__
            df_fun = pd.DataFrame.from_dict(fun_metrics)
            
            start = int(fun_metrics['start_line'])
            end = int(fun_metrics['end_line']) 
            fp.seek(0) # move header to the initial point of the file

            vul_statement, cwe, vul_bool = '', [], False
            
            # check if any of the lines of the file belong to any functions
            for l in lines:
                fun_block = [line for line in itertools.islice(fp, start, end)]
                df_fun['code'] =  fun_metrics['long_name'] + ''.join(fun_block) 
                
                
                # check if the vulnerability content/statement appear in the function block or not.
                if start <= l <= end:
                    vul_statement = df_flaw[df_flaw.Line==l]['Context'].values[0]
                    vul_type = df_flaw[df_flaw.Line==l]['CWEs'].values[0]
                    cwe.append((vul_type, vul_statement))

                df_fun['CWEs'] = str(cwe)
                
            df_fun['is_vul'] = vul_bool  
            df = pd.concat([df, df_fun])
            return df

    # <guru> I think there is a problem in lizard detecting the correct full_parameters 
    # either we have to concatenate two lines of full_parameters or ignore it and take it from long_name if needed. 
    # drop['full_parameters', 'fan_in', 'fan_out', 'general_fan_out'] because lizard has not properly 
    # implemented these parameters yet.
    
    cols_filter = ['full_parameters', 'fan_in', 'fan_out', 'general_fan_out']
    df = df.drop(cols_filter, axis=1).drop_duplicates().reset_index(drop=True)
    print('Shape of the dataframe: ', df.shape)
    return df

file2metrics(df_flaw.file[0], df_flaw, tool='cppcheck')

df_prj = pd.DataFrame()

for i in range(len(df_flaw)):
    df_file = file2metrics(df_flaw.file[i], df_flaw, tool='cppcheck')
    df_prj = pd.concat([df_prj, df_file])

df_prj.reset_index(drop=True)


Unnamed: 0,cyclomatic_complexity,nloc,token_count,name,long_name,start_line,end_line,full_parameters,filename,top_nesting_level,fan_in,fan_out,general_fan_out,code,CWEs,is_vul
0,1,4,12,ftpc_init,ftpc_init( void),123,127,void,../data/projects/contiki-2.4/apps/ftp/ftpc.c,0,0,0,0,ftpc_init( void){\n memb_init(&connections);\...,[],False
1,1,8,34,quit,quit( void),105,112,void,../data/projects/contiki-2.4/apps/irc/irc.c,0,0,0,0,quit( void){\n ctk_window_close(&window);\n ...,[],False
2,1,8,34,quit,quit( void),105,112,void,../data/projects/contiki-2.4/apps/irc/irc.c,0,0,0,0,quit( void){\n ctk_window_close(&window);\n ...,[],False
3,13,53,294,PROCESS_THREAD,"PROCESS_THREAD( shell_blink_process , ev , data)",55,115,shell_blink_process,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"PROCESS_THREAD( shell_blink_process , ev , dat...",[],False
4,13,53,294,PROCESS_THREAD,"PROCESS_THREAD( shell_blink_process , ev , data)",55,115,ev,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"PROCESS_THREAD( shell_blink_process , ev , dat...",[],False
5,13,53,294,PROCESS_THREAD,"PROCESS_THREAD( shell_blink_process , ev , data)",55,115,data,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"PROCESS_THREAD( shell_blink_process , ev , dat...",[],False
6,5,18,103,write_chunk,"write_chunk( struct rucb_conn * c , int offset...",81,99,struct rucb_conn * c,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"write_chunk( struct rucb_conn * c , int offset...",[],False
7,5,18,103,write_chunk,"write_chunk( struct rucb_conn * c , int offset...",81,99,int offset,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"write_chunk( struct rucb_conn * c , int offset...",[],False
8,5,18,103,write_chunk,"write_chunk( struct rucb_conn * c , int offset...",81,99,int flag,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"write_chunk( struct rucb_conn * c , int offset...",[],False
9,5,18,103,write_chunk,"write_chunk( struct rucb_conn * c , int offset...",81,99,char * data,../data/projects/contiki-2.4/apps/shell/shell-...,0,0,0,0,"write_chunk( struct rucb_conn * c , int offset...",[],False
