# Statistics on the output data. 

In [2]:
import os
import subprocess as sub

import lizard
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from pylibsrcml import srcml

# Load the analysis output that the rest of the notebook summarises.
df = pd.read_csv('../data/output-tunslip.csv')

# Directory for exported figures; exist_ok keeps the cell safe to re-run and
# avoids the separate exists()/mkdir() check.
os.makedirs("figure", exist_ok=True)

df.columns

Index(['File', 'Line', 'Column', 'DefaultLevel', 'Level', 'Category', 'Name',
       'ToolVersion', 'RuleId', 'HelpUri'],
      dtype='object')

In [3]:
# Count the findings for every (Category, Name, CWEs) combination.
hierarchy = ['Category', 'Name', 'CWEs']
grouped = df[hierarchy].groupby(hierarchy, group_keys=False)
dfp = grouped.size().reset_index(name='Count')
print(dfp)

# Sunburst chart: rings follow the Category -> Name -> CWEs hierarchy and
# each sector is sized by the number of findings.
fig = px.sunburst(dfp, path=hierarchy, values='Count')
# fig.write_image("figure/vul_statistics.pdf")
fig.show()
df.head(3)

    Category       Name              CWEs  Count
0     buffer       char  CWE-119!/CWE-120      6
1     buffer     getopt   CWE-120, CWE-20      1
2     buffer     memcpy           CWE-120     11
3     buffer       read   CWE-120, CWE-20      1
4     buffer     strcat           CWE-120      1
5     buffer     strcpy           CWE-120      4
6     buffer    strncpy           CWE-120      1
7     format  vsnprintf           CWE-134      1
8    integer       atoi           CWE-190      2
9       misc       open           CWE-362      2
10  obsolete     usleep           CWE-676      1
11     shell     system            CWE-78      1


Unnamed: 0,File,Line,Column,DefaultLevel,Level,Category,Name,Warning,Suggestion,Note,CWEs,Context,Fingerprint,ToolVersion,RuleId,HelpUri
0,data/projects/contiki-2.4/tools/tunslip.c,438,3,4,4,format,vsnprintf,If format strings can be influenced by an atta...,Use a constant for the format specification.,,CWE-134,"vsnprintf(cmd, sizeof(cmd), fmt, ap);",5a6ebc53325719b0f59e24f31e308667e9337289bfbc55...,2.0.19,FF1019,https://cwe.mitre.org/data/definitions/134.html
1,data/projects/contiki-2.4/tools/tunslip.c,442,10,4,4,shell,system,This causes a new program to execute and is di...,try using a library call that implements the s...,,CWE-78,return system(cmd);,f3f1fc4af80877aebfc643d7ec0da9b9e96eccff3cad56...,2.0.19,FF1044,https://cwe.mitre.org/data/definitions/78.html
2,data/projects/contiki-2.4/tools/tunslip.c,753,3,4,4,buffer,strcat,Does not check for buffer overflows when conca...,"Consider using strcat_s, strncat, strlcat, or ...",,CWE-120,"strcat(t, dev);",901ebb10ee39a0103c1035a30e7a1a1f2db2bf8d9bf07e...,2.0.19,FF1005,https://cwe.mitre.org/data/definitions/120.html


## Extracting functions from the source file flagged in the vulnerability report.

In [45]:
# Path of the source file named in the first row of the report, resolved
# relative to this notebook's directory.
file = '../' + df['File'][0]
# Name of the srcML XML file used by the following cells.
xml_file = "a.cpp.xml"

# Read the complete source file into a single string.
file_str = ''
with open(file) as src_fp:
    file_str = src_fp.read()

In [5]:
# Run pylibsrcml on the source file and write the result to `xml_file`
# (presumably the srcML XML representation of the file -- confirm against the
# pylibsrcml documentation). Using the variable instead of repeating the
# literal keeps this cell in sync with the XPath query that reads `xml_file`.
srcml.srcml(file, xml_file)

In [37]:
# XPath expressions evaluated by the srcml command line tool.
fun_ptn = "string(//src:function)"
# NOTE(review): XPath string() applied to a node-set yields the text of the
# first node only, so this presumably returns a single function name -- confirm.
funblk_ptn = "string((//src:function/src:name))"
# file_ptn = "string(//src:unit/@filename)"

cmd = ["srcml", "--xpath", funblk_ptn, xml_file]

# Capture stdout (with stderr merged into it) and wait for the child to exit;
# previously the process was started but never waited on and its output was
# not captured.
process = sub.Popen(cmd, stdout=sub.PIPE, stderr=sub.STDOUT, text=True)
out, _ = process.communicate()
print(out)

relay_dhcp_to_server


In [50]:



# Dump every attribute of the lizard file-information object.
# NOTE(review): `liz_file` is not assigned in any cell visible above this one;
# it presumably comes from a cell that was removed or executed out of order --
# confirm the notebook survives Restart & Run All.
print(vars(liz_file))

{'filename': 'file1.c', 'nloc': 788, 'function_list': [<lizard.FunctionInfo object at 0x7fcf0c37d5b0>, <lizard.FunctionInfo object at 0x7fcf0c37d280>, <lizard.FunctionInfo object at 0x7fcf08753c40>, <lizard.FunctionInfo object at 0x7fcf0c32df40>, <lizard.FunctionInfo object at 0x7fcf0581b610>, <lizard.FunctionInfo object at 0x7fcf0c2edf10>, <lizard.FunctionInfo object at 0x7fcf0c2edfd0>, <lizard.FunctionInfo object at 0x7fcf0c2edfa0>, <lizard.FunctionInfo object at 0x7fcf08333790>, <lizard.FunctionInfo object at 0x7fcf088a7850>, <lizard.FunctionInfo object at 0x7fcf0c37d970>, <lizard.FunctionInfo object at 0x7fcf0c37d430>, <lizard.FunctionInfo object at 0x7fcf0c2ede50>, <lizard.FunctionInfo object at 0x7fcf0c32d3a0>, <lizard.FunctionInfo object at 0x7fcf0c32dbb0>, <lizard.FunctionInfo object at 0x7fcf0c32d430>, <lizard.FunctionInfo object at 0x7fcf0c32d940>, <lizard.FunctionInfo object at 0x7fcf0c32d7f0>, <lizard.FunctionInfo object at 0x7fcf0c32d0a0>, <lizard.FunctionInfo object at 0x

In [243]:
import itertools

def find_metrics(source_file):
    """Collect lizard's per-function metrics of ``source_file`` in a DataFrame.

    Parameters
    ----------
    source_file : str
        Path of the source file to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per function reported by lizard. The columns are the
        attributes of lizard's function-info object, without
        ``full_parameters``, ``fan_in``, ``fan_out`` and ``general_fan_out``,
        plus a ``code`` column holding ``long_name`` followed by the source
        lines of the function. Exact duplicate rows are removed and the index
        is renumbered from 0.
    """
    # <guru> lizard does not seem to detect full_parameters correctly (it would
    # have to be stitched together from two lines, or taken from long_name),
    # and fan_in / fan_out / general_fan_out are not properly implemented in
    # lizard so far, so none of these attributes is kept.
    unreliable_cols = ('full_parameters', 'fan_in', 'fan_out', 'general_fan_out')

    # Read the file once; every function body is sliced from this list instead
    # of rewinding and re-scanning the file for each function.
    with open(source_file, "r") as fp:
        source_lines = fp.readlines()

    liz_file = lizard.analyze_file.analyze_source_code(
        source_file, ''.join(source_lines)
    )

    # One plain record per function. Feeding the raw attribute dict to
    # DataFrame.from_dict broadcast the scalars over the full_parameters list:
    # one row per parameter, and no row at all for an empty parameter list.
    records = []
    for fun_info in liz_file.function_list:
        fun_metrics = vars(fun_info)
        record = {
            key: value
            for key, value in fun_metrics.items()
            if key not in unreliable_cols
        }
        start = fun_metrics['start_line']
        end = fun_metrics['end_line']
        # Slicing from index `start` skips the line numbered start_line when
        # lizard's line numbers are 1-based; long_name stands in for it.
        # NOTE(review): assumes the signature sits on a single line with the
        # opening brace on the following line -- TODO confirm for other styles.
        record['code'] = fun_metrics['long_name'] + ''.join(source_lines[start:end])
        records.append(record)

    # errors="ignore" keeps the call valid when there are no functions at all.
    df = (
        pd.DataFrame(records)
        .drop(columns=list(unreliable_cols), errors="ignore")
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print('Shape of the dataframe: ', df.shape)
    return df

# Build the per-function metrics table for the selected source file.
# Note: this rebinds `df`, which held the CSV report loaded at the top.
df = find_metrics(file)
# Preview the first five rows (the default of head()).
df.head()

Shape of the dataframe:  (20, 10)


Unnamed: 0,cyclomatic_complexity,nloc,token_count,name,long_name,start_line,end_line,filename,top_nesting_level,code
0,3,43,336,relay_dhcp_to_server,"relay_dhcp_to_server( struct ip * ip , int len)",162,224,../data/projects/contiki-2.4/tools/tunslip.c,0,"relay_dhcp_to_server( struct ip * ip , int len..."
1,14,91,725,relay_dhcp_to_client,relay_dhcp_to_client( int slipfd),229,338,../data/projects/contiki-2.4/tools/tunslip.c,0,relay_dhcp_to_client( int slipfd){\n struct d...
2,5,18,118,ip4sum,"ip4sum( u_int16_t sum , const void * _p , u_in...",344,362,../data/projects/contiki-2.4/tools/tunslip.c,0,"ip4sum( u_int16_t sum , const void * _p , u_in..."
3,15,34,256,check_ip,"check_ip( const struct ip * ip , unsigned ip_len)",365,413,../data/projects/contiki-2.4/tools/tunslip.c,0,"check_ip( const struct ip * ip , unsigned ip_l..."
4,8,12,90,is_sensible_string,"is_sensible_string( const unsigned char * s , ...",416,427,../data/projects/contiki-2.4/tools/tunslip.c,0,"is_sensible_string( const unsigned char * s , ..."


In [216]:
# Keep only the first row for each distinct `code` value and renumber the index.
df_clean = (
    df
    .drop_duplicates(subset=['code'], keep='first')
    .reset_index(drop=True)
)
# Show the source text stored for the first remaining function.
print(df_clean['code'][0])

relay_dhcp_to_server( struct ip * ip , int len){
  struct dhcp_light_msg *inm;
  struct dhcp_msg m;
  int n;
  u_int8_t *optptr;
  
  inm = (void*)(((u_int8_t*)ip) + 20 + 8); /* Skip over IP&UDP headers. */

  if (inm->op != BOOTREQUEST) {
    return;
  }

  inm->flags = ntohs(BOOTP_BROADCAST);

  memcpy(&m, inm, DHCP_BASE_LEN);
  memset(&m.sname, 0x0, DHCP_HOLE_LEN);
  memcpy(&m.options, &inm->options, len - 20 - 8 - DHCP_BASE_LEN);
  n = (len - 20 - 8) + DHCP_HOLE_LEN; /* +HOLE -IP&UDP headers. */

  /*
   * Ideally we would like to use the Relay Agent information option
   * (RFC3046) together with the Link Selection sub-option (RFC3527)
   * to ensure that addresses are allocated for this
   * subnet. Unfortunately ISC-DHCPD does not currently implement
   * RFC3527 and some other mechanism must be used. For this reason
   * this implementation in addition uses the DHCP option for subnet
   * selection (RFC3011) which is really not intended to be used by
   * relays.
   *
   * Find