# Statistics on the output data. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import lizard
import subprocess as sub
from pylibsrcml import srcml
import os
import re 
import xml.etree.ElementTree as et 
import warnings
warnings.filterwarnings("ignore")


df = pd.read_csv('../data/contiki-master_flaw.csv')
dfm = pd.read_csv('../data/contiki-master_metrics.csv')

if not os.path.exists("figure"):
    os.mkdir("figure")
df.columns

Index(['Unnamed: 0', 'file', 'line', 'column', 'category', 'name', 'msg',
       'note', 'cwe', 'context', 'helpuri', 'tool', 'defaultlevel', 'level',
       'suggestion', 'fingerprint', 'toolversion', 'ruleid'],
      dtype='object')

In [None]:
dfp = df[['category', 'name', 'cwe']].groupby(['category', 'name', 'cwe'], group_keys=False).size().reset_index(name='count')
fig = px.sunburst(dfp, path=['category', 'name', 'cwe'], values='count')
# fig.write_image("figure/vul_statistics.pdf")
fig.show()
dfp.head()

# Analysis of IoTvulCode tool generated output for feeding non-vul statements:


In [None]:
df.nunique()

In [None]:
df.context.str.len().sort_values(ascending=False).reset_index(drop=True).plot(kind='box')

In [None]:

import random

def filter_unusual_statements(statements, min_len, max_len):
    """" 
    # https://peps.python.org/pep-0007/
    # https://www.python.org/dev/peps/pep-0007/ 
    """
    filtered_statements = []
    for statement in statements:
        if len(statement) >= min_len and len(statement) <= max_len:
            filtered_statements.append(statement)
   
    # take size of the vul statements as basis
    df = pd.Series(statements)   
    df = df.str.len()
    
    stat_sizes =  list(df[df.between(max_len, min_len)].reset_index(drop=True))
    random.seed(0)
    stat_sizes = random.sample(set(stat_sizes), 5)
    return stat_sizes


# standard variables:
max_len = 79
# for min_len (eg, 7 characters)
min_len = df.context.str.len().sort_values(ascending=False).reset_index(drop=True).min()


lines = dfm.code[0].splitlines()
stat_sizes = filter_unusual_statements(lines, max_len, min_len)
stat_sizes

In [None]:
def get_benign_context(row):
    """
    filter all lines if it is less than min threshold
    randomly suffled lines
    """ 
    df = pd.DataFrame()
    lines = [x for x in enumerate(row['code'].splitlines()) if len(x[1]) > 7]
    # lines = [(x[0], str(x).strip()) for x in lines]
    
    random.seed(0)
    lines = random.sample(population=lines, 
                        k=int(len(lines)/2))

    # TODO: remove the ambiguous vul line from the 'benign' lines if present
    # vul_line = df.line[i]
    # lines = [x for x in lines if x[0]!=vul_line]

    ## convert it to dataframe and add additional columns
    df = pd.DataFrame(lines, columns=['line', 'context'])
    # remove leading and trailing whitespace
    df['context'] = df['context'].apply(lambda x: re.sub(r'\s+', ' ', x).strip()) 
    df['cwe'] = 'benign'
    df['tool'] = 'sampling'
    df['file'] = row['filename']
    line_col = df['line'].astype(int) + int(row['start_line'])
    
    max_line = max(list(line_col)) if list(line_col) else 0
    end_line = int(row['end_line'])
    
    # print(f"max of lines: {max_line} and end_line: {end_line}")
    assert max_line <= end_line, "Line number shouldn't exceed function length!"
    df['line'] = line_col
    return df

def drop_rows(df):
    """ applied several filters to the dataframe """
    df['context'] = df['context'].apply(lambda x: re.sub(r'\s+', ' ', x).strip()) 
    len_s0 = len(df)
    # Step 1: drop duplicates from all rows
    df = df.drop_duplicates(subset=['cwe', 'context']).reset_index(drop=True)
    len_s1 = len(df)
    print(f'{len_s0-len_s1} duplicate samples were dropped from {len_s0} samples.')

    # Step 2: drop duplicates from ambiguous rows on context column 
    ## (keeping only a first occurrence, i.e, vul/cwe sample)
    df = df.sort_values(by='cwe', ascending=True).drop_duplicates(subset='context', keep='first').reset_index(drop=True)
    len_s2 = len(df)
    print(f'{len_s1-len_s2} ambiquous samples were dropped from {len_s1} samples.')
    return df

def gen_benign(dfm):
    """create benign samples to the dataframe """
    print('-'*50)
    print('#samples: ', len(dfm))
    print('Generating benign samples...')
    df_fun = pd.DataFrame()
    for i in range(len(dfm)):
        df_get = get_benign_context(dict(dfm.iloc[i]))
        df_fun = df_fun.append(df_get).reset_index(drop=True)
    print('-'*50)
    print('#benign samples generated: ', len(df_fun))
    return df_fun


df_fun = gen_benign(dfm)
df = df.append(df_fun).reset_index(drop=True)

df = drop_rows(df) # mutates df
df.head(5)

In [None]:
def save_binary(filename, dfs):
    """ save a dataframe to a binary file """
    dfs['isMalicious'] = dfs['cwe'].apply(lambda x: 1 if x !='benign' else 0)
    dfs[['context', 'isMalicious']].to_csv(filename, index=False)
    return dfs[['context', 'isMalicious']]

dfs = save_binary('../data/contiki-master-binary.csv', df)
dfs

In [2]:
dfm = pd.read_csv('../data/iDetect_refine/DNN_Binary.csv')
dfcon = pd.read_csv('../data/contiki-master-binary.csv')
dfm

Unnamed: 0,code,isMalicious
0,( strlen ( me ),1
1,*pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer...,1
2,*pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuff...,1
3,ActualFreq 1 = cc1ComputeFreq ( DesiredFreq1 ) ;,1
4,ActualFreq = cc1000ComputeFreq ( DesiredFreq ) ;,1
...,...,...
4715,"xResultr = xClass ( xSession , ( char * ) pxLa...",1
4716,"xResultt = xClass ( xSession , ( char * ) pxLa...",1
4717,"xResultw = xClass ( xSession , ( char * ) pxLa...",1
4718,"xResultx = C_Create ( xSessionx, ( A ) & D, si...",1


In [3]:
dfm.isMalicious.value_counts()

1    3067
0    1653
Name: isMalicious, dtype: int64

In [4]:
dfcon.isMalicious.value_counts()

0    30866
1     2023
Name: isMalicious, dtype: int64