## Queries on CVE records for the extraction of IoT related referenced repositories:

In [None]:
import collections
import pandas as pd
from matplotlib import pyplot as plt
import json 
import ast
import re
import os
import csv
import subprocess
import requests
import tempfile
from io import BytesIO, StringIO
from zipfile import ZipFile
from guesslang import Guess

In [None]:
# Load the CVE records from CSV into a DataFrame.
df = pd.read_csv('../data/cve-records.csv')
# Keep the 'description' cell stored under index label 0 as a sample value.
des_str = df['description'][0]

### Search Query: 
"Internet of Things" OR "IoT" OR "Industry 4.0" OR "smart cities" OR "smart city" OR "smart contract" OR "manufacturing" OR "energy" OR "supply chain"

In [None]:
def get_description(des_str):
    """Concatenate the 'value' fields of a serialized CVE description.

    Args:
        des_str: String containing a Python-literal list of dicts, each
            with a 'value' key.

    Returns:
        The 'value' strings joined in order. An empty string is returned
        (after printing a notice) when the input is not a string (None,
        NaN, ...) or is blank.

    Raises:
        ValueError, SyntaxError: If a non-blank string is not a valid
            Python literal (propagated from ``ast.literal_eval``).
        KeyError: If an entry has no 'value' key.
    """
    # The previous guard `des_str != None or des_str != ''` was always true,
    # so missing descriptions crashed inside ast.literal_eval instead of
    # reaching the empty branch.  An empty string (rather than 0) is returned
    # so callers can keep treating the result as text.
    if not isinstance(des_str, str) or not des_str.strip():
        print('Empty description for CVE: ')
        return ''

    des_arr_dict = ast.literal_eval(des_str)
    return ''.join(dic['value'] for dic in des_arr_dict)

def get_iot_cves(df):
    """Return the ids of CVEs whose description mentions an IoT keyword.

    Matching is a case-insensitive substring test of each keyword against
    the text produced by ``get_description``.

    Args:
        df: DataFrame with 'cve_id' and 'description' columns.

    Returns:
        List of 'cve_id' values, one entry per matching row, in row order.
    """
    iot_set = ["Internet of Things", "IoT", "Industry 4.0",
               "smart cities", "smart city", "smart contract",
               "manufacturing", "energy", "supply chain", "orange pi", "banana pi", "arduino"]
    # Lower-case the keywords once instead of once per row.
    keywords = [keyword.lower() for keyword in iot_set]
    iot_cves = []

    # Walk both columns positionally so the scan does not depend on the
    # frame having a default 0..n-1 index.
    for cve_id, raw_description in zip(df['cve_id'], df['description']):
        des_cve = get_description(raw_description)
        if not isinstance(des_cve, str):
            # No usable description text for this row.
            continue

        text = des_cve.lower()
        # any() stops at the first hit, so a CVE matching several keywords is
        # recorded once; previously it was appended once per keyword, which
        # inflated the reported count.
        # NOTE(review): plain substring matching also hits short keywords
        # inside longer words (e.g. "iot" in "patriot") — confirm acceptable.
        if any(keyword in text for keyword in keywords):
            iot_cves.append(cve_id)
    return iot_cves

# Collect the ids of IoT-related CVEs and print the length of the returned list.
iot_cves = get_iot_cves(df)
print('count_cves:', len(iot_cves))

In [None]:
# Keep only the rows whose cve_id appears in iot_cves.
df_iot = df[df.cve_id.isin(iot_cves)]
# Number of rows kept.
len(df_iot)

In [None]:
iot_vcs = ['github', 'bitbucket', 'gitlab']
vcs_list = []

# Each reference_json cell is a Python-literal list of dicts; gather every
# 'url' entry across all IoT-related CVEs into one flat list.
for ref_str in df_iot.reference_json:
    url_dict = ast.literal_eval(ref_str)
    # Extending with nothing is a no-op, so empty lists need no special case.
    vcs_list.extend(entry['url'] for entry in url_dict)

## Vulnerability reporting databases and number of their occurrences in CVEs

In [None]:
# NOTE(review): `url_heads` is not assigned anywhere in the visible notebook;
# presumably it holds the host part of each URL in `vcs_list` — confirm it is
# defined before this cell runs.
# Count how often each entry of url_heads occurs.
url_freq = collections.Counter(url_heads)
df_url = pd.DataFrame(url_freq.items(), columns=['urls', 'count'])
# Most frequent entries first.
df_url = df_url.sort_values(by=['count'], ascending=False)
# Export as a semicolon-separated file without the index column.
df_url.to_csv('../result/top-databases.csv', index=False, sep=';')
df_url.head(5)

# Analysis of Infer output (report.json) file:

In [None]:
import pandas as pd
import json
import os
import subprocess as sub
import time 

############################ Applying infer tool ############################

def json2df(file) -> pd.DataFrame:
    """Read the JSON document stored at `file` and wrap it in a DataFrame."""
    with open(file) as handle:
        parsed = json.load(handle)
    return pd.DataFrame(parsed)

def apply_infer(fname) -> pd.DataFrame:
    """Run the infer tool on one source file and return its report.

    The file is compiled with gcc under ``infer run``; the resulting
    'infer-output/report.json' is loaded with ``json2df``.

    Args:
        fname: Path of the source file to analyse.

    Returns:
        DataFrame built from report.json with an extra 'tool' column set
        to "infer" when the report is non-empty. An empty DataFrame is
        returned when `fname` is not an existing file, the infer
        executable cannot be started, or no report file exists.
    """
    infer_dir = 'infer-output'
    compiler = 'gcc'

    # Argument list executed without a shell: a path containing spaces or
    # shell metacharacters reaches the compiler verbatim instead of being
    # interpreted as extra shell syntax.
    cmd = ['infer', 'run', '--results-dir', infer_dir, '--', compiler, '-c']
    out_file = f"{infer_dir}/report.json"  # output file generated by infer tool

    df = pd.DataFrame()  # dataframe to store the results

    if not os.path.isfile(fname):
        # The old message blamed the command; the real problem is the path.
        print(f"File not found: {fname}\n"
              f"Skipped command: {' '.join(cmd)} {fname}")
        return df

    try:
        # stdout was never read, so it is discarded rather than piped:
        # waiting on a process whose PIPE fills up can block forever.
        sub.run(cmd + [fname], stdout=sub.DEVNULL)
    except FileNotFoundError:
        # Without a shell, a missing executable raises instead of yielding
        # exit status 127; keep the "empty result" outcome.
        print("infer executable not found; is it installed and on PATH?")
        return df

    # check if the output file is generated
    if os.path.isfile(out_file):
        df = json2df(out_file)
    if len(df) != 0:
        df["tool"] = "infer"
    return df.reset_index(drop=True)

# fname = '../data/projects/contiki-2.4/tools/tunslip.c'
# Sample C source file passed to apply_infer below.
fname = '../data/projects/contiki-2.4/core/sys/timetable.c'

apply_infer(fname)

# Encoding target label 

In [None]:
import pickle 
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

def encode_multiclass(y):
    """Turn class labels into integer codes and persist the fitted encoder.

    The fitted LabelEncoder is pickled to 'classes.pkl' in the current
    working directory so the codes can be decoded later.
    """
    label_encoder = LabelEncoder().fit(y)
    codes = label_encoder.transform(y)
    with open('classes.pkl', 'wb') as pkl_file:
        pickle.dump(label_encoder, pkl_file)
    return codes

def decode_multiclass(encoded_y):
    """Map integer codes back to labels using the encoder in 'classes.pkl'."""
    with open('classes.pkl', 'rb') as pkl_file:
        label_encoder = pickle.load(pkl_file)
    return label_encoder.inverse_transform(encoded_y)

y = ['Benign', 'CWE-120', 'Benign', 'Benign', 'CWE-120', 'CWE-20', 'CWE-19']

# Binary view of the target: every non-'Benign' label becomes 'Vulnerable'.
y = ['Benign' if label == 'Benign' else 'Vulnerable' for label in y]

encoded_y = encode_multiclass(y)
print(y)
# Distinct integer codes produced by the encoder.
print(list(set(encoded_y)))
print(decode_multiclass([0, 1, 1]))

In [None]:
import numpy as np 

# Class distribution of the labels, computed with numpy and with pandas.
np.unique(list(y), return_counts=True)
# The top-level pd.value_counts() helper is deprecated; go through a Series,
# as the final cell of this notebook already does.
pd.Series(y).value_counts()

In [None]:
# Per-code ratio: number of samples divided by (2 * occurrences of each code).
# NOTE(review): the constant 2 presumably stands for the number of classes
# (balanced class weights) — confirm.
len(encoded_y)/(2*np.bincount(encoded_y))

In [None]:
def encode_multiclass_target(y):
    """Encode class labels as integers and persist the fitted encoder.

    The fitted LabelEncoder is pickled to '../data/classes.pkl' so that
    ``decode_multiclass_target`` can reverse the mapping.

    Args:
        y: Sequence of class labels.

    Returns:
        The integer codes for `y`. Previously the encoded values were
        computed and then dropped, so the function returned None; callers
        that ignore the return value are unaffected.
    """
    encoder = LabelEncoder()
    encoded_y = encoder.fit_transform(y)
    with open('../data/classes.pkl', 'wb') as f:
        pickle.dump(encoder, f)
    return encoded_y

def decode_multiclass_target(encoded_y):
    """Map integer codes back to labels using '../data/classes.pkl'."""
    with open('../data/classes.pkl', 'rb') as pkl_file:
        label_encoder = pickle.load(pkl_file)
    return label_encoder.inverse_transform(encoded_y)

# Fit and pickle the encoder on y, then decode three integer codes with it.
# NOTE(review): if `y` is still the two-class 'Benign'/'Vulnerable' list built
# in an earlier cell, code 2 is unknown to the encoder and inverse_transform
# will reject it — verify which `y` this cell is meant to run on.
encode_multiclass_target(y)
decode_multiclass_target([0, 1, 2])

# Checking if tokenizer is working fine!

In [None]:
from string import printable
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd 
import numpy as np


def tokenize_data(df, max_len):
    """Turn code snippets into fixed-length integer sequences.

    Each character found in ``string.printable`` is replaced by its
    1-based position in that string; other characters are dropped. The
    sequences are then passed through ``pad_sequences`` with
    ``maxlen=max_len``.

    Args:
        df: Object exposing a `code` collection of strings and a `label`
            collection (e.g. a DataFrame with those columns).
        max_len: Length of every output sequence.

    Returns:
        Tuple ``(X, target)``: the padded token array and ``df.label`` as
        a NumPy array.
    """
    # Build the character -> token table once; the previous per-character
    # `in printable` test plus `printable.index` each scanned the whole
    # alphabet for every character of every snippet.
    char_to_token = {char: position + 1 for position, char in enumerate(printable)}

    code_snippet_int_tokens = [
        [char_to_token[char] for char in code_snippet if char in char_to_token]
        for code_snippet in df.code]

    # Pad the sequences (left padded with zeros)
    # to the max length of the code snippet
    X = pad_sequences(code_snippet_int_tokens, maxlen=max_len)
    target = np.array(df.label)
    print(f"Shape of X: {X.shape}, Shape of y:{target.shape}")
    return X, target

# NOTE: this rebinds the notebook-level name `df` to the TinyVul dataset.
df = pd.read_csv('../data/TinyVul-v2-statement-multiclass.csv')
# Tokenize only the first 20 rows, with max_len=150.
tokenize_data(df.head(20), 150)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf

# One-hot encode the target labels
def encode_multiclass(y):
    """Fit a LabelEncoder on `y` and return it with the one-hot targets.

    Prints the number of classes and the integer codes along the way.
    Returns a tuple ``(encoder, one_hot_matrix)``.
    """
    label_encoder = LabelEncoder().fit(y)
    print(f'Number of classes: {len(label_encoder.classes_)}')

    # labels -> integer codes
    int_codes = label_encoder.transform(y)
    print(int_codes)

    # integer codes -> one-hot rows
    return label_encoder, tf.keras.utils.to_categorical(int_codes)

def decode_multiclass(onehot_y, encoder):
    """Convert one-hot rows back to labels with the supplied encoder.

    For every row, the position of its largest entry is handed to
    ``encoder.inverse_transform``; the resulting labels are returned as
    a list.
    """
    labels = []
    for row in onehot_y:
        winning_index = np.argmax(row)
        labels.append(encoder.inverse_transform([winning_index])[0])
    return labels

# Round trip: encode the sample labels, then decode the result again.
y = ['Benign', 'CWE-20', 'CWE-120', 'CWE-119', 'Benign', 'CWE-20']
# `y` is rebound to the second value returned by encode_multiclass.
encoder, y = encode_multiclass(y)

decode_multiclass(y, encoder)

In [None]:
y = ['Benign', 'CWE-20', 'CWE-120', 'CWE-119', 'Benign', 'CWE-20']
# Occurrences per label, most frequent first.
dist = pd.Series(y).value_counts()
# Number of distinct labels.
print(len(dist))
print(f'Distribution of targets: \n{dist}')