## Queries on CVE records for the extraction of IoT related referenced repositories:

In [1]:
import collections
import pandas as pd
from matplotlib import pyplot as plt
import json 
import ast
import re
import os
import csv
import subprocess
import requests
import tempfile
from io import BytesIO, StringIO
from zipfile import ZipFile
from guesslang import Guess

In [2]:
df = pd.read_csv('../data/cve-records.csv')
des_str = df['description'][0]

  df = pd.read_csv('../data/cve-records.csv')


### Search Query: 
"Internet of Things" OR "IoT" OR "Industry 4.0" OR "smart cities" OR "smart city"OR "smart contract" OR "manufacturing" OR "energy" OR "supply chain" 

In [3]:
def get_description(des_str):
    if des_str!=None or des_str!='':
        des_arr_dict = ast.literal_eval(des_str)
        des_cve = ""    #description of a CVE-> 'value' from array of dict.

        for dic in des_arr_dict:
            des_cve = des_cve + dic['value']
        return des_cve
        
    else:
        print('Empty description for CVE: ')
        return 0

def get_iot_cves(df):
    iot_set = ["Internet of Things", "IoT", "Industry 4.0", 
                "smart cities", "smart city", "smart contract", 
                "manufacturing", "energy", "supply chain", "orange pi", "banana pi", "arduino"]
    iot_cves = []

    for row in range(len(df)):
        des_cve = get_description(df['description'][row])
        
        # print if they are IoT related descriptions
        for x in iot_set:
            if x.lower() in des_cve.lower():
                # print(des_cve)
                # print(df['cve_id'][row])
                iot_cves.append(df['cve_id'][row])
                # print(df['reference_json'][row])
                # print('\n')
    return iot_cves

iot_cves = get_iot_cves(df)
print('count_cves:', len(iot_cves))

count_cves: 2175


In [4]:
df_iot = df[df.cve_id.isin(iot_cves)]
len(df_iot)

2167

In [5]:
iot_vcs = ['github', 'bitbucket', 'gitlab']
vcs_list = []

for ref_str in df_iot.reference_json:
    url_dict  = ast.literal_eval(ref_str)
    
    if len(url_dict) > 0:
        for ref in url_dict:
            vcs_list.append(ref['url'])     

## Vulnerabilty reporting databases and number of their occurances in CVEs

In [6]:
url_freq = collections.Counter(url_heads)
df_url = pd.DataFrame(url_freq.items(), columns=['urls', 'count'])
df_url = df_url.sort_values(by=['count'], ascending=False)
df_url.to_csv('../result/top-databases.csv', index=False, sep=';')
df_url.head(5)

NameError: name 'url_heads' is not defined

## Crawl project directories for source-code files and scan them for vulnerabilities.

In [None]:
import pathlib
from os import walk

prj_dir = '../data/projects/contiki-2.4/'

[p for p in pathlib.Path(prj_dir).iterdir()]

[PosixPath('../data/projects/contiki-2.4/tools'),
 PosixPath('../data/projects/contiki-2.4/.DS_Store'),
 PosixPath('../data/projects/contiki-2.4/core'),
 PosixPath('../data/projects/contiki-2.4/README-EXAMPLES'),
 PosixPath('../data/projects/contiki-2.4/cpu'),
 PosixPath('../data/projects/contiki-2.4/platform'),
 PosixPath('../data/projects/contiki-2.4/README-BUILDING'),
 PosixPath('../data/projects/contiki-2.4/README'),
 PosixPath('../data/projects/contiki-2.4/examples'),
 PosixPath('../data/projects/contiki-2.4/Makefile.include'),
 PosixPath('../data/projects/contiki-2.4/doc'),
 PosixPath('../data/projects/contiki-2.4/apps')]

In [None]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple (dirpath, dirnames, filenames).
    # ref: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    """
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths  # Self-explanatory.

prj_files = get_filepaths(prj_dir)
print('Number of files in the project: ', len(prj_files))

Number of files in the project:  1992


# Fetching list of C/C++ files from zip file of the project url. 

### Guess programming language and scan only C programs

In [8]:
def check_internet(url):
    response = requests.get(url)
    return True if response.status_code < 400 else False
    
    
def retrieve_zip(url):
    """ Fetching list of C/C++ files from zip file of the project url. 
    """
    if check_internet(url):
        r = requests.get(url)
        # BytesIO keeps the file in memory
        return ZipFile(io.BytesIO(r.content))  
    else:
        print('Internet is not working!')
        return None


def guess_pl(file, zip_obj=None):
    """ guess programming language of the input file. 
    """ 
    guess = Guess()
    if zip_obj is not None:
        # extract a specific file from the zip container
        with zip_obj.open(file, 'r') as f:
            lang = guess.language_name(f.read())
    else:
        with open(file, 'r', encoding= 'unicode_escape') as f:
            lang = guess.language_name(f.read())
    return lang

# # Example code:
# file = '../data/projects/contiki-2.4/tools/tunslip.c'
# path = '../data/projects/contiki-2.4/tools/'
# # cmd = 'flawfinder --csv --inputs ' + path + ' >> output.csv'
# cmd = 'flawfinder --csv --inputs ' + path
# process = subprocess.Popen(cmd,  shell=True, stdout=subprocess.PIPE)
# output = process.stdout.read()
# df=pd.read_csv(StringIO(str(output,'utf-8')))
# df.head()

In [9]:
def find_flaw(file_or_dir):
    """ find flaws ini the file using flawfinder tool
    return : flawfinder output as a CSV file.
    Usage: cmd = 'flawfinder --csv --inputs ' + path + ' >> output.csv'
    """
    if os.path.isfile(file_or_dir):
        cmd = 'flawfinder --csv ' + file_or_dir
    elif ps.path.isdir(file_or_dir):
        cmd = 'flawfinder --csv --inputs ' + file_or_dir
        
    process = subprocess.Popen(cmd,  shell=True, stdout=subprocess.PIPE)
    output = process.stdout.read()
    return pd.read_csv(StringIO(str(output,'utf-8')))
    

def file2df(file, zip_obj=None):
    """ convert zipped file stream - tempfile to pandas dataframe. 
    """
    file_content = ''
    
    if zip_obj:
        # io.StringIO(sf.read().decode("utf-8")).read()
        with zip_obj.open(file) as fc:
            # file_content = fc.read().encode('UTF-8')
            file_content = fc.read()
    else:
        with open(file) as fc:
            file_content = fc.read().encode('UTF-8')

    fp = tempfile.NamedTemporaryFile(suffix='_Flawfinder',
                                    prefix='Filename_')
    # deal with the temp file of extracted zipped file
    try:
        fp.write(file_content)
        fp.seek(0)  # move reader's head to the initial point of the file. 
        file_name = fp.name
        df = find_flaw(file_name)
    except OSError:
        print("Could not open/read file:", fp)
        sys.exit(1)
    finally:
        fp.close()
    return df


url  = 'https://sourceforge.net/projects/contiki/files/Contiki/Contiki%202.4/contiki-sky-2.4.zip/download'
zipobj = retrieve_zip(url)

files = zipobj.namelist() 
selected_files = [x for x in files if guess_pl(x, zipobj) in ['C', 'C++']]
# concatenate all the output dataframes of all the files
df_composite = pd.concat([file2df(selected_files[i], zipobj) for i in range(len(selected_files))])
print(df_composite.shape)

NameError: name 'io' is not defined

In [None]:
zipobj = retrieve_zip(url)

files = zipobj.namelist() 
selected_files = [x for x in files if guess_pl(x, zipobj) in ['C', 'C++']]
# concatenate all the output dataframes of all the files
df_composite = pd.concat([file2df(selected_files[i], zipobj) for i in range(len(selected_files))])
print(df_composite.shape)

NameError: name 'retrieve_zip' is not defined

## Saving prj url to a folder

In [11]:
from urllib.request import urlopen
from zipfile import ZipFile

url  = 'https://sourceforge.net/projects/contiki/files/Contiki/Contiki%202.4/contiki-sky-2.4.zip/download'

zipresp = urlopen(url)

In [12]:
zipresp

<http.client.HTTPResponse at 0x7f8c20356070>