## Queries on CVE records for the extraction of IoT related referenced repositories:

In [1]:
import collections
import pandas as pd
from matplotlib import pyplot as plt
import json 
import ast
import re
import os
import requests

df = pd.read_csv('data/cve-records.csv')
des_str = df['description'][0]

  df = pd.read_csv('data/cve-records.csv')


### Search Query: 
"Internet of Things" OR "IoT" OR "Industry 4.0" OR "smart cities" OR "smart city"OR "smart contract" OR "manufacturing" OR "energy" OR "supply chain" 

In [None]:
def get_description(des_str):
    if des_str!=None or des_str!='':
        des_arr_dict = ast.literal_eval(des_str)
        des_cve = ""    #description of a CVE-> 'value' from array of dict.

        for dic in des_arr_dict:
            des_cve = des_cve + dic['value']
        return des_cve
        
    else:
        print('Empty description for CVE: ')
        return 0

def get_iot_cves(df)
    iot_set = ["Internet of Things", "IoT", "Industry 4.0", 
                "smart cities", "smart city", "smart contract", 
                "manufacturing", "energy", "supply chain", "orange pi", "banana pi", "arduino"]
    iot_cves = []

    for row in range(len(df)):
        des_cve = get_description(df['description'][row])
        
        # print if they are IoT related descriptions
        for x in iot_set:
            if x.lower() in des_cve.lower():
                # print(des_cve)
                # print(df['cve_id'][row])
                iot_cves.append(df['cve_id'][row])
                # print(df['reference_json'][row])
                # print('\n')
    return iot_cves
        
print('count_cves:', len(iot_cves))

count_cves: 2175


In [None]:
df_iot = df[df.cve_id.isin(iot_cves)]
len(df_iot)

2167

In [None]:
iot_vcs = ['github', 'bitbucket', 'gitlab']
vcs_list = []

for ref_str in df_iot.reference_json:
    url_dict  = ast.literal_eval(ref_str)
    
    if len(url_dict) > 0:
        for ref in url_dict:
            vcs_list.append(ref['url'])     

In [None]:
url_heads = []

for x in vcs_list:
    url = re.match('http://www.*.com/', x)
    if url!=None:
        url_heads.append(url[0])

## Vulnerabilty reporting databases and number of their occurances in CVEs

In [None]:
url_freq = collections.Counter(url_heads)
df_url = pd.DataFrame(url_freq.items(), columns=['urls', 'count'])
df_url = df_url.sort_values(by=['count'], ascending=False)
df_url.to_csv('result/top-databases.csv', index=False, sep=';')
df_url

Unnamed: 0,urls,count
0,http://www.securityfocus.com/,222
8,http://www.oracle.com/,177
2,http://www.securitytracker.com/,127
3,http://www.mandriva.com/,25
14,http://www.zerodayinitiative.com/,24
5,http://www.ubuntu.com/,14
1,http://www.vupen.com/,11
12,http://www.openwall.com/,6
6,http://www.exploit-db.com/,5
16,http://www.ibm.com/,4


In [None]:
x = requests.get('http://www.vupen.com/english/advisories/2006/2835')

print(x.status_code)
print(x.headers['content-type'])
print(x.encoding)
print(x.url)

200
text/html
ISO-8859-1
https://www.ovhcloud.com/en-gb/mail/


## Crawl project directories for source-code files and scan them for vulnerabilities.

In [None]:
import pathlib
from os import walk

prj_dir = 'data/projects/contiki-2.4/'

[p for p in pathlib.Path(prj_dir).iterdir()]

[PosixPath('data/projects/contiki-2.4/tools'),
 PosixPath('data/projects/contiki-2.4/.DS_Store'),
 PosixPath('data/projects/contiki-2.4/core'),
 PosixPath('data/projects/contiki-2.4/README-EXAMPLES'),
 PosixPath('data/projects/contiki-2.4/cpu'),
 PosixPath('data/projects/contiki-2.4/platform'),
 PosixPath('data/projects/contiki-2.4/README-BUILDING'),
 PosixPath('data/projects/contiki-2.4/README'),
 PosixPath('data/projects/contiki-2.4/examples'),
 PosixPath('data/projects/contiki-2.4/Makefile.include'),
 PosixPath('data/projects/contiki-2.4/doc'),
 PosixPath('data/projects/contiki-2.4/apps')]

In [None]:
def get_filepaths(directory):
    """
    This function will generate the file names in a directory 
    tree by walking the tree either top-down or bottom-up. For each 
    directory in the tree rooted at directory top (including top itself), 
    it yields a 3-tuple (dirpath, dirnames, filenames).
    # ref: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
    """
    file_paths = []  # List which will store all of the full filepaths.

    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.

    return file_paths  # Self-explanatory.

prj_files = get_filepaths(prj_dir)
print('Number of files in the project: ', len(prj_files))

### Guess programming language and scan only C programs

In [None]:
from guesslang import Guess

guess = Guess()

# Guess the language from code
for file in prj_files:
    # code = file.read()
    with open(file, 'r', encoding= 'unicode_escape') as f:
        language = guess.language_name(f.read())
        print(language)

Rust
Markdown
CMake
Markdown
Makefile
TeX
Shell
Perl
Perl
YAML
Shell
Shell
C++
C
Perl
Shell
Shell
C++
Shell
Shell
C
Shell
C
C
Shell
Shell
Perl


UnicodeDecodeError: 'unicodeescape' codec can't decode bytes in position 31267-31268: truncated \UXXXXXXXX escape

In [None]:
eg_file = 'data/projects/contiki-2.4/tools/tunslip.c'
guess.language_name(eg_file)

'INI'

In [None]:
with open(eg_file, 'r') as f:
    print(guess.language_name(f.read()))

C


In [None]:
!flawfinder 'data/projects/contiki-2.4/tools/tunslip.c' --csv 'file.csv'

Flawfinder version 2.0.19, (C) 2001-2019 David A. Wheeler.
Number of rules (primarily dangerous function names) in C/C++ ruleset: 222
Examining data/projects/contiki-2.4/tools/tunslip.c

FINAL RESULTS:

data/projects/contiki-2.4/tools/tunslip.c:438:  [4] (format) vsnprintf:
  If format strings can be influenced by an attacker, they can be exploited,
  and note that sprintf variations do not always \0-terminate (CWE-134). Use
  a constant for the format specification.
data/projects/contiki-2.4/tools/tunslip.c:442:  [4] (shell) system:
  This causes a new program to execute and is difficult to use safely
  (CWE-78). try using a library call that implements the same functionality
  if available.
data/projects/contiki-2.4/tools/tunslip.c:753:  [4] (buffer) strcat:
  Does not check for buffer overflows when concatenating to destination
  [MS-banned] (CWE-120). Consider using strcat_s, strncat, strlcat, or
data/projects/contiki-2.4/tools/tunslip.c:785:  [4] (buffer) strcpy:
  Does not check 