# Source Code Embeddings

In [8]:
import pandas as pd
import h5py
from pycparser import parse_file, c_ast, c_parser
import numpy as np
from collections import Counter
from collections.abc import Iterable

# Open and Explore Test, Train and Validation Datasets

In [5]:
def get_dataset(path):
  """Read an HDF5 file into a pandas DataFrame, one column per file key.

  The file's keys are printed first. The DataFrame is built while the file
  is still open, because the dict only holds references obtained from the
  open file handle.
  """
  with h5py.File(path, 'r') as hdf:
    # show which keys the file exposes
    print("Keys: %s" % hdf.keys())
    # key -> dataset reference, in the order the file reports its keys
    columns = {name: hdf[name] for name in list(hdf.keys())}
    frame = pd.DataFrame(columns)

  return frame


# Location of the training split, relative to the notebook's working directory.
train_path = "./dataset/VDISC_train.hdf5"

# Load the training split; get_dataset also prints the keys found in the file.
df_train = get_dataset(train_path)


Keys: <KeysViewHDF5 ['CWE-119', 'CWE-120', 'CWE-469', 'CWE-476', 'CWE-other', 'functionSource']>


In [6]:
df_train.head(5)

Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource
0,False,False,False,False,False,"clear_area(int startx, int starty, int xsize, ..."
1,False,False,False,False,False,ReconstructDuList(Statement* head)\n{\n Sta...
2,False,False,False,False,False,free_speaker(void)\n{\n if(Lengths)\n ...
3,False,False,False,False,False,mlx4_register_device(struct mlx4_dev *dev)\n{\...
4,True,True,False,False,True,"Parse_Env_Var(void)\n{\n char *p = getenv(""LI..."


# AST Generation

We use the pycparser library, a parser for the C language (C99),
to generate abstract syntax trees (ASTs)
of the source code.

In [9]:
# Parse a tiny C snippet and dump its AST as a sanity check of the parser.
text = " int main() { int a = 5, b = 2; printf(a+b); }"
parser = c_parser.CParser()
ast = parser.parse(text, filename='<none>')
# nodenames=True adds each child's field name (the <...> tags in the dump below).
ast.show(nodenames=True, )

FileAST: 
  FuncDef <ext[0]>: 
    Decl <decl>: main, [], [], []
      FuncDecl <type>: 
        TypeDecl <type>: main, []
          IdentifierType <type>: ['int']
    Compound <body>: 
      Decl <block_items[0]>: a, [], [], []
        TypeDecl <type>: a, []
          IdentifierType <type>: ['int']
        Constant <init>: int, 5
      Decl <block_items[1]>: b, [], [], []
        TypeDecl <type>: b, []
          IdentifierType <type>: ['int']
        Constant <init>: int, 2
      FuncCall <block_items[2]>: 
        ID <name>: printf
        ExprList <args>: 
          BinaryOp <exprs[0]>: +
            ID <left>: a
            ID <right>: b


In [10]:
class Node(object):
    """Generic tree node holding a value and its collection of children."""

    def __init__(self, val, children):
        # store both constructor arguments unchanged
        self.val, self.children = val, children


class TreeNode(object):
    """Binary tree node: a value plus `left` / `right` links that start as None."""

    def __init__(self, x):
        self.val = x
        # a new node has no children yet
        self.left = self.right = None

class ModifiedNodeVisitor(c_ast.NodeVisitor):
  """AST visitor that prints every node it reaches and records the nodes.

  `visit` prints the node's value list and then delegates to `generic_visit`,
  which appends the node to `self.lis` before descending into its children,
  so `self.lis` ends up in pre-order.

  NOTE(review): `visit` replaces the base-class implementation, so presumably
  no per-node-type `visit_*` dispatch happens here -- confirm against pycparser.
  """

  def __init__(self):
    super().__init__()
    # Per-instance list: a class-level list would be shared by every visitor
    # and keep growing across independent traversals.
    self.lis = []

  def visit(self, node):
    """Print the node's value list, then visit its subtree."""
    print(self.getNodeValue(node))

    return self.generic_visit(node)

  def generic_visit(self, node):
    """Record `node`, then visit each child (pre-order)."""
    self.lis.append(node)
    for c in node:
      print("Output of c.__class__.__name__: ", c.__class__.__name__)
      self.visit(c)

  def getNodeValue(self, node):
    """Return `[class name, *truthy attribute values]` for `node`.

    Attribute names come from `node.attr_names`; falsy values (None, '',
    empty lists, 0) are left out.
    """
    attributes = [node.__class__.__name__]  # token name always comes first
    try:
      for attr in node.attr_names:
        value = getattr(node, attr)
        if value:
          attributes.append(value)
    except AttributeError:
      # Best effort: objects without `attr_names` (or missing a declared
      # attribute) keep whatever was collected so far.
      pass

    return attributes

  def getChildren(self, node):
    """Return the direct children of `node` as a list."""
    return [child for child in node]

#v = ModifiedNodeVisitor()
#v.generic_visit(ast)
#print(len(v.lis))

# Transform M-ary Tree to Binary Tree

In [12]:
def getNodeValue(node):
  """Return `[class name, *truthy attribute values]` for an AST node.

  The attribute names are read from `node.attr_names`; attributes whose value
  is falsy (None, '', empty list, 0) are skipped.

  Objects without `attr_names`, or missing one of the declared attributes,
  are handled on a best-effort basis: whatever was collected so far is
  returned. Any other exception (including KeyboardInterrupt) propagates.
  """
  attributes = [node.__class__.__name__]  # the token name always comes first
  try:
    for attr in node.attr_names:
      value = getattr(node, attr)
      if value:
        attributes.append(value)
  except AttributeError:
    # only the "not a regular AST node" case is tolerated
    pass

  return attributes

def getChildren(node):
  """Collect the items produced by iterating `node` into a list."""
  return [child for child in node]

def encode(root):
  """Encode an n-ary AST as a binary tree of `TreeNode`s.

  The first child of a node is attached to the encoded node's `.right`;
  every further child is chained off the previous sibling's `.left`.
  Each `TreeNode.val` is the list returned by `getNodeValue`.

  Returns None when `root` is None.
  """
  if root is None:
    return None

  rootTreeNode = TreeNode(getNodeValue(root))

  # children are collected once and reused for both steps below
  children = getChildren(root)
  if not children:
    return rootTreeNode

  # first child hangs off the parent's right link
  rootTreeNode.right = encode(children[0])

  # remaining children form a chain through the left links of their siblings
  currTreeNode = rootTreeNode.right
  for child in children[1:]:
    currTreeNode.left = encode(child)
    currTreeNode = currTreeNode.left

  return rootTreeNode


In [13]:
# Encode only the first top-level definition of the demo AST (here the `main` FuncDef).
binaryTree  = encode(ast.ext[0])

In [14]:
def test_level_order(root):
    """
    :type root: Node
    :rtype: List[List[int]]
    """
    if root == None:
        return []
    result = []
    queue = []
    queue.append(root)
    level = 0
    while len(queue) > 0:
        size = len(queue)
        nodes_on_the_same_level = []
        # iterate the nodes on the same level
        print("Size at level {}: {}".format(level, size))
        level += 1
        for i in range(size):

                # add each node to an array
            temp = queue.pop(0)
            nodes_on_the_same_level.append(temp.val)
            # add its children to the queue
            if temp.left != None:
                queue.append(temp.left)
            if temp.right != None:
                queue.append(temp.right)
        result.append(nodes_on_the_same_level)
    #print(result)
    return result

test_level_order(binaryTree)

Size at level 0: 1
Size at level 1: 1
Size at level 2: 2
Size at level 3: 2
Size at level 4: 3
Size at level 5: 4
Size at level 6: 3
Size at level 7: 1
Size at level 8: 1
Size at level 9: 1
Size at level 10: 1


[[['FuncDef']],
 [['Decl', 'main']],
 [['Compound'], ['FuncDecl']],
 [['Decl', 'a'], ['TypeDecl', 'main']],
 [['Decl', 'b'], ['TypeDecl', 'a'], ['IdentifierType', ['int']]],
 [['FuncCall'],
  ['TypeDecl', 'b'],
  ['Constant', 'int', '5'],
  ['IdentifierType', ['int']]],
 [['ID', 'printf'], ['Constant', 'int', '2'], ['IdentifierType', ['int']]],
 [['ExprList']],
 [['BinaryOp', '+']],
 [['ID', 'a']],
 [['ID', 'b']]]

# Complete Binary Tree to Array Representation

In [15]:
def pad_list(lis):
  """Pad `lis` in place with 0.0 until it has three items, then return it.

  Lists that already hold three or more items come back unchanged.
  """
  missing = 3 - len(lis)
  if missing > 0:
    lis.extend([0.0] * missing)
  return lis

  

def preprocess_node(node):
  """Pad a tree node in place so it fits the fixed array layout.

  The node's `val` is padded via `pad_list`, and each missing child link is
  replaced by a placeholder `TreeNode([0.0, 0.0, 0.0])`.
  Returns the node, or None when `node` is None.
  """
  if node is not None:
    node.val = pad_list(node.val)
    for side in ("left", "right"):
      # only empty links get a zero placeholder
      if getattr(node, side) is None:
        setattr(node, side, TreeNode([0.0, 0.0, 0.0]))
    return node

def printLevelOrder(root, max_depth=9):
  """Return the tree's node values in level order, top level first.

  Despite the name nothing is printed: the values are collected into a list
  and returned.

  :param root: root node of the binary tree (or None)
  :param max_depth: number of levels to emit at most; the default 9 matches
                    the previously hard-coded cut-off
  :return: list of node `val` entries, level by level

  The height is measured once, before any padding. Collecting a level goes
  through `printCurrentLevel`, which pads the visited nodes in place, so the
  tree passed in is modified.
  """
  h = height(root)
  array_representation = []
  for level in range(1, min(h, max_depth) + 1):
    printCurrentLevel(root, level, array_representation)

  return array_representation
  
def printCurrentLevel(root, level, arr):
  """Append the values of all nodes on `level` (1 = `root` itself) to `arr`.

  Every node reached on the way is first passed through `preprocess_node`,
  so the nodes are padded in place. Nothing is printed.
  """
  root = preprocess_node(root)
  if root is None or level < 1:
    return

  if level == 1:
    arr.append(root.val)
    return

  # descend one level: left subtree first, then right
  for child in (root.left, root.right):
    printCurrentLevel(child, level - 1, arr)
  

def height(node):
    """Return the number of nodes on the longest path from `node` down to a leaf.

    An empty tree (None) has height 0.
    """
    if node is None:
        return 0
    # one for this node plus the taller of the two subtrees
    return 1 + max(height(node.left), height(node.right))


# Second name for the same tree object (no copy is made).
binary_tree = binaryTree
# Level-order values of the demo tree; this call pads the tree's nodes in place.
array_representation = printLevelOrder(binary_tree)

In [16]:
print(array_representation)

[['FuncDef', 0.0, 0.0], [0.0, 0.0, 0.0], ['Decl', 'main', 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], ['Compound', 0.0, 0.0], ['FuncDecl', 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], ['Decl', 'a', 0.0], [0.0, 0.0, 0.0], ['TypeDecl', 'main', 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], ['Decl', 'b', 0.0], ['TypeDecl', 'a', 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], ['IdentifierType', ['int'], 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], ['FuncCall', 0.0, 0.0], ['TypeDecl', 'b', 0.0], ['Constant', 'int'

In [17]:
import numpy as np
from collections.abc import Iterable

def flatten(l):
    """Lazily yield the leaves of an arbitrarily nested iterable, in order.

    Strings and bytes are treated as leaves and are not split into characters.
    """
    # explicit stack of iterators instead of recursive generators
    pending = [iter(l)]
    while pending:
        for el in pending[-1]:
            if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
                # descend into the nested iterable, resume this level afterwards
                pending.append(iter(el))
                break
            yield el
        else:
            # current level exhausted
            pending.pop()

#print('Original list', array_representation)
# Flatten the per-node value lists into one flat sequence and inspect it.
trf = list(flatten(array_representation))
print('Transformed list', trf)
print('Length of transformed list: ', len(trf))

Transformed list ['FuncDef', 0.0, 0.0, 0.0, 0.0, 0.0, 'Decl', 'main', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 'Compound', 0.0, 0.0, 'FuncDecl', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 'Decl', 'a', 0.0, 0.0, 0.0, 0.0, 'TypeDecl', 'main', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 'Decl', 'b', 0.0, 'TypeDecl', 'a', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 'IdentifierType', 'int', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 'FuncCall', 0.0, 0.0, 'TypeDecl', 'b', 0.0, 'Constant', 'int', '5', 'IdentifierType', 'int', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [None]:
from pycparser import c_parser
import json

def add_array_rep_column(dataframe, max_features=1533):
  """Append the flattened AST array representation to every parsable row.

  For each row the `functionSource` text is parsed with pycparser, the first
  top-level definition is encoded as a binary tree, and its level-order values
  are flattened and cut to `max_features` entries. The result row is the
  original row's values followed by those entries.

  :param dataframe: frame with a `functionSource` column (bytes or str)
  :param max_features: maximum number of flattened AST values kept per row;
                       the default 1533 is the previously hard-coded limit
                       (presumably 3 values x 511 nodes of a 9-level tree)
  :return: list of rows (lists); rows may be shorter than
           `len(dataframe.columns) + max_features` for small trees

  Rows whose source cannot be decoded, parsed or encoded are skipped, because
  many functions in the dataset are not compilable C. The frame passed in is
  only read.
  """
  df_list = dataframe.values.tolist()
  # positional access, so the frame does not need a 0..n-1 index
  sources = dataframe['functionSource'].tolist()
  results = []

  for i, text in enumerate(sources):
    if i%1000 == 0:
      print("iteration", i)
    try:
      if isinstance(text, bytes):
        text = text.decode("utf-8")
      parser = c_parser.CParser()
      ast_1 = parser.parse(text, filename='<none>')
      tree  = encode(ast_1.ext[0])
      arrRepUnmapped = list(flatten(printLevelOrder(tree)))
      results.append(df_list[i] + arrRepUnmapped[:max_features])
    except Exception:
      # Skip this row only; KeyboardInterrupt / SystemExit still stop the
      # (long-running) loop.
      continue

  return results

In [None]:
# Long-running: parses every training function; rows that fail are left out of the result.
df_train_array_rep = add_array_rep_column(df_train)

iteration 0
iteration 1000
iteration 2000
iteration 3000
iteration 4000
iteration 5000
iteration 6000
iteration 7000
iteration 8000
iteration 9000
iteration 10000
iteration 11000
iteration 12000
iteration 13000
iteration 14000
iteration 15000
iteration 16000
iteration 17000
iteration 18000
iteration 19000
iteration 20000
iteration 21000
iteration 22000
iteration 23000
iteration 24000
iteration 25000
iteration 26000
iteration 27000
iteration 28000
iteration 29000
iteration 30000
iteration 31000
iteration 32000
iteration 33000
iteration 34000
iteration 35000
iteration 36000
iteration 37000
iteration 38000
iteration 39000
iteration 40000
iteration 41000
iteration 42000
iteration 43000
iteration 44000
iteration 45000
iteration 46000
iteration 47000
iteration 48000
iteration 49000
iteration 50000
iteration 51000
iteration 52000
iteration 53000
iteration 54000
iteration 55000
iteration 56000
iteration 57000
iteration 58000
iteration 59000
iteration 60000
iteration 61000
iteration 62000
itera

# Create New Dataset

In [None]:
def put_columns(dataframe):
  """Wrap the row data in a DataFrame with the label and feature column names.

  The first six columns are the five CWE labels and `functionSource`; they are
  followed by the feature columns named "1" through "1533".
  """
  label_columns = ["CWE-119", "CWE-120", "CWE-469", "CWE-476", "CWE-other", "functionSource"]
  feature_columns = [str(position) for position in range(1, 1534)]
  return pd.DataFrame(dataframe, columns=label_columns + feature_columns)

In [None]:
# Name the columns of the parsed training rows (labels, source, features "1".."1533").
df_train_raw = put_columns(df_train_array_rep)

In [None]:
df_train_raw.head(5)
#df_validate_raw.head(5)

Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
0,False,False,False,False,False,b'timeoutProtoDisplays(void)\n{\n struct pr...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,False,False,False,b'smsc911x_rx_readfifo_shift(struct smsc911x_d...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,False,False,False,False,False,b'technisat_usb2_probe(struct usb_interface *i...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,False,False,False,False,b'hasBorder()\n{\n\tif (entry)\n\t\treturn gtk...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,False,False,False,False,False,b'log_off(struct session *ses)\n{\n fclose(...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def fillna(dataframe):
  """Replace missing values with 0.0 when the frame contains any.

  A frame without missing values is returned as-is (same object); otherwise a
  message is printed and a filled copy is returned.
  """
  if not dataframe.isnull().values.any():
    return dataframe

  print("datasets contain null/empty values... filling them with 0.0")
  return dataframe.fillna(0.0)

In [None]:
# NOTE(review): despite "test" in the name, this frame is derived from df_train_raw.
df_test_raw_complete  = fillna(df_train_raw)
# Should display False once the missing values have been filled.
df_test_raw_complete.isnull().values.any()

In [None]:
print("Length of Compilable Raw Test Dataset: ", len(df_test_raw_complete))
#print("Length of Compilable Raw Validation Dataset: ", len(df_validation_raw_complete))

Length of Compilable Raw Test Dataset:  15315
Length of Compilable Raw Validation Dataset:  15235


# Creating Dataset to review "Impact of Depth" section

In [None]:
#path = "/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/Train/cwe_120_train.csv"
# NOTE(review): hard-coded absolute path (looks like a mounted Google Drive) -- this cell
# reads a previously saved CSV instead of reusing the frames built above; confirm provenance.
path = '/content/drive/MyDrive/Vulnerability Prediction/Raw Datasets/df_train_raw.csv'
df= pd.read_csv(path)

In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
0,10,True,True,False,False,False,b'draw_keys (int draw)\n{\n char s...,12,0,0,...,0,0,0,0,0,0,0,0,0,0
1,91,False,True,False,False,False,b'mystrdup(char *s)\n{\n char *dup;\n\n ...,12,0,0,...,0,0,0,0,0,0,0,0,0,0
2,103,False,True,False,False,False,"b'nfs_iob_get_fh ( struct io_buffer *io_buf, s...",12,0,0,...,0,0,0,0,0,0,0,0,0,0
3,120,True,True,False,False,False,"b""check_printable(char *id, int maxlen)\n{\n ...",12,0,0,...,0,0,0,0,0,0,0,0,0,0
4,129,True,True,False,False,False,"b'variantSet(enum VariantType type, void *valu...",12,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df['CWE-120'].value_counts()

True     5159
False    5159
Name: CWE-120, dtype: int64

In [None]:
def filter_rows(dataframe, column, value):
  """Return the rows of `dataframe` whose `column` equals `value`.

  All columns are kept and the original index labels are preserved.
  """
  return dataframe[dataframe[column] == value].iloc[:, :]


In [None]:
def undersample_df(dataframe, column, random_state=None):
  """Build a class-balanced subset for one label by undersampling negatives.

  All rows where `column` is True are kept, and the same number of rows where
  it is False are drawn at random without replacement.

  :param dataframe: frame containing the boolean label `column`
  :param column: name of the label column to balance on
  :param random_state: seed forwarded to `DataFrame.sample`; pass an int to
                       make the draw reproducible (default None keeps the
                       previous unseeded behaviour)
  :return: positives followed by the sampled negatives (not shuffled)

  NOTE(review): when there are fewer negatives than positives, the sampling
  call is expected to raise ValueError -- confirm against the pandas docs.
  """
  cwe_true = filter_rows(dataframe, column, True)
  print(f'Length of {column} Positive Dataset: {len(cwe_true)}')

  cwe_false = filter_rows(dataframe, column, False)
  print(f'Randomly sampling {len(cwe_true)} instances from Negative Dataset')

  cwe_false_sample = cwe_false.sample(n=len(cwe_true), random_state=random_state)
  print('Concatenating Positive and Negative instances')

  complete_cwe_raw = pd.concat([cwe_true, cwe_false_sample], axis=0)
  print(f'Length of complete {column} undersampled dataset: { len(complete_cwe_raw)}\n')

  return complete_cwe_raw

In [None]:
# One balanced subset per CWE label.
# NOTE(review): the variables are named *_test but are built from `df`, which was read
# from df_train_raw.csv above; the sampling is unseeded, so re-runs pick different negatives.
cwe_119_test = undersample_df(df, 'CWE-119')
cwe_120_test = undersample_df(df, 'CWE-120')
cwe_469_test = undersample_df(df, 'CWE-469')
cwe_476_test = undersample_df(df, 'CWE-476')
cwe_other_test = undersample_df(df, 'CWE-other')

Length of CWE-119 Positive Dataset: 357
Randomly sampling 357 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-119 undersampled dataset: 714

Length of CWE-120 Positive Dataset: 688
Randomly sampling 688 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-120 undersampled dataset: 1376

Length of CWE-469 Positive Dataset: 32
Randomly sampling 32 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-469 undersampled dataset: 64

Length of CWE-476 Positive Dataset: 141
Randomly sampling 141 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-476 undersampled dataset: 282

Length of CWE-other Positive Dataset: 404
Randomly sampling 404 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-other undersampled dataset: 808



In [None]:
cwe_119_test.head(3)

Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
5,True,True,False,False,False,"b'get_nth_ancestor(const char *name, int len,\...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,True,True,False,False,False,"b'smk_bu_current(char *note, struct smack_know...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,True,True,True,False,False,"b'chirp_audit_recursive(const char *path, stru...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# saving the dataframe
#cwe_119_test.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_119_test_raw.csv')
#cwe_120_test.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_120_test_raw.csv')
#cwe_469_test.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_469_test_raw.csv')
#cwe_476_test.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_476_test_raw.csv')
#cwe_other_test.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_other_test_raw.csv')

Repeating the same procedure for the validation dataset

In [None]:
#cwe_119_validation = undersample_df(df_validation_raw_complete, 'CWE-119')
#cwe_120_validation = undersample_df(df_validation_raw_complete, 'CWE-120')
#cwe_469_validation = undersample_df(df_validation_raw_complete, 'CWE-469')
#cwe_476_validation = undersample_df(df_validation_raw_complete, 'CWE-476')
#cwe_other_validation = undersample_df(df_validation_raw_complete, 'CWE-other')

Length of CWE-119 Positive Dataset: 339
Randomly sampling 339 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-119 undersampled dataset: 678

Length of CWE-120 Positive Dataset: 646
Randomly sampling 646 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-120 undersampled dataset: 1292

Length of CWE-469 Positive Dataset: 36
Randomly sampling 36 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-469 undersampled dataset: 72

Length of CWE-476 Positive Dataset: 147
Randomly sampling 147 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-476 undersampled dataset: 294

Length of CWE-other Positive Dataset: 423
Randomly sampling 423 instances from Negative Dataset
Concatenating Positive and Negative instances
Length of complete CWE-other undersampled dataset: 846



In [None]:
#cwe_119_validation.head(4)

Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
105,True,True,False,False,True,b'table_code(char *table)\n{\n\tint i;\n\tchar...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,True,True,False,False,True,b'login_allowed(char *tty)\n{\n\tFILE *fp;\n\t...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
220,True,True,False,False,True,b'dump_pid (void)\n{\n FILE *f;\n char...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
306,True,True,False,False,False,"b'margins_command(char *arg,struct session *se...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# saving the dataframe
#cwe_119_validation.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_119_validation_raw.csv')
#cwe_120_validation.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_120_validation_raw.csv')
#cwe_469_validation.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_469_validation_raw.csv')
#cwe_476_validation.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_476_validation_raw.csv')
#cwe_other_validation.to_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/cwe_other_validation_raw.csv')

# Encode Features

In [None]:
# NOTE: same helper as the `flatten` defined in an earlier cell; this definition shadows it.
def flatten(l):
    """Yield the leaves of a nested iterable depth-first, left to right.

    Strings and bytes count as leaves, so they are yielded whole.
    """
    for el in l:
        is_leaf = isinstance(el, (str, bytes)) or not isinstance(el, Iterable)
        if is_leaf:
            yield el
            continue
        # nested container: emit its leaves before moving on
        yield from flatten(el)

def get_words(dataframe):
  """Return every cell value of `dataframe` as one flat list, row by row.

  Nested lists inside cells are flattened as well.
  """
  return list(flatten(dataframe.values.tolist()))

def filter0(variable):
  """Return False for values equal to zero (0 or 0.0), True for anything else."""
  return variable not in [0.0, 0]

def get_word_mapping(dataframe):
  """Build a word -> index vocabulary from all values in `dataframe`.

  Every value is converted to its string form and counted; index 0 goes to the
  most frequent string, 1 to the next, and so on. Strings with equal counts
  keep the order in which they were first seen.
  """
  counts = Counter(str(word) for word in get_words(dataframe))
  # most_common() lists the entries by descending count
  return {word: index for index, (word, _) in enumerate(counts.most_common())}

In [None]:
def transform_features(feature, mapping):
  """Replace every cell of `feature` by its index from `mapping`.

  :param feature: DataFrame of raw feature values
  :param mapping: dict from the string form of a value to its numeric index
  :return: new DataFrame with the same index and columns as `feature`,
           holding the mapped indices; values missing from `mapping` become NaN

  The input frame is left untouched: the work happens on the string-typed
  copy returned by `astype(str)`.
  """
  result = feature.astype(str)  # new frame; cell values become strings
  for col in result.columns:
    # convert the string form of each value to its numerical mapping
    result[col] = result[col].map(mapping)

  return result

def process_raw_df(dataframe):
  """Encode the feature columns of a raw frame as vocabulary indices.

  The first six columns are treated as labels (per the notebook's raw frames:
  five CWE flags plus `functionSource`) and kept unchanged; all remaining
  columns are features and are replaced by their word indices.

  :return: labels and encoded features joined side by side on the index

  NOTE(review): the vocabulary is built from the frame passed in, so two
  frames processed separately will generally get different index assignments
  -- confirm this is intended for train/validation/test splits.
  """
  # explicit copy, so encoding never writes into a slice of the caller's frame
  features = dataframe.iloc[:, 6:].copy()
  labels = dataframe.iloc[:, :6]

  wordsToIndex = get_word_mapping(features)
  transformed_features = transform_features(features, wordsToIndex)

  result = pd.concat([labels, transformed_features], axis=1, join='inner')

  return result


In [None]:
def save_df(dataframe, path):
  """Encode a raw frame with `process_raw_df` and write the result to `path` as CSV.

  Before writing, it prints whether the encoded frame contains missing values;
  False is the expected answer when every value was mapped.
  """
  processed = process_raw_df(dataframe)

  # sanity check on the encoding -- should print False
  has_null = processed.isnull().values.any()
  print(f"Does data contain null values?: {has_null}")

  processed.to_csv(path)



In [None]:
# Reload the per-CWE raw training frames saved earlier; index_col = 0 restores the saved index.
# NOTE(review): hard-coded absolute paths -- adjust to the local data location before running.
cwe_119_train = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 119/cwe_119_train_raw.csv', index_col = 0)
cwe_120_train = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 120/cwe_120_train_raw.csv', index_col = 0)
cwe_469_train =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 469/cwe_469_train_raw.csv', index_col = 0)
cwe_476_train =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 476/cwe_476_train_raw.csv', index_col = 0)
cwe_other_train =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE Other/cwe_other_train_raw.csv', index_col = 0)

cwe_119_train.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
10,True,True,False,False,False,b'draw_keys (int draw)\n{\n char s...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,True,True,False,False,False,"b""check_printable(char *id, int maxlen)\n{\n ...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
129,True,True,False,False,False,"b'variantSet(enum VariantType type, void *valu...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161,True,True,False,False,False,"b'ccid_error(int error, const char *file, int ...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,True,True,False,False,False,"b'verify_absent_1(struct cache_entry *ce,\n\t\...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Encode and save each per-CWE training frame.
# NOTE(review): the output directory variable is called `test_path` although train files are written.
test_path = '/content/drive/MyDrive/Vulnerability Prediction/'
save_df(cwe_119_train, test_path+'cwe_119_train.csv')
save_df(cwe_120_train, test_path+'cwe_120_train.csv')
save_df(cwe_469_train, test_path+'cwe_469_train.csv')
save_df(cwe_476_train, test_path+'cwe_476_train.csv')
save_df(cwe_other_train, test_path+'cwe_other_train.csv')

Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False


In [None]:


#cwe_119_validation = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 119/cwe_119_validation_raw.csv', index_col = 0)
#cwe_120_validation = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 120/cwe_120_validation_raw.csv', index_col = 0)
#cwe_469_validation =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 469/cwe_469_validation_raw.csv', index_col = 0)
#cwe_476_validation =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 476/cwe_476_validation_raw.csv', index_col = 0)
#cwe_other_validation =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE Other/cwe_other_validation_raw.csv', index_col = 0)

#cwe_119_validation.head(5)



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
105,True,True,False,False,True,b'table_code(char *table)\n{\n\tint i;\n\tchar...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,True,True,False,False,True,b'login_allowed(char *tty)\n{\n\tFILE *fp;\n\t...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
220,True,True,False,False,True,b'dump_pid (void)\n{\n FILE *f;\n char...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
306,True,True,False,False,False,"b'margins_command(char *arg,struct session *se...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
307,True,True,False,False,False,b'disconnect_client(int sock)\n{\n\tchar User[...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#validation_path = '/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/try/validation/'

#save_df(cwe_119_validation, validation_path+'cwe_119_validation.csv')
#save_df(cwe_120_validation, validation_path+'cwe_120_validation.csv')
#save_df(cwe_469_validation, validation_path+'cwe_469_validation.csv')
#save_df(cwe_476_validation, validation_path+'cwe_476_validation.csv')
#save_df(cwe_other_validation, validation_path+'cwe_other_validation.csv')

Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False


In [None]:
#test_path = '/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/try/test'

#cwe_119_test = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 119/cwe_119_test_raw.csv', index_col = 0)
#cwe_120_test = pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 120/cwe_120_test_raw.csv', index_col = 0)
#cwe_469_test =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 469/cwe_469_test_raw.csv', index_col = 0)
#cwe_476_test =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE 476/cwe_476_test_raw.csv', index_col = 0)
#cwe_other_test =  pd.read_csv('/content/drive/MyDrive/Spring 2022 Courses/Secure Software Engineering/Vulnerability Project/raw datasets/CWE Other/cwe_other_test_raw.csv', index_col = 0)

#cwe_119_test.head(5)


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,CWE-119,CWE-120,CWE-469,CWE-476,CWE-other,functionSource,1,2,3,4,...,1524,1525,1526,1527,1528,1529,1530,1531,1532,1533
5,True,True,False,False,False,"b'get_nth_ancestor(const char *name, int len,\...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,True,True,False,False,False,"b'smk_bu_current(char *note, struct smack_know...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,True,True,True,False,False,"b'chirp_audit_recursive(const char *path, stru...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79,True,True,False,False,True,"b'main(int argc, char *argv[])\n{\n\tchar dumm...",FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,True,True,False,False,False,b'usnic_transport_put_socket(struct socket *so...,FuncDef,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#save_df(cwe_119_test, test_path+'cwe_119_test.csv')
#save_df(cwe_120_test, test_path+'cwe_120_test.csv')
#save_df(cwe_469_test, test_path+'cwe_469_test.csv')
#save_df(cwe_476_test, test_path+'cwe_476_test.csv')
#save_df(cwe_other_test, test_path+'cwe_other_test.csv')

Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
Does data contain null values?: False
