# Parse contract source code files into a pandas DataFrame

In [1]:
import os
import numpy as np
import pandas as pd
import re

In [2]:
# os.listdir returns directory entries in arbitrary order; sort them so that
# repeated runs visit the files in the same order.
files = sorted(os.listdir('./sol_source'))

In [41]:
# Read the existing contract list (no header row), drop rows that have no
# value in column 1, and give the first four columns descriptive names.
contract = (
    pd.read_csv("./Ethereum.csv", delimiter=',', header=None)
    .dropna(axis=0, subset=[1])
    .rename(columns={0:'timestamp', 1:'address', 2:'n_tr', 3:'ether'})
)
# Remove the first character of every address value.
contract['address'] = contract['address'].apply(lambda raw_address: raw_address[1:])
# Keep a single copy of rows that are identical in every column.
contract = contract.drop_duplicates()

In [4]:
# functions

def contract_name_extract(data):
    """Extract the contract declaration headers from source text.

    A header is the text between the ``contract`` keyword and the opening
    ``{`` of the declaration, e.g. ``Token is Owned, ERC20``.

    Args:
        data: Full source code of one file as a string.

    Returns:
        A tuple ``(contract_name, contract_name_string)``:

        * ``contract_name`` -- header of the first declaration found, as a
          string, or ``None`` when the text has no ``contract ... {``
          declaration.
        * ``contract_name_string`` -- all headers joined with spaces and
          stripped of punctuation when more than one declaration was found;
          otherwise the literal ``'only_1_contract'``.
    """
    # \b keeps words that merely end in "contract" from matching, and the
    # keyword must be followed by the header on the same line.  The single
    # pattern always captures the header, so no empty placeholder entries
    # appear for lines that only mention the word "Contract".
    headers = re.findall(r'\bcontract[ \t]+([^\n\r{]+?)\s*\{', data)
    if not headers:
        return (None, 'only_1_contract')
    if len(headers) > 1:
        contract_name_string = ' '.join(headers)
        contract_name_string = re.sub(r'[^\w\s]', '', contract_name_string)
    else:
        contract_name_string = 'only_1_contract'
    # Always return a plain string for the name, never the list of matches.
    return (headers[0], contract_name_string)

def function_name_extract(data):
    """Collect the function names of a source file into one string.

    For every line after the first that contains the word ``function``, the
    text following the last ``function`` on that line, up to the next ``(``
    or the end of the line, is taken as a name.

    Args:
        data: Full source code of one file as a string.

    Returns:
        The captured names joined with single spaces, with every character
        that is neither a word character nor whitespace removed.  Empty
        string when nothing matches.
    """
    # Raw string: the pattern contains regex escapes (\s, \() that are not
    # valid Python string escapes and trigger a warning in a plain literal.
    function_name = re.findall(r'[\n\r].*function\s*([^\n\r\(]*)', data)
    function_name_string = ' '.join(function_name)
    function_name_string = re.sub(r'[^\w\s]', '', function_name_string)
    return function_name_string

def comments_extract(data):
    """Extract all comment text from a source file as one line of words.

    Block comments (``/* ... */``) and line comments (``// ...``) are found
    in a single left-to-right scan, so a ``//`` that sits inside a block
    comment (for example in a URL) is not counted a second time.

    Args:
        data: Full source code of one file as a string.

    Returns:
        The text of all block comments followed by the text of all line
        comments, with punctuation replaced by spaces and every run of
        whitespace collapsed to a single space.  Empty string when the file
        has no comments.
    """
    # DOTALL lets a block comment span several lines; a line comment runs to
    # the end of its line, which also covers a last line without a newline.
    found = re.findall(r'/\*.*?\*/|//[^\n]*', data, flags=re.DOTALL)
    block_comments = [c for c in found if c.startswith('/*')]
    line_comments = [c for c in found if c.startswith('//')]
    comments = ' '.join(block_comments + line_comments)
    comments = re.sub(r'[^\w\s]', ' ', comments)
    # split() treats newlines as separators, so words on consecutive comment
    # lines stay apart instead of being glued together.
    return ' '.join(comments.split())

def code_no_punct(data):
    """Return the source text as space-separated tokens without punctuation.

    Args:
        data: Full source code of one file as a string.

    Returns:
        ``data`` with every character that is neither a word character nor
        whitespace replaced by a space, and every run of whitespace
        (including line breaks) collapsed to a single space.
    """
    codes = re.sub(r'[^\w\s]', ' ', data)
    # split() already treats newlines as separators; deleting them first
    # would fuse the last word of one line with the first word of the next.
    return ' '.join(codes.split())

In [5]:
# Empty frame with one column per field that is extracted from a source file.
column_names = [
    'address',
    'contract_name',
    'source_code',
    'contract_name_string',
    'function_names',
    'comments',
    'code_all_no_punct',
]
df = pd.DataFrame(columns=column_names)

In [6]:
# Parse every file listed in `files` into one row and add the rows to `df`.
rows = []     # one dict per parsed file; turned into a frame once at the end
skipped = []  # names of files that were left out of the result
for i, f in enumerate(files):
    if i%1000==0:
        print(i)  # progress marker every 1000 files
    try:
        with open(f'./sol_source/{f}') as file:
            data = file.read()
    except (OSError, UnicodeDecodeError):
        # Entry cannot be read as text (directory, permissions, undecodable
        # bytes): note it and carry on, without hiding unrelated errors.
        skipped.append(f)
        continue
    contract_name, contract_name_string = contract_name_extract(data)
    if isinstance(contract_name, list):
        # The name may arrive as a list of matches; store a plain scalar.
        contract_name = contract_name[0] if contract_name else None
    if contract_name is None:
        # Files in which no contract name was found are left out.
        skipped.append(f)
        continue
    rows.append({
        'address': f[:-4],  # file name without its last four characters
        'contract_name': contract_name,
        'source_code': data,
        'contract_name_string': contract_name_string,
        'function_names': function_name_extract(data),
        'comments': comments_extract(data),
        'code_all_no_punct': code_no_punct(data),
    })
# Build the frame in one step; concatenating inside the loop copies all
# previously collected rows on every iteration.
parsed = pd.DataFrame(rows, columns=df.columns)
df = parsed if df.empty else pd.concat([df, parsed], ignore_index=True)
print('Parsing finished')
if skipped:
    print(f'Skipped {len(skipped)} file(s)')
df.reset_index(drop=True, inplace=True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
Parsing finished


In [48]:
# Inner join on 'address': keep only the parsed files whose address also
# occurs in the loaded contract list, and attach that list's columns.
df_new = df.merge(contract, on='address', how='inner')

In [50]:
# Write the merged table to disk without the row index.
output_path = './contracts_source_codes.csv'
df_new.to_csv(output_path, index=False)