In [76]:
# imports
import os
import io
import re
# import tokenize
# import json
# import numpy as np
import pandas as pd

In [77]:
# paths
# Both directories can be overridden through environment variables so the
# notebook is not tied to a single machine; the defaults are the original
# locations. os.path.join(path, '') guarantees a trailing separator, which the
# file-index cell relies on when it strips the `data_path` prefix from every
# directory returned by os.walk.
data_path = os.path.join(
    os.environ.get('SOL_DATA_PATH', '/home/ourownstory/Documents/SOL/data/'), '')
# zeppelin_folder = '/home/ourownstory/Documents/SOL/data/Zeppelin/token/'
# os.listdir(data_path)
# os.listdir(zeppelin_folder)
out_path = os.path.join(
    os.environ.get('SOL_OUT_PATH', '/home/ourownstory/Documents/SOL/derived/'), '')

In [78]:
# create a pandas dataframe with the filenames in our dataset

df_files_dict = {
    'root': [],
    'file_name': [],
}
# NOTE: the per-directory file list gets its own name (`file_names`); binding it
# to `df_files` would shadow the dataframe that is built right below.
for root, _subdirs, file_names in os.walk(data_path):
    for file_name in file_names:
        # store the directory relative to data_path: <class>/<company>/<rest...>
        df_files_dict['root'].append(root[len(data_path):])
        df_files_dict['file_name'].append(file_name)

df_files = pd.DataFrame.from_dict(df_files_dict)

# split the relative directory into class, company and the remaining sub-path;
# files that sit above the <class>/<company> level get '' for the missing part
# instead of raising IndexError when popping from an empty list
root_list = [root.split('/') for root in df_files['root'].values]
df_files['class'] = [r.pop(0) if r else '' for r in root_list]
df_files['company'] = [r.pop(0) if r else '' for r in root_list]
df_files['root'] = ["/".join(r) for r in root_list]
df_files['extension'] = [e.split('.')[-1] for e in df_files['file_name'].values]

# filter all files that are not .sol (pop also drops the helper column);
# .copy() makes the result an independent frame, so later column assignments
# do not write into a slice of the unfiltered frame
is_sol = df_files.pop('extension') == 'sol'
df_files = df_files[is_sol].copy()

df_files.to_csv(os.path.join(out_path, 'df_files.csv'))


In [79]:
# short analysis: for every column, tally how often each distinct value occurs
# keep the tallies in `unique_counts` and write each one to its own csv

unique_counts = {}
for col in df_files:
    col_counts = df_files[col].value_counts(sort=True)
    col_counts.to_csv(os.path.join(out_path, 'counts_{}.csv'.format(col)))
    unique_counts[col] = col_counts.to_dict()

# TODO: make plot ICO vs notICO

In [80]:
# read in contents of files as string

# including comments
def read_src_as_string(file_name):
    """Return the complete text content of `file_name`, comments included."""
    with open(file_name, 'r') as src_file:
        contents = src_file.read()
    return contents

# strips pragma statements and comments
# note: does not strip in-line comments, we assume these do not exist.
def read_src_as_string_nocomments(file_name):
    """Return the text of `file_name` without pragma and comment lines.

    A line is dropped when, after stripping surrounding whitespace, it
    matches 'pragma solidity ...;' or starts with '/' or '*' (line comments
    and the opening/continuation/closing lines of block comments).
    All other lines are kept unchanged, including their line endings.

    :param file_name: path of the source file to read
    :return: the remaining lines concatenated into one string
    """
    kept_lines = []
    with open(file_name, 'r') as f:
        for line in f:
            stripped = line.strip()
            if re.match(r'pragma solidity .*;', stripped):
                continue
            if re.match(r'/', stripped) or re.match(r'\*', stripped):
                continue
            kept_lines.append(line)
    # every kept line still ends with its own newline, so join with '' --
    # joining with '\n' would insert an empty line after every source line
    return ''.join(kept_lines)
        

def get_filename_for_row(row):
    """Build the full path of the file described by `row`.

    The path is `data_path` followed by the row's 'class', 'company',
    'root' and 'file_name' entries, in that order.
    """
    path_columns = ('class', 'company', 'root', 'file_name')
    return os.path.join(data_path, *[row.loc[column] for column in path_columns])

def get_file_src_as_string(row):
    """Read the file described by `row` and return its full text (comments included)."""
    file_path = get_filename_for_row(row)
    return read_src_as_string(file_path)

def get_file_src_as_string_nocomments(row):
    """Read the file described by `row` and return its text without pragma and comment lines."""
    file_path = get_filename_for_row(row)
    return read_src_as_string_nocomments(file_path)
    

In [81]:
# test comment stripping

# lines = '    /**\n    * @dev Allows the current owner to transfer control of the contract to a newOwner.\n    * @param newOwner The address to transfer ownership to.\n    */\n    function transferOwnership(address newOwner) onlyOwner public {\n        // do not allow self ownership\n        require(newOwner != address(this));\n        super.transferOwnership(newOwner);\n    }'
# lines = lines.split('\n')
# for line in lines:
#     print line
#     print re.match('/', line.strip())
#     print re.match('\*', line.strip())

In [84]:
# test pragma stripping
# line = ' pragma solidity ^0.4.11; '
# print re.match('pragma solidity .*;', line.strip())

In [87]:
# execute src reading (variant that keeps comments; currently disabled)
# df_files['src'] = df_files.apply(get_file_src_as_string, axis=1)

# OR
# execute src reading without comments:
# adds a 'src' column holding, per row, the file content with pragma and comment lines removed
df_files['src'] = df_files.apply(get_file_src_as_string_nocomments, axis=1)

In [88]:
def match_all_contracts_from_src_string(src_string):
    """Find all contract declarations in `src_string`.

    A declaration is any text of the form 'contract <header>{' directly
    followed by a newline. The header is split at ' is ' into the contract
    name and a comma separated list of inherited contracts.

    :return: tuple (contract_names, inherited_contracts); both lists have one
        entry per declaration, the second holds a (possibly empty) list of
        parent names for each contract.
    """
    prefix, suffix = "contract ", "{\n"
    names = []
    parents = []
    for declaration in re.findall("contract .*{\n", src_string):
        header = declaration[len(prefix):-len(suffix)]
        header_parts = header.split(' is ')
        names.append(header_parts[0].strip())
        if len(header_parts) > 1:
            parents.append([parent.strip() for parent in header_parts[1].split(',')])
        else:
            parents.append([])
    return names, parents


In [91]:
def match_contract_name_from_src_string(src_string):
    """Return only the list of contract names declared in `src_string`."""
    contract_names, _inherited = match_all_contracts_from_src_string(src_string)
    return contract_names
    
def match_inherited_contracts_from_src_string(src_string):
    """Return only the per-contract lists of inherited contract names found in `src_string`."""
    _names, inherited_contracts = match_all_contracts_from_src_string(src_string)
    return inherited_contracts

In [92]:
# per file: names of the declared contracts and, per contract, the contracts it inherits from
src_column = df_files['src']
df_files['contract_name'] = src_column.apply(match_contract_name_from_src_string)
df_files['inherited_contracts'] = src_column.apply(match_inherited_contracts_from_src_string)

In [114]:
# df_files

In [128]:
def flatten(x):
    """Concatenate the items of every sub-iterable of `x` into one flat list."""
    flat = []
    for sublist in x:
        flat.extend(sublist)
    return flat

In [115]:
# check files without contract
has_no_contract = df_files['contract_name'].apply(lambda x: len(x) == 0)
files_without_contracts = df_files[has_no_contract]
# single-argument print(...) produces the same text on Python 2 and Python 3
print("files without contract:  {}".format(len(files_without_contracts)))
# files_without_contracts

files without contract:  43


In [130]:
# check how many contracts the files inherit from
# (number of inherited contracts summed over all contracts declared in a file)
num_inherited = df_files['inherited_contracts'].apply(lambda x: len(flatten(x)))
# single-argument print(...) produces the same text on Python 2 and Python 3
print("files 0 inherited contracts:  {}".format((num_inherited == 0).sum()))
print("files 1 inherited contracts:  {}".format((num_inherited == 1).sum()))
# only the "more than one" group is kept under this name
files_with_many_inherited_contracts = df_files[num_inherited > 1]
print("files more than 1 inherited contracts:  {}".format(len(files_with_many_inherited_contracts)))

files 0 inherited contracts:  195
files 1 inherited contracts:  203
files more than 1 inherited contracts:  124


In [131]:
# check files with many (more than 4) inherited contracts
inherits_many = df_files['inherited_contracts'].apply(lambda x: len(flatten(x)) > 4)
files_with_many_inherited_contracts = df_files[inherits_many]
# single-argument print(...) produces the same text on Python 2 and Python 3
print("files many inherited contracts:  {}".format(len(files_with_many_inherited_contracts)))
# files_with_many_inherited_contracts

files many inherited contracts:  21


In [133]:
# compute frequency of inherited files
# first flatten: files -> contracts, second flatten: contracts -> parent names
parents_per_contract = flatten(df_files['inherited_contracts'])
inheritance_flat = flatten(parents_per_contract)
col_counts = pd.Series(inheritance_flat).value_counts(sort=True)
inheritance_csv = os.path.join(out_path, 'counts_{}.csv'.format('inheritance'))
col_counts.to_csv(inheritance_csv)

# TODO: make plot ICO vs notICO

In [94]:
# retrieve imports
def match_imports_from_src_string(src_string):
    """Return the quoted path of every import statement in `src_string`.

    An import statement is recognised when a line starts (after optional
    indentation) with 'import' and contains a single- or double-quoted path
    before the terminating ';'. Extracting the quoted text, instead of
    slicing off a fixed number of characters, also handles forms such as
    `import "x.sol" as X;`, `import {A} from "x.sol";` and statements
    followed by trailing whitespace or '\\r\\n' line endings.

    :param src_string: source code as one string
    :return: list of import paths in order of appearance, without quotes
    """
    import_regex = r"""^[ \t]*import\s[^;'"]*['"]([^'"]*)['"][^;]*;"""
    return re.findall(import_regex, src_string, flags=re.MULTILINE)

def exctract_imports(src_string):
    """Return the file name (text after the last '/') of every import in `src_string`."""
    return [
        import_path.rsplit('/', 1)[-1]
        for import_path in match_imports_from_src_string(src_string)
    ]

def exctract_imports_path(src_string):
    """Return the directory part (text before the last '/') of every import in `src_string`.

    Imports without any '/' yield an empty string.
    """
    return [
        import_path.rpartition('/')[0]
        for import_path in match_imports_from_src_string(src_string)
    ]

In [95]:
# execute: per file, the imported file names and the directories they are imported from
src_column = df_files['src']
df_files['imports'] = src_column.apply(exctract_imports)
df_files['imports_path'] = src_column.apply(exctract_imports_path)

In [96]:
# check files with (many) imports, i.e. more than 5
num_imports = df_files['imports'].apply(len)
files_with_imports = df_files[num_imports > 5]
len(files_with_imports)
# files_with_imports

# TODO: make plot ICO vs notICO

38

In [101]:
# files_with_imports['imports'].values[:3]

In [103]:
# we want to combine imported files with imports
# first check if file_names are unique in a company
# A: no. but only AugurCore and DemeterLife

# identical_filenames = df_files.groupby(['company', 'file_name']).size()
# identical_filenames[identical_filenames > 1]

In [104]:
# look at example duplicate name

# augurcore = df_files[df_files['company'] == 'AugurCore']
# augurcore[augurcore['file_name'] == 'BasicToken.sol']

In [105]:
# compute frequency of imported files
imports_flat = []
for imports in df_files['imports'].values:
    imports_flat.extend(imports)
col_counts = pd.Series(imports_flat).value_counts(sort=True)
imports_csv = os.path.join(out_path, 'counts_{}.csv'.format('imports'))
col_counts.to_csv(imports_csv)

# TODO: make plot ICO vs notICO

In [106]:
# save without src to csv: the source text is replaced by its length
df_files_out = df_files.copy(deep=True)
df_files_out['src'] = df_files['src'].apply(len)
df_files_csv = os.path.join(out_path, 'df_files.csv')
df_files_out.to_csv(df_files_csv)


In [19]:
# match imported files with files in dataset
df_files['ID'] = df_files.index.values
df_files['imports_idx'] = None
df_files['imports_zeppelin'] = False
df_files['contains_zeppelin'] = False

# all zeppelin files for imports from there.
files_zeppelin = df_files[df_files['company'] == 'Zeppelin']

# TODO: add these two, combine files based on inheritance instead of imports
# df_files['inherits_idx'] = None
# df_files['is_inherited'] = False


verbose = True

# TODO: only import file if it contains an inherited contract!
import_only_inherited = True


def _pick_best_root_match(import_file_path, candidate_roots):
    """Return the position in `candidate_roots` that best matches `import_file_path`.

    Candidates are ranked by the number of their '/'-separated components
    that also occur in the import path. If several candidates share the
    highest count, the one among THOSE whose number of components is closest
    to that of the import path wins (first one on a further tie).
    """
    target_parts = import_file_path.split('/')
    target_set = set(target_parts)
    num_joint_roots = [
        sum(1 for r in match_root.split('/') if r in target_set)
        for match_root in candidate_roots
    ]
    max_matches = max(num_joint_roots)
    tied = [pos for pos, m in enumerate(num_joint_roots) if m == max_matches]
    if len(tied) == 1:
        return tied[0]
    # handle tie-tie: compare path lengths of the tied candidates only, so a
    # candidate with fewer shared components can never win the tie-break
    return min(
        tied,
        key=lambda pos: abs(len(target_parts) - len(candidate_roots[pos].split('/')))
    )


# For every file, resolve each of its imports to the index of a file of the
# same company. Sentinels: -2 = not found but the import path mentions
# zeppelin, -1 = not found at all.
for idx in df_files.index.values:
    f = df_files.loc[idx]
    df_files.loc[idx, 'contains_zeppelin'] = 'zeppelin' in f.loc['src'].lower()
    company = f.loc['company']
    files_company = df_files[df_files['company'] == company]
    f_imports = f.loc['imports']
    f_imports_path = f.loc['imports_path']
    imports_idx_list = []
    for import_file_name, import_file_path in zip(f_imports, f_imports_path):
        matching_files = files_company[files_company['file_name'] == import_file_name]
        num_matches = len(matching_files)

        if num_matches == 1:
            imports_idx = matching_files.index.values[0]

        elif num_matches > 1:
            # handle ties: several files of this company share the file name
            candidate_roots = matching_files['root'].values
            better_match = _pick_best_root_match(import_file_path, candidate_roots)
            imports_idx = matching_files.index.values[better_match]

            if verbose:
                # single-argument print(...) gives the same text on Python 2 and 3
                print("import root: {}; matching roots: {}".format(import_file_path, candidate_roots))
                print(import_file_path in candidate_roots)

        elif 'zeppelin' in import_file_path.lower():
            # not found in the company's files, but imported from zeppelin
            imports_idx = -2
            df_files.at[idx, 'imports_zeppelin'] = True

        else:
            # not found
            imports_idx = -1
            if verbose:
                print("no import match for:  {} {}".format(import_file_name, import_file_path))
        imports_idx_list.append(imports_idx)

    df_files.at[idx, 'imports_idx'] = imports_idx_list


In [20]:
# number of files with at least one unresolved import whose path mentions zeppelin
(df_files['imports_zeppelin'] == True).sum()

68

In [21]:
# number of files whose source text mentions zeppelin (case-insensitive)
(df_files['contains_zeppelin'] == True).sum()

93

In [22]:
# per (class, company): number of files importing zeppelin / mentioning zeppelin
grouped_by_company = df_files.groupby(['class', 'company'])
imports_z = grouped_by_company['imports_zeppelin'].sum()
contains_z = grouped_by_company['contains_zeppelin'].sum()
z_presence = pd.DataFrame(imports_z)
z_presence['contains_zeppelin'] = contains_z
z_presence

Unnamed: 0_level_0,Unnamed: 1_level_0,imports_zeppelin,contains_zeppelin
class,company,Unnamed: 2_level_1,Unnamed: 3_level_1
ICO,Aragon,3.0,3.0
ICO,AuthenticID,2.0,2.0
ICO,BATToken,0.0,0.0
ICO,BitClave,7.0,7.0
ICO,BitDegree,1.0,1.0
ICO,Bread,0.0,6.0
ICO,BullToken,8.0,9.0
ICO,DemeterLife,0.0,7.0
ICO,Gameflip,3.0,3.0
ICO,Hoqu,3.0,3.0


In [23]:
# check if is imported
# -> not imported files will be combined with the files they import

# collect every index that appears in any file's resolved import list
imported_idxs = set()
for imports in df_files['imports_idx'].values:
    imported_idxs.update(imports)
df_files['is_imported'] = df_files['ID'].apply(lambda file_id: file_id in imported_idxs)

In [24]:
def get_all_imports_idx(imports_idx, idx_set, depth_list, depth, max_depth, files=None):
    """Collect the indices of all directly and transitively imported files.

    Walks `imports_idx` depth-first. Every index that is not yet in `idx_set`
    is added to it and the current depth is appended to `depth_list`; for
    non-negative indices the walk continues with that file's own
    'imports_idx' entry. Negative indices (unresolved imports) are recorded
    but not followed. `idx_set` and `depth_list` are modified in place.

    :param imports_idx: iterable of file indices imported at this level
    :param idx_set: set of indices seen so far (mutated)
    :param depth_list: depth at which each new index was first seen (mutated)
    :param depth: depth of the caller; this level is depth + 1
    :param max_depth: levels deeper than this are not expanded; a message
        with the current state is printed instead
    :param files: dataframe with an 'imports_idx' column, indexed by file
        index; defaults to the module-level `df_files`
    :return: `idx_set`
    """
    if files is None:
        files = df_files
    depth += 1
    if depth > max_depth:
        # single-argument print(...) gives the same text on Python 2 and 3
        print("max_depth reached for :")
        print(idx_set)
        print(depth_list)
        return idx_set

    for idx in imports_idx:
        if idx in idx_set:
            continue
        idx_set.add(idx)
        depth_list.append(depth)
        if idx >= 0:
            next_imports_idx = files.loc[idx, 'imports_idx']
            if len(next_imports_idx) > 0:
                get_all_imports_idx(next_imports_idx, idx_set, depth_list, depth, max_depth, files)
    # TODO: also import zeppelin files (and their imports)?
    return idx_set

def get_all_imports_idx_from_row(row):
    """Return the set of indices to join for `row`.

    For a file that is not imported by any other file this is its own ID
    plus everything reachable through its imports (at most 20 levels deep).
    Files that are imported elsewhere yield an empty set.
    """
    max_depth = 20
    seen = set([row['ID']])
    if row.loc['is_imported'] != False:
        return set()
    return get_all_imports_idx(row.loc['imports_idx'], seen, [0], 0, max_depth)
    

In [25]:
def get_all_imports_depth(imports_idx, idx_set, depth_list, depth, max_depth, files=None):
    """Return the largest depth recorded while walking all transitive imports.

    Performs a depth-first walk over `imports_idx`: every index not yet in
    `idx_set` is added and the current depth is appended to `depth_list`;
    non-negative indices are followed through that file's 'imports_idx'
    entry, negative ones (unresolved imports) are only recorded.
    `idx_set` and `depth_list` are modified in place.

    :param imports_idx: iterable of file indices imported at this level
    :param idx_set: set of indices seen so far (mutated)
    :param depth_list: depth at which each new index was first seen (mutated)
    :param depth: depth of the caller; this level is depth + 1
    :param max_depth: levels deeper than this are not expanded; a message
        with the current state is printed instead
    :param files: dataframe with an 'imports_idx' column, indexed by file
        index; defaults to the module-level `df_files`
    :return: max(depth_list)
    """
    if files is None:
        files = df_files
    depth += 1
    if depth > max_depth:
        # single-argument print(...) gives the same text on Python 2 and 3
        print("max_depth reached for :")
        print(idx_set)
        print(depth_list)
        return max(depth_list)

    for idx in imports_idx:
        if idx in idx_set:
            continue
        idx_set.add(idx)
        depth_list.append(depth)
        if idx >= 0:
            next_imports_idx = files.loc[idx, 'imports_idx']
            if len(next_imports_idx) > 0:
                # recurse into this function itself; the shared depth_list
                # accumulates the depths of all levels
                get_all_imports_depth(next_imports_idx, idx_set, depth_list, depth, max_depth, files)
    # TODO: also import zeppelin files (and their imports)?
    return max(depth_list)

def get_all_imports_depth_from_row(row):
    """Return the import depth for `row`.

    For a file that is not imported by any other file this is the largest
    depth reached while following its imports (at most 20 levels deep).
    Files that are imported elsewhere yield -1.
    """
    max_depth = 20
    seen = set([row['ID']])
    if row.loc['is_imported'] != False:
        return -1
    return get_all_imports_depth(row.loc['imports_idx'], seen, [0], 0, max_depth)
    

In [26]:
# TODO: also import zeppelin files?
# per file: list of all indices to join, and the depth of its import tree
imports_idx_sets = df_files.apply(get_all_imports_idx_from_row, axis=1)
df_files['imports_idx_all'] = imports_idx_sets.apply(list)
df_files['imports_depth'] = df_files.apply(get_all_imports_depth_from_row, axis=1)

In [27]:
# keep only the files that are not imported by another file (depth -1 marks imported files)
has_valid_depth = df_files['imports_depth'] >= 0
df_joined = df_files[has_valid_depth]

In [28]:
# number of files in total vs. number of files that are joined with their imports
# (single-argument print(...) gives the same text on Python 2 and 3)
print("{} {}".format(len(df_files), len(df_joined)))

522 198


In [29]:
def join_imports(row):
    """Combine a file with all files listed in its 'imports_idx_all' entry.

    For every non-negative index in row['imports_idx_all'] the file name,
    the components of its root and its source are looked up in the
    module-level `df_files`. The row gets the new entries 'joined_files' and
    'joined_roots', its 'src' is replaced by the collected sources joined
    with newlines, and its zeppelin flags are set to True if any of the
    joined files has them set. The entries 'imports', 'imports_idx',
    'imports_path' and 'is_imported' are removed.

    :param row: one row of the files dataframe (modified in place)
    :return: the modified row
    """
    # start with empty accumulators; note that this discards the row's own 'src'
    row.loc['joined_files'] = []
    row.loc['joined_roots'] = []
    row.loc['src'] = []
    for idx in row.loc['imports_idx_all']:
        # negative indices are skipped
        if idx >= 0:
            row.loc['joined_files'].append(df_files.loc[idx, 'file_name'])
            row.loc['joined_roots'].extend(df_files.loc[idx, 'root'].split('/'))
            row.loc['src'].append(df_files.loc[idx, 'src'])
            # a joined file is flagged as soon as one of its parts is flagged
            if df_files.loc[idx, 'imports_zeppelin'] == True:
                row.loc['imports_zeppelin'] = True
            if df_files.loc[idx, 'contains_zeppelin'] == True:
                row.loc['contains_zeppelin'] = True
                
    row.loc['src'] = "\n".join(row.loc['src'])
    # de-duplicate the root components (order is not preserved by set) ...
    row.loc['joined_roots'] = list(set(row.loc['joined_roots'] ))
    # ... and drop empty and relative-path components
    row.loc['joined_roots'] = [x for x in row.loc['joined_roots'] if x not in ['', '.', '..']]
    # remove the per-file import entries from the joined row
    del row['imports']
    del row['imports_idx']
    del row['imports_path']
    del row['is_imported']
    return row

In [30]:
# replace every remaining row by its joined version (sources of all files in imports_idx_all combined)
df_joined = df_joined.apply(join_imports, axis=1)

In [31]:
# display the joined dataframe
df_joined

Unnamed: 0,file_name,root,class,company,src,contract_name,inherited_contracts,ID,imports_zeppelin,contains_zeppelin,imports_idx_all,imports_depth,joined_files,joined_roots
2,Migrations.sol,,notICO,AceBusters,pragma solidity ^0.4.11;\n\ncontract Migration...,[Migrations],[[]],2,False,False,[2],0,[Migrations.sol],[]
11,UpgradeEvent.sol,policies,notICO,AceBusters,pragma solidity ^0.4.11;\n\ncontract PullPayIn...,[UpgradeEvent],[[]],11,False,True,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17...",6,"[PullPayInterface.sol, ERC20Basic.sol, ERC223B...","[controller, ownership, satelites, policies]"
13,PowerEventReplacement.sol,policies,notICO,AceBusters,pragma solidity ^0.4.11;\n\ncontract PullPayIn...,[PowerEventReplacement],[[PowerEvent]],13,False,True,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 15, 16...",7,"[PullPayInterface.sol, ERC20Basic.sol, ERC223B...","[controller, ownership, satelites, policies]"
14,UpgradeEventCompact.sol,policies,notICO,AceBusters,pragma solidity ^0.4.11;\n\ncontract PullPayIn...,[UpgradeEventCompact],[[]],14,False,True,"[0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 14, 15, 16, 17...",6,"[PullPayInterface.sol, ERC20Basic.sol, ERC223B...","[controller, ownership, satelites, policies]"
24,Migrations.sol,,notICO,GnosisMarkets,pragma solidity ^0.4.4;\n\ncontract Migrations...,[Migrations],[[]],24,False,False,[24],0,[Migrations.sol],[]
29,CampaignFactory.sol,Markets,notICO,GnosisMarkets,pragma solidity 0.4.15;\n\n\n/// @title Math l...,[CampaignFactory],[[]],29,False,False,"[32, 34, 44, 48, 49, 51, 53, 26, 27, 28, 29, 31]",5,"[Math.sol, MarketMaker.sol, Oracle.sol, Standa...","[Utils, Oracles, Tokens, Markets, Events, Mark..."
33,LMSRMarketMaker.sol,MarketMakers,notICO,GnosisMarkets,pragma solidity 0.4.15;\n\n\n/// @title Math l...,[LMSRMarketMaker],[[MarketMaker]],33,False,False,"[32, 33, 34, 44, 48, 49, 51, 53, 28]",5,"[Math.sol, LMSRMarketMaker.sol, MarketMaker.so...","[Utils, Oracles, Tokens, Markets, Events, Mark..."
36,MajorityOracleFactory.sol,Oracles,notICO,GnosisMarkets,pragma solidity 0.4.15;\n\n\n/// @title Abstra...,[MajorityOracleFactory],[[]],36,False,False,"[44, 36, 39]",2,"[Oracle.sol, MajorityOracleFactory.sol, Majori...",[Oracles]
37,UltimateOracleFactory.sol,Oracles,notICO,GnosisMarkets,pragma solidity 0.4.15;\n\n\n/// @title Math l...,[UltimateOracleFactory],[[]],37,False,False,"[32, 51, 35, 44, 37]",2,"[Math.sol, Token.sol, UltimateOracle.sol, Orac...","[Tokens, Utils, Oracles]"
40,SignedMessageOracleFactory.sol,Oracles,notICO,GnosisMarkets,"pragma solidity 0.4.15;\nimport ""../Oracles/Si...",[SignedMessageOracleFactory],[[]],40,False,False,"[40, 44, 47]",2,"[SignedMessageOracleFactory.sol, Oracle.sol, S...",[Oracles]


In [32]:
# save without src to csv: the source text is replaced by its length
df_files_out = df_files.copy(deep=True)
df_files_out['src'] = df_files['src'].apply(len)
df_files_csv = os.path.join(out_path, 'df_files.csv')
df_files_out.to_csv(df_files_csv)

In [33]:
# save the joined files without src to csv: the source text is replaced by its length
df_joined_out = df_joined.copy(deep=True)
df_joined_out['src'] = df_joined['src'].apply(len)
df_joined_csv = os.path.join(out_path, 'df_joined.csv')
df_joined_out.to_csv(df_joined_csv)

In [34]:
def save_joined_src(row):
    """Write the joined source of `row` to <out_path>/joined/<class>/<company>/.

    The file is named 'joined_<root>_<file_name>', where <root> consists of
    the row's root components joined with '_' (empty, '.' and '..'
    components are left out; without any component the name is
    'joined_<file_name>'). Missing directories below `out_path` are created.
    """
    skipped_parts = ['', '.', '..']
    root_parts = [part for part in row.loc['root'].split('/') if part not in skipped_parts]
    root_prefix = "_".join(root_parts)
    if len(root_prefix) > 0:
        root_prefix = "{}_".format(root_prefix)
    file_name = "joined_{}{}".format(root_prefix, row.loc['file_name'])

    # create joined/<class>/<company> below out_path, one level at a time
    target_dir = out_path
    for sub_dir in ('joined', row.loc['class'], row.loc['company']):
        target_dir = os.path.join(target_dir, sub_dir)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

    with open(os.path.join(target_dir, file_name), 'w') as out_file:
        out_file.write(row.loc['src'])

In [35]:
# write every joined source to disk; the result of apply is assigned to `_` so the cell shows no output
_ = df_joined.apply(save_joined_src, axis=1)