In [1]:
# imports
import os
import io
import re
# import tokenize
# import json
# import numpy as np
import pandas as pd
import stringdist

In [2]:
# paths
data_path =  '/home/ourownstory/Documents/SOL/data/'
# zeppelin_folder = '/home/ourownstory/Documents/SOL/data/Zeppelin/token/'
# os.listdir(data_path)
# os.listdir(zeppelin_folder)
out_path = '/home/ourownstory/Documents/SOL/derived/'

In [3]:
# create a pandas dataframe with the filenames in our dataset

df_files_dict = {
    'root': [],
    'file_name': [],
}
for root, subdirs, df_files in os.walk(data_path):
    for file_name in df_files:
        df_files_dict['root'].append(root[len(data_path):])
        df_files_dict['file_name'].append(file_name)
        
df_files = pd.DataFrame.from_dict(df_files_dict)
root_list = df_files['root'].values
root_list = [root.split('/') for root in root_list]
df_files['class'] = [r.pop(0) for r in root_list]
df_files['company'] = [r.pop(0) for r in root_list]
df_files['root'] = ["/".join(r) for r in root_list]
file_name_list = df_files['file_name'].values
df_files['extension'] = [e.split('.')[-1] for e in file_name_list]

# filter all files that are not .sol
df_files = df_files[df_files.pop('extension') == 'sol']

df_files.to_csv(os.path.join(out_path, 'df_files.csv'))


In [4]:
# short analysis: count unique counts of entries in each column
# save to csv

unique_counts = {}
for col in df_files:
    col_counts = pd.Series.value_counts(df_files[col], sort=True)
    unique_counts[col] = col_counts.to_dict()
    col_counts.to_csv(os.path.join(out_path, 'counts_{}.csv'.format(col)))
    
# TODO: make plot ICO vs notICO

In [5]:
# helper func to get file-path

def get_filename_for_row(row):
    filename = os.path.join(
        data_path, 
        row.loc['class'], 
        row.loc['company'], 
        row.loc['root'], 
        row.loc['file_name']
    )
    return filename

In [6]:
# read in contents of files as string

# including comments
def read_src(file_name):
    with open(file_name, 'r') as f:
        return f.read()

# strips pragma statements and comments
def read_src_nocomments(file_name, return_only_comments=False):
    with open(file_name, 'r') as f:
        src_list = []
        comments_list = []
        for line in f.readlines():
            # skip pragma
            if re.match('pragma solidity .*;', line.strip()):
                continue
            # single or multiline comments
            if re.match('/', line.strip()) or re.match('\*', line.strip()):
                comments_list.append(line)
                continue
            # inline comments
            if re.search('//', line):
                inline_comment = re.findall('[\s]*//.*\n', line)
                comments_list.extend(inline_comment)
                line = re.sub('[\s]*//.*\n', '\n', line)
                continue
                
            # add current src line
            src_list.append(line)
            
        if return_only_comments:
            return ''.join(comments_list)
        else:
            return ''.join(src_list)
    
# strips pragma statements and comments
def read_src_only_comments(file_name):
    with open(file_name, 'r') as f:
# TODO
        return ''.join(src_list)

def get_file_src(row):
    return read_src(get_filename_for_row(row))

def get_file_src_nocomments(row):
    return read_src_nocomments(get_filename_for_row(row))

def get_file_src_only_comments(row):
    return read_src_nocomments(get_filename_for_row(row), return_only_comments=True)
    

In [7]:
# test comment stripping

# lines = '    /**\n    * @dev Allows the current owner to transfer control of the contract to a newOwner.\n    * @param newOwner The address to transfer ownership to.\n    */\n    function transferOwnership(address newOwner) onlyOwner public {\n        // do not allow self ownership\n        require(newOwner != address(this));\n        super.transferOwnership(newOwner);\n    }'
# lines = "string public constant name    = 'VeChain Token';  //The Token's name\n uint8 public constant decimals = 18; \n   //Number of decimals of the smallest unit\n string public constant symbol  = 'VEN;            //An identifier  "  

# lines = lines.split('\n')
# for line in lines:
#     print line
#     print re.match('//', line.strip())
#     print re.match('\*', line.strip())

In [8]:
# test pragma stripping
# line = ' pragma solidity ^0.4.11; '
# print re.match('pragma solidity .*;', line.strip())

In [9]:
# execute src reading
# df_files['src'] = df_files.apply(get_file_src, axis=1)

# OR
# execute src reading without comments
df_files['src'] = df_files.apply(get_file_src_nocomments, axis=1)
df_files['comments'] = df_files.apply(get_file_src_only_comments, axis=1)

In [10]:
def match_all_contracts_from_src_string(src_string):
    regex = "contract .*{\n"
    matches = re.findall(regex, src_string)
    contract_names = []
    inherited_contracts = []
    for match in matches:
        match = match[len("contract "):-len("{\n")]
        match_type = match.split(' is ')
        contract_names.append(match_type[0].strip())
        if len(match_type) > 1:
            inherited_contracts.append([x.strip() for x in match_type[1].split(',')])
        else:
            inherited_contracts.append([])
    return contract_names, inherited_contracts


In [11]:
def match_contract_name_from_src_string(src_string):
    return match_all_contracts_from_src_string(src_string)[0]
    
def match_inherited_contracts_from_src_string(src_string):
    return match_all_contracts_from_src_string(src_string)[1]

In [12]:
df_files['contract_name'] = df_files['src'].apply(match_contract_name_from_src_string)
df_files['inherited_contracts'] = df_files['src'].apply(match_inherited_contracts_from_src_string)

In [13]:
# df_files

In [14]:
def flatten(x):
    return [item for sublist in x for item in sublist]

In [15]:
# check files without contract
files_without_contracts = df_files[df_files['contract_name'].apply(lambda x: len(x) == 0)]
print "files without contract: ", len(files_without_contracts)
# files_without_contracts

files without contract:  43


In [16]:
# check files many inherited contracts
files_with_many_inherited_contracts = df_files[df_files['inherited_contracts'].apply(lambda x: len(flatten(x)) == 0)]
print "files 0 inherited contracts: ", len(files_with_many_inherited_contracts)
files_with_many_inherited_contracts = df_files[df_files['inherited_contracts'].apply(lambda x: len(flatten(x)) == 1)]
print "files 1 inherited contracts: ", len(files_with_many_inherited_contracts)
files_with_many_inherited_contracts = df_files[df_files['inherited_contracts'].apply(lambda x: len(flatten(x)) > 1)]
print "files more than 1 inherited contracts: ", len(files_with_many_inherited_contracts)

files 0 inherited contracts:  195
files 1 inherited contracts:  203
files more than 1 inherited contracts:  124


In [17]:
# check files many  inherited contracts
files_with_many_inherited_contracts = df_files[df_files['inherited_contracts'].apply(lambda x: len(flatten(x)) > 4)]
print "files many inherited contracts: ", len(files_with_many_inherited_contracts)
# files_with_many_inherited_contracts

files many inherited contracts:  21


In [18]:
# compute frequency of inherited files
inheritance_flat = flatten(flatten(df_files['inherited_contracts']))
col_counts = pd.Series(inheritance_flat).value_counts(sort=True)
col_counts.to_csv(os.path.join(out_path, 'counts_{}.csv'.format('inheritance')))

# TODO: make plot ICO vs notICO

In [19]:
# retrieve imports
def match_imports_from_src_string(src_string):
    regex = "import .*;\n"
    matches = re.findall(regex, src_string)
    matches_cleaned = [m[len("import '"):-len("';\n")] for m in matches]
    return matches_cleaned

def exctract_imports(src_string):
    matches_cleaned = match_imports_from_src_string(src_string)
    imports =  [m.split('/')[-1] for m in matches_cleaned]
    return imports

def exctract_imports_path(src_string):
    matches_cleaned = match_imports_from_src_string(src_string)
    imports_path = []
    for m in matches_cleaned:
        cleaned_imports_path = [x for x in m.split('/')[:-1] if x not in [".", "..", " ",""]]
        imports_path.append("/".join(cleaned_imports_path))
    return imports_path

In [20]:
# execute
df_files['imports'] = df_files['src'].apply(exctract_imports)
df_files['imports_path'] = df_files['src'].apply(exctract_imports_path)

In [21]:
# check files with (many) imports
files_with_imports = df_files[df_files['imports'].apply(lambda x: len(x) > 5)]
len(files_with_imports)
# files_with_imports

# TODO: make plot ICO vs notICO

38

In [22]:
# files_with_imports['imports'].values[:3]

In [23]:
# we want to combine imported files with imports
# first check if file_names are unique in a company
# A: no. but only AugurCore and DemeterLife

# identical_filenames = df_files.groupby(['company', 'file_name']).size()
# identical_filenames[identical_filenames > 1]

In [24]:
# look at example duplicate name

# augurcore = df_files[df_files['company'] == 'AugurCore']
# augurcore[augurcore['file_name'] == 'BasicToken.sol']

In [25]:
# compute frequency of imported files
imports_flat = [item for imports in df_files['imports'].values for item in imports]
col_counts = pd.Series(imports_flat).value_counts(sort=True)
col_counts.to_csv(os.path.join(out_path, 'counts_{}.csv'.format('imports')))

# TODO: make plot ICO vs notICO

In [26]:
# save without src to csv
df_files_out = df_files.copy(deep=True)
df_files_out['src'] = df_files['src'].apply(lambda x: len(x))
df_files_out.to_csv(os.path.join(out_path, 'df_files.csv'))


In [27]:
# df_files

In [28]:
def import_contains_inherited_contract(import_match, inherited_contracts):
    import_match_contracts = flatten(import_match['contract_name'])
    inherited_contracts = flatten(flatten(inherited_contracts))
    is_contained = [x in import_match_contracts for x in inherited_contracts]
    return any(is_contained)

In [29]:
# test import_contains_inherited_contract

# A = pd.DataFrame.from_dict({'contract_name': [['a', 'b']]})
# import_contains_inherited_contract(A, [['a']])

In [30]:
# match imported files with files in dataset
df_files['ID'] = df_files.index.values
df_files['imports_idx'] = None
df_files['imports_zeppelin'] = False
# df_files['contains_zeppelin'] = False

# all zeppelin files for imports from there.
files_zeppelin = df_files[df_files['company'] == 'Zeppelin']

# TODO: add these two, combine files based on inheritance instead of imports
# df_files['inherits_idx'] = None
# df_files['is_inherited'] = False


verbose = False

# only import file if it contains an inherited contract!
import_only_inherited = True

# allow imports from zeppelin
allow_zeppelin = True


for idx in df_files.index.values:
    f = df_files.loc[idx]
#     df_files.loc[idx, 'contains_zeppelin'] = 'zeppelin' in f.loc['src'].lower()
    company = f.loc['company']
    files_company = df_files[df_files['company'] == company]
    f_imports = f.loc['imports']
    f_imports_path = f.loc['imports_path']
    f_inherited = flatten(f.loc['inherited_contracts'])

#     if len(f_imports) < 1:
#         continue
    imports_idx_list = []
    for import_file_name, import_file_path in zip(f_imports, f_imports_path):
        matching_files = files_company[files_company['file_name'] == import_file_name]
        
        # check if is importing from zeppelin, as these often in other folder
        if 'zeppelin' in import_file_path.lower():
                df_files.at[idx, 'imports_zeppelin'] = True
                if allow_zeppelin and company.lower() != 'zeppelin':
                    matching_files = matching_files.append(files_zeppelin[files_zeppelin['file_name'] == import_file_name])
        
        if len(matching_files) == 1:
            imports_idx = matching_files.index.values[0]
            
        elif len(matching_files) > 1:
            # handle ties
            target_set = set(import_file_path.split('/'))
            num_joint_roots = []
            for match_root in matching_files['root'].values:
                joint_roots = [1 for r in match_root.split('/') if r in target_set]
                num_joint_roots.append(sum(joint_roots))
            max_matches = max(num_joint_roots)
            if sum(max_matches == m for m in num_joint_roots) > 1:
                # handle tie-tie
                root_len_diff = [abs(len(import_file_path.split('/')) - len(r.split('/'))) 
                                 for r in matching_files['root'].values]
                better_match = root_len_diff.index(min(root_len_diff)) 
            else:
                better_match = num_joint_roots.index(max_matches)      
            imports_idx = matching_files.index.values[better_match]
            
            if verbose:
                print "import root: {}; matching roots: {}".format(import_file_path, matching_files['root'].values)
                print "has match: ", import_file_path in matching_files['root'].values
            
        elif len(matching_files) < 1:
            imports_idx = -1
            if verbose:
                print "no import match for: ", import_file_name, import_file_path
        
        # check if the import-match also contains an inherited contract
        if import_only_inherited and imports_idx >= 0:
            if not import_contains_inherited_contract(df_files.loc[imports_idx], f_inherited):
                if verbose:
                    print "import: ", df_files.loc[imports_idx]['contract_name'], "not contain inherited contract: ", f_inherited
                imports_idx = -3
            
            
        imports_idx_list.append(imports_idx)
    
    df_files.at[idx, 'imports_idx'] = imports_idx_list


In [31]:
sum(df_files['imports_zeppelin'] == True)

81

In [32]:
# sum(df_files['contains_zeppelin'] == True)

In [33]:
# how often is zeppelin imported?

imports_z = df_files.groupby(['class', 'company'])['imports_zeppelin'].sum()
# contains_z = df_files.groupby(['class', 'company'])['contains_zeppelin'].sum()
z_presence = pd.DataFrame(pd.Series(imports_z))
# z_presence['contains_zeppelin'] = contains_z

# z_presence

In [34]:
# check if is imported 
# -> not imported files will be combined with the files they import

imported_idxs = set([item for imports in df_files['imports_idx'].values for item in imports])
df_files['is_imported'] = df_files['ID'].apply(lambda x: x in imported_idxs)

In [35]:
# join all or just final tree nodes?
join_all = True
max_depth = 10

In [36]:
def get_all_imports_idx(imports_idx, idx_set, depth_list, depth, max_depth):
#     print 'imports_idx: ', imports_idx
#     print 'idx_set: ', idx_set
#     print 'depth: ', depth
#     print 'depth_list: ', depth_list
    depth += 1
    if depth <= max_depth:
        for idx in imports_idx:
    #         print 'idx: ', idx
            if idx not in idx_set:
                idx_set.add(idx)
                depth_list.append(depth)
                if idx >= 0:
                    next_imports_idx = df_files.loc[idx, 'imports_idx']
                    if len(next_imports_idx) > 0:
            #             print 'next_imports_idx: ', next_imports_idx
                        _ = get_all_imports_idx(next_imports_idx, idx_set, depth_list, depth, max_depth)

    else:
        print "max_depth reached for :"
        print idx_set
        print depth_list
    # TODO: also import zeppelin files (and their imports)?
#     print 'reached return: ', idx_list
#     print 'depth: ', depth, depth_list
    return idx_set

def get_all_imports_idx_from_row(row):
    idx_set = set([row['ID']])
    depth_list = [0]
    depth = 0
#     print row
    if join_all or not row.loc['is_imported']:
        return  get_all_imports_idx(row.loc['imports_idx'], idx_set, depth_list, depth, max_depth)
    else: 
        return set()
    

In [37]:
def get_all_imports_depth(imports_idx, idx_set, depth_list, depth, max_depth):
#     print 'imports_idx: ', imports_idx
#     print 'idx_set: ', idx_set
#     print 'depth: ', depth
#     print 'depth_list: ', depth_list
    depth += 1
    if depth <= max_depth:
        for idx in imports_idx:
    #         print 'idx: ', idx
            if idx not in idx_set:
                idx_set.add(idx)
                depth_list.append(depth)
                if idx >= 0:
                    next_imports_idx = df_files.loc[idx, 'imports_idx']
                    if len(next_imports_idx) > 0:
            #             print 'next_imports_idx: ', next_imports_idx
                        _ = get_all_imports_idx(next_imports_idx, idx_set, depth_list, depth, max_depth)

    else:
        print "max_depth reached for :"
        print idx_set
        print depth_list
    # TODO: also import zeppelin files (and their imports)?
#     print 'reached return: ', idx_list
#     print 'depth: ', depth, depth_list
    return max(depth_list)

def get_all_imports_depth_from_row(row):
    idx_set = set([row['ID']])
    depth_list = [0]
    depth = 0
#     print row
    if join_all or row.loc['is_imported'] == False:
        return  get_all_imports_depth(row.loc['imports_idx'], idx_set, depth_list, depth, max_depth)
    else: 
        return -1
    

In [38]:
df_files['imports_idx_all'] = df_files.apply(get_all_imports_idx_from_row, axis=1)
df_files['imports_idx_all'] = df_files['imports_idx_all'].apply(list)
df_files['imports_depth'] = df_files.apply(get_all_imports_depth_from_row, axis=1)

In [39]:
# remove import lines from src

def remove_import_lines(src):
#     print src
    regex = "import .*;"
    src = "\n".join([x for x in src.split("\n") if not re.match(regex, x)])  
#     print src
    return src   

In [40]:
# remove_import_lines(df_joined.loc[4, 'src'])
df_files['src'] = df_files['src'].apply(remove_import_lines)

In [41]:
def join_imports(row):
    row.loc['joined_files'] = []
    row.loc['joined_contracts'] = []
    row.loc['joined_roots'] = []
    src_joined = []
    comments_joined = []
    for idx in row.loc['imports_idx_all']:
        if idx >= 0:
            row.loc['joined_files'].append(df_files.loc[idx, 'file_name'])
            row.loc['joined_contracts'].extend(df_files.loc[idx, 'contract_name'])
            row.loc['joined_roots'].extend(df_files.loc[idx, 'root'].split('/'))
            src_joined.append(df_files.loc[idx, 'src'])
            comments_joined.append(df_files.loc[idx, 'comments'])
            if df_files.loc[idx, 'imports_zeppelin'] == True:
                row.loc['imports_zeppelin'] = True
#             if df_files.loc[idx, 'contains_zeppelin'] == True:
#                 row.loc['contains_zeppelin'] = True
    row.loc['src'] = "\n".join(src_joined)
    row.loc['comments'] = "\n".join(comments_joined)
    row.loc['joined_contracts'] = list(set(row.loc['joined_contracts']))
    row.loc['joined_roots'] = list(set(row.loc['joined_roots'] ))
    row.loc['joined_roots'] = [x for x in row.loc['joined_roots'] if x not in ['', '.', '..']]
    del row['imports']
    del row['imports_idx']
    del row['imports_path']
    del row['inherited_contracts']
#     del row['is_imported']
    return row

In [42]:
df_joined = df_files.apply(join_imports, axis=1)

In [43]:
# have a look at import dephts, potentially filter out not joined ones
df_joined = df_joined[df_joined['imports_depth'] >= 0]
print len(df_files), len(df_joined)

522 522


In [44]:
# df_joined

In [45]:
# save without src and comments to csv
df_files_out = df_files.copy(deep=True)
df_files_out['src'] = df_files['src'].apply(lambda x: len(x))
df_files_out['comments'] = df_files['comments'].apply(lambda x: len(x))
df_files_out.to_csv(os.path.join(out_path, 'df_files.csv'))

In [46]:
df_joined_out = df_joined.copy(deep=True)
df_joined_out['src'] = df_joined['src'].apply(lambda x: len(x))
df_joined_out['comments'] = df_joined['comments'].apply(lambda x: len(x))
df_joined_out.to_csv(os.path.join(out_path, 'df_joined.csv'))

In [47]:
def save_joined_src(row):
    root_name = "_".join([x for x in row.loc['root'].split('/') if x not in ['', '.', '..']])
    root_name = "{}_".format(root_name) if len(root_name) > 0 else root_name
    file_name = "{}{}".format(root_name, row.loc['file_name'])
    file_name_comments = "comments_{}{}".format(root_name, row.loc['file_name'])
    class_name = row.loc['class']
    company_name = row.loc['company']
    
    if not os.path.exists(os.path.join(out_path, 'joined')):
        os.mkdir(os.path.join(out_path, 'joined'))
    if not os.path.exists(os.path.join(out_path, 'joined', class_name)):
        os.mkdir(os.path.join(out_path, 'joined', class_name))
    if not os.path.exists(os.path.join(out_path, 'joined', class_name, company_name)):
        os.mkdir(os.path.join(out_path, 'joined', class_name, company_name))

    with open(os.path.join(out_path, 'joined', class_name, company_name, file_name), 'w')as f:
        f.write(row.loc['src'])
    with open(os.path.join(out_path, 'joined', class_name, company_name, file_name_comments), 'w')as f:
        f.write(row.loc['comments'])

In [48]:
_ = df_joined.apply(save_joined_src, axis=1)

In [49]:
# detect any list of strings in  ['named', 'contains', 'comments']

def detect_strings(row, strings, case_sensitive=False):
    for to_find in strings:
        names_mix = []
        names_mix.extend(row['joined_files'])
        names_mix.extend(row['joined_contracts'])
        names_mix.extend(row['joined_roots'])
        if case_sensitive:
            row['named_{}'.format(to_find)] = any([to_find in x.lower() for x in names_mix])
            row['contains_{}'.format(to_find)] = to_find in row['src'].lower()
            row['comments_{}'.format(to_find)] = to_find in row['comments'].lower()
        else:
            row['named_{}'.format(to_find)] = any([to_find in x for x in names_mix])
            row['contains_{}'.format(to_find)] = to_find in row['src']
            row['comments_{}'.format(to_find)] = to_find in row['comments']        
    return row

def detect_crowdsale_presale_ICO(row):
    row = detect_strings(row, strings=['crowdsale', 'presale'])
    return detect_strings(row, strings=['ICO'], case_sensitive=True)

def detect_coin_token(row):
    strings = ['coin', 'token']
    return detect_strings(row, strings)

# run
df_joined = df_joined.apply(detect_crowdsale_presale_ICO, axis=1)
df_joined = df_joined.apply(detect_coin_token, axis=1)

In [50]:
# df_joined[df_joined['contains_ICO']]
# df_joined[df_joined['contains_crowdsale'] != df_joined['named_crowdsale']]
# df_joined[df_joined['contains_ICO'] != df_joined['named_ICO']]

In [51]:
# ICO indicator counts
indicator_list =["{}_{}".format(a, b) for a in ['named', 'contains', 'comments'] for b in ['crowdsale', 'presale', 'ICO']]
df_joined['ICO_indications'] = df_joined[indicator_list].sum(axis=1)

# ICO indicator counts for each company
indicator_list.append('ICO_indications')
ICO_indicators = df_joined.groupby(['class', 'company'])[indicator_list].sum()
ICO_indicators

Unnamed: 0_level_0,Unnamed: 1_level_0,named_crowdsale,named_presale,named_ICO,contains_crowdsale,contains_presale,contains_ICO,comments_crowdsale,comments_presale,comments_ICO,ICO_indications
class,company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ICO,Aragon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
ICO,AuthenticID,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
ICO,BATToken,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
ICO,BitClave,4.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,10
ICO,BitDegree,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
ICO,Bread,0.0,0.0,0.0,2.0,0.0,0.0,5.0,1.0,0.0,8
ICO,BullToken,3.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,9
ICO,DemeterLife,8.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,18
ICO,Gameflip,4.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,9
ICO,Hoqu,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4


In [52]:
# df_joined[df_joined['company'] == 'Status']

In [53]:
# Token indicator counts
indicator_list =["{}_{}".format(a, b) for a in ['named', 'contains', 'comments'] for b in ['token', 'coin']]
df_joined['token_indications'] = df_joined[indicator_list].sum(axis=1)

# Token indicator counts for each company
indicator_list.append('token_indications')
token_indicators = df_joined.groupby(['class', 'company'])[indicator_list].sum()
token_indicators

Unnamed: 0_level_0,Unnamed: 1_level_0,named_token,named_coin,contains_token,contains_coin,comments_token,comments_coin,token_indications
class,company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ICO,Aragon,3.0,0.0,7.0,0.0,7.0,0.0,17
ICO,AuthenticID,2.0,0.0,1.0,0.0,2.0,0.0,5
ICO,BATToken,0.0,0.0,1.0,0.0,3.0,0.0,4
ICO,BitClave,4.0,0.0,4.0,0.0,7.0,0.0,15
ICO,BitDegree,2.0,0.0,1.0,0.0,2.0,0.0,5
ICO,Bread,0.0,0.0,4.0,0.0,9.0,0.0,13
ICO,BullToken,2.0,0.0,5.0,0.0,6.0,0.0,13
ICO,DemeterLife,7.0,0.0,9.0,0.0,16.0,0.0,32
ICO,Gameflip,6.0,0.0,6.0,0.0,6.0,0.0,18
ICO,Hoqu,5.0,0.0,4.0,0.0,5.0,0.0,14


In [54]:
df_joined[df_joined['token_indications'] > 2]

Unnamed: 0,file_name,root,class,company,src,comments,contract_name,ID,imports_zeppelin,is_imported,...,contains_ICO,comments_ICO,named_coin,contains_coin,comments_coin,named_token,contains_token,comments_token,ICO_indications,token_indications
68,EtherToken.sol,common,notICO,AragonCore,\n\ncontract EtherToken is ERC677Token {\n ...,// fails if no balance\n\n\n\n/**\n * @title ...,[EtherToken],68,True,False,...,False,False,False,False,False,True,True,True,0,3
70,ERC677Token.sol,common/erc677,notICO,AragonCore,\n\ncontract ERC677Token is StandardToken {\n ...,\n\n/**\n * @title Standard ERC20 token\n *\n ...,[ERC677Token],70,True,True,...,False,False,False,False,False,True,True,True,0,3
104,ITime.sol,,notICO,AugurCore,\n\ncontract ITyped {\n function getTypeNam...,\n/**\n * Contracts inheriting from Extractabl...,[ITime],104,False,True,...,False,False,False,False,False,True,True,True,0,3
105,Augur.sol,,notICO,AugurCore,\n\ncontract ITyped {\n function getTypeNam...,\n/**\n * Contracts inheriting from Extractabl...,[Augur],105,False,True,...,False,False,False,False,False,True,True,True,0,3
107,Controlled.sol,,notICO,AugurCore,\n\n\ncontract ERC20Basic {\n event Transfe...,/**\n * @title ERC20Basic\n * @dev Simpler ver...,[Controlled],107,False,True,...,False,False,False,False,False,True,True,True,0,3
108,TimeControlled.sol,,notICO,AugurCore,\n\ncontract ITyped {\n function getTypeNam...,\n/**\n * Contracts inheriting from Extractabl...,[TimeControlled],108,False,False,...,False,False,False,False,False,True,True,True,0,3
110,Time.sol,,notICO,AugurCore,\n\ncontract ITyped {\n function getTypeNam...,\n/**\n * Contracts inheriting from Extractabl...,[Time],110,False,False,...,False,False,False,False,False,True,True,True,0,3
112,Controller.sol,,notICO,AugurCore,\n\ncontract ITyped {\n function getTypeNam...,\n/**\n * Contracts inheriting from Extractabl...,[Controller],112,False,False,...,False,False,False,False,False,True,True,True,0,3
134,Extractable.sol,libraries,notICO,AugurCore,\n\n\n\ncontract Extractable is Controlled {\n...,/**\n * Contracts inheriting from Extractable ...,[Extractable],134,False,True,...,False,False,False,False,False,True,True,True,0,3
136,MarketValidator.sol,libraries,notICO,AugurCore,"\n\n\ncontract IFeeToken is ERC20, Initializab...",\n\n\n\n\n\n\n // Transfer tokens to ta...,[MarketValidator],136,False,True,...,False,False,False,False,False,True,True,True,0,3


In [55]:
df_joined.loc[369]

file_name                                        ClaimableCrowdsale.sol
root                                                                   
class                                                               ICO
company                                                            Hoqu
src                   \n\ncontract ERC20Basic {\n  uint256 public to...
comments              /**\n * @title ERC20Basic\n * @dev Simpler ver...
contract_name                                      [ClaimableCrowdsale]
ID                                                                  369
imports_zeppelin                                                   True
is_imported                                                        True
imports_idx_all            [272, 267, 269, 240, 369, 277, 374, -3, 286]
imports_depth                                                         4
joined_files          [ERC20Basic.sol, StandardToken.sol, BasicToken...
joined_contracts      [ClaimableCrowdsale, HoQuToken, BasicToken

In [102]:
# df_joined[df_joined['file_name'].str.match('Crowd')]
df_joined.loc[291]
# df_files.loc[291]['src']

file_name                                                 Crowdsale.sol
root                                                          crowdsale
class                                                          Zeppelin
company                                                        Zeppelin
src                   \n\ncontract Crowdsale {\n  using SafeMath for...
comments              /**\n * @title Crowdsale\n * @dev Crowdsale is...
contract_name                                               [Crowdsale]
ID                                                                  291
imports_zeppelin                                                  False
is_imported                                                        True
imports_idx_all                                               [291, -3]
imports_depth                                                         1
joined_files          [Crowdsale.sol, Crowdsale, crowdsale, Crowdsal...
joined_contracts                                            [Cro

In [103]:
stringdist.levenshtein_norm(df_files.loc[291]['src'], df_joined.loc[291]['src'])

0.0

In [1]:
for i in df_joined.index.values[:10]:
    print stringdist.levenshtein_norm(df_joined.loc[291]['src'], df_joined.loc[i]['src'])


NameError: name 'df_joined' is not defined

In [98]:
df_files

Unnamed: 0,file_name,root,class,company,src,comments,contract_name,inherited_contracts,imports,imports_path,ID,imports_idx,imports_zeppelin,is_imported,imports_idx_all,imports_depth
0,PullPayInterface.sol,,notICO,AceBusters,\ncontract PullPayInterface {\n function asyn...,,[PullPayInterface],[[]],[],[],0,[],False,True,[0],0
1,ERC20Basic.sol,,notICO,AceBusters,\n\ncontract ERC20Basic {\n function totalSup...,/*\n * ERC20Basic\n * Simpler version of ERC20...,[ERC20Basic],[[]],[],[],1,[],False,True,[1],0
2,Migrations.sol,,notICO,AceBusters,\ncontract Migrations {\n address public owne...,,[Migrations],[[]],[],[],2,[],False,False,[2],0
3,ERC223Basic.sol,,notICO,AceBusters,\n\ncontract ERC223Basic is ERC20Basic {\n ...,,[ERC223Basic],[[ERC20Basic]],[ERC20Basic.sol],[],3,[1],False,True,"[1, 3]",1
4,ERC20.sol,,notICO,AceBusters,\n\n\n\ncontract ERC20 is ERC223Basic {\n fun...,/*\n * ERC20 interface\n * see https://github....,[ERC20],[[ERC223Basic]],[ERC223Basic.sol],[],4,[3],False,True,"[1, 3, 4]",2
5,ERC223ReceivingContract.sol,,notICO,AceBusters,\n \ncontract ERC223ReceivingContract {\n f...,/*\n * Contract that is working with ERC223 t...,[ERC223ReceivingContract],[[]],[],[],5,[],False,True,[5],0
6,SafeMath.sol,,notICO,AceBusters,\n\nlibrary SafeMath {\n function mul(uint256...,/**\n * @title SafeMath\n * @dev Math operatio...,[],[],[],[],6,[],False,False,[6],0
7,PullPayment.sol,satelites,notICO,AceBusters,\n\n\ncontract PullPayment is Ownable {\n usi...,/**\n * @title PullPayment\n * @dev Base contr...,[PullPayment],[[Ownable]],"[SafeMath.sol, Ownable.sol, ControllerInterfac...","[, ownership, controller]",7,"[-3, 22, 15]",False,True,"[15, -3, 22, 7]",1
8,Storage.sol,satelites,notICO,AceBusters,\n\ncontract Storage is Ownable {\n struct ...,,[Storage],[[Ownable]],[Ownable.sol],[ownership],8,[22],False,False,"[8, 22]",1
9,Nutz.sol,satelites,notICO,AceBusters,"\n\n\ncontract Nutz is Ownable, ERC20 {\n\n e...",/**\n * Nutz implements a price floor and a pr...,[Nutz],"[[Ownable, ERC20]]","[ERC20.sol, Ownable.sol, ControllerInterface.s...","[, ownership, controller, , ]",9,"[4, 22, 15, 5, 0]",False,True,"[0, 1, 3, 4, 5, 9, 15, 22]",3



