In [272]:
# imports
import os
import io
import tokenize
import numpy as np
import pandas as pd

import re


In [273]:
# paths
# zeppelin_folder = '/home/ourownstory/Documents/SOL/data/Zeppelin/Zeppelin/crowdsale/'
zeppelin_joined = '/home/ourownstory/Documents/SOL/derived/test/joined/Zeppelin/Zeppelin/'
# os.listdir(zeppelin_folder)
os.listdir(zeppelin_joined)
crowdsale_joined = 'crowdsale/Crowdsale.sol'
# BasicToken.sol     CappedToken.sol    ERC20Basic.sol  MintableToken.sol  SafeERC20.sol      TokenTimelock.sol
# BurnableToken.sol  DetailedERC20.sol  ERC20.sol       PausableToken.sol  StandardToken.sol  TokenVesting.sol

In [274]:
# src reader from dat_wrangling notebook
# NOTE: modified to return list

# strips pragma statements and comments
def read_src_nocomments(file_name, return_only_comments=False):
    with open(file_name, 'r') as f:
        src_list = []
        comments_list = []
        for line in f.readlines():
            # skip pragma
            if re.match('pragma solidity .*;', line.strip()):
                continue
            # single or multiline comments
            if re.match('/', line.strip()) or re.match('\*', line.strip()):
                comments_list.append(line)
                continue
            # inline comments
            if re.search('//', line):
                inline_comment = re.findall('[\s]*//.*\n', line)
                comments_list.extend(inline_comment)
                line = re.sub('[\s]*//.*\n', '\n', line)
                continue
                
            # add current src line
            src_list.append(line)
        
        if return_only_comments:
            return comments_list
        else:
            return src_list

def remove_import_lines(src):
#     print src
    regex = "import .*;"
    src = [x for x in src if not re.match(regex, x)]
#     print src
    return src   

In [275]:
# read in as txt
raw_lines = []
with open(zeppelin_joined+crowdsale_joined, 'r') as file:
    for line in file:
        raw_lines.append(line)
        
# read stripped src
# raw_lines = read_src_nocomments(file_name=zeppelin_joined+crowdsale_joined)
# raw_lines = remove_import_lines(raw_lines)

In [276]:
# raw_lines

In [277]:
# for raw_line in raw_lines:
#     print(raw_line.split())

In [278]:
# would need to use hand-crafted regex to split as wanted.
# TODO: implement if tokenize not desirable.

In [279]:
# approach using tokenize


In [280]:
tokens_dict = {
    'line': [],
    'line_se': [],
    'type': [],
    'value': [],
}
tokens_list = []
with open(zeppelin_joined+crowdsale_joined, 'r') as file:
    token_generator = tokenize.generate_tokens(file.readline)
    for type, token, (srow, scol), (erow, ecol), line in token_generator:
        tokens_dict['line'].append(srow)        
        tokens_dict['line_se'].append(((srow, scol), (erow, ecol)))    
        tokens_dict['type'].append(tokenize.tok_name[type])        
        tokens_dict['value'].append(repr(token))
        tokens_list.append([srow, tokenize.tok_name[type], repr(token)])
        
tokens = pd.DataFrame.from_dict(tokens_dict)

In [281]:
# tokens

In [282]:
# token_types, type_counts = np.unique(tokens['type'], return_counts=True)
# print(token_types,type_counts)

token_type_freq = pd.Series.value_counts(tokens.type, sort=True)
print(token_type_freq)

NAME         152
OP           148
NL            72
NUMBER         4
NEWLINE        1
ENDMARKER      1
Name: type, dtype: int64


In [283]:
# replace "NEWLINE" with "NL"
tokens['type'] = tokens['type'].apply(lambda x: 'NL' if x == 'NEWLINE' else x)

In [284]:
token_freq = pd.Series.value_counts(tokens.value, sort=True)
print(token_freq[:20])

'\n'             73
';'              32
'('              29
')'              29
'uint256'        12
'='              10
','              10
'public'          9
'.'               9
'}'               8
'{'               8
'function'        7
'address'         7
'require'         6
'msg'             5
'beneficiary'     5
'&'               4
'0'               4
'value'           4
'bool'            4
Name: value, dtype: int64


In [285]:
op_freq = pd.Series.value_counts(tokens[tokens.type == 'OP'].value, sort=True)
print(op_freq[:20])

';'     32
'('     29
')'     29
'='     10
','     10
'.'      9
'}'      8
'{'      8
'&'      4
'!='     3
'>='     3
'>'      2
'<='     1
Name: value, dtype: int64


In [286]:
name_freq = pd.Series.value_counts(tokens[tokens.type == 'NAME'].value, sort=True)
print(name_freq[:20])

'uint256'        12
'public'          9
'function'        7
'address'         7
'require'         6
'beneficiary'     5
'msg'             5
'endTime'         4
'now'             4
'bool'            4
'value'           4
'weiAmount'       4
'_startTime'      4
'_wallet'         3
'weiRaised'       3
'wallet'          3
'rate'            3
'tokens'          3
'return'          3
'returns'         3
Name: value, dtype: int64


In [287]:
## Next steps:

In [288]:
# 1. write tokenization function that receives a list of paths 
# and does the tokenization and counts for all files in paths.


In [289]:
# create a pandas dataframe with the filenames in our dataset

# paths
data_path =  '/home/ourownstory/Documents/SOL/derived/test/cleaned/'
out_path = '/home/ourownstory/Documents/SOL/derived/test/'


df_files_dict = {
    'root': [],
    'file_name': [],
}
for root, subdirs, df_files in os.walk(data_path):
    for file_name in df_files:
        df_files_dict['root'].append(root[len(data_path):])
        df_files_dict['file_name'].append(file_name)
df_files = pd.DataFrame.from_dict(df_files_dict)
root_list = df_files['root'].values
root_list = [root.split('/') for root in root_list]
df_files['class'] = [r.pop(0) for r in root_list]
df_files['company'] = [r.pop(0) for r in root_list]
df_files['root'] = ["/".join(r) for r in root_list]
file_name_list = df_files['file_name'].values
df_files['extension'] = [e.split('.')[-1] for e in file_name_list]

# filter all files that are not .sol
df_files = df_files[df_files.pop('extension') == 'sol']
# filter all comment-files
df_files['is_comment'] = df_files['file_name'].apply(lambda x: bool(re.match('comments_', x)))
df_files = df_files[df_files.pop('is_comment') == False]


# helper func to get file-path

def get_filename_for_row(row):
    filename = os.path.join(
        data_path, 
        row.loc['class'], 
        row.loc['company'], 
        row.loc['root'], 
        row.loc['file_name']
    )
    return filename

In [195]:
def tokenize_file(filename):
    tokens_dict = {
        'line': [],
        'line_se': [],
        'type': [],
        'value': [],
    }
    tokens_list = []
    with open(filename, 'r') as file:
        token_generator = tokenize.generate_tokens(file.readline)
        for type, token, (srow, scol), (erow, ecol), line in token_generator:
            tokens_dict['line'].append(srow)        
            tokens_dict['line_se'].append(((srow, scol), (erow, ecol)))    
            tokens_dict['type'].append(tokenize.tok_name[type])        
            tokens_dict['value'].append(repr(token))
            tokens_list.append([srow, tokenize.tok_name[type], repr(token)])
            
    tokens_dict['type'] = ['NL' if x == 'NEWLINE' else x for x in tokens_dict['type']]

    return tokens_dict

def tokenize_file_from_row(row):
    filename = get_filename_for_row(row)
#     print filename
    tokens_dict = tokenize_file(filename)
    row['lines'] = tokens_dict['line_se']
    row['types'] = tokens_dict['type']
    row['values'] = tokens_dict['value']
    return row    

In [196]:
df_files = df_files.apply(tokenize_file_from_row, axis=1)
df_files

Unnamed: 0,file_name,root,class,company,lines,types,values
3,PullPayInterface.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 8)), ((2, 9), ...","[NL, NAME, NAME, OP, NL, NAME, NAME, OP, NAME,...","['\n', 'contract', 'PullPayInterface', '{', '\..."
4,ERC20Basic.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, O...","['\n', '\n', 'contract', 'ERC20Basic', '{', '\..."
6,Migrations.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 8)), ((2, 9), ...","[NL, NAME, NAME, OP, NL, NAME, NAME, NAME, OP,...","['\n', 'contract', 'Migrations', '{', '\n', 'a..."
7,ERC223Basic.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NEWLINE, NL, NAME, NAME...","['\n', 'import', ""'./ERC20Basic.sol'"", ';', '\..."
11,ERC20.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, STRING, OP, NEWLINE, NL, NL, NA...","['\n', '\n', 'import', ""'./ERC223Basic.sol'"", ..."
12,ERC223ReceivingContract.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 1), (2, 2)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, N...","['\n', '\n', 'contract', 'ERC223ReceivingContr..."
13,SafeMath.sol,,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, N...","['\n', '\n', 'library', 'SafeMath', '{', '\n',..."
14,PullPayment.sol,satelites,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, STRING, OP, NEWLINE, NAME, STRI...","['\n', '\n', 'import', ""'../SafeMath.sol'"", ';..."
15,Storage.sol,satelites,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NEWLINE, NL, NAME, NAME...","['\n', 'import', '""../ownership/Ownable.sol""',..."
19,Nutz.sol,satelites,notICO,AceBusters,"[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NEWLINE, NAME, STRING, ...","['\n', 'import', '""../ERC20.sol""', ';', '\n', ..."


In [197]:
# 2. Plot the Data (with matplotlib or port over to R)

In [198]:
tokens = pd.DataFrame()

In [244]:
def expand_df(df):
    lens = [len(item) for item in df['types']]
    return pd.DataFrame( {
        "file_name" : np.repeat(df['file_name'].values, lens), 
        "class" : np.repeat(df['class'].values, lens), 
        "company" : np.repeat(df['company'].values, lens), 
        "root" : np.repeat(df['root'].values, lens), 
        "type" : np.concatenate(df['types'].values),
        "value" : np.concatenate(df['values'].values),
        "lines" : [item for sublist in df['lines'].values for item in sublist],
    })

In [245]:
tokens = expand_df(df_files)
len(tokens)
# len(tokens[tokens.isnull().any(axis=1)])

219914

In [262]:
# type freq
token_type_freq = pd.Series.value_counts(tokens.type, sort=True)
print(token_type_freq)

OP            91444
NAME          86899
NL            33396
NUMBER         3737
NEWLINE        1814
STRING         1610
ENDMARKER       522
ERRORTOKEN      440
INDENT           26
DEDENT           26
Name: type, dtype: int64


In [270]:
# value freq
freq = pd.Series.value_counts(tokens.value, sort=True)
# print(freq[:50])

In [250]:
# tokens[tokens['type']=='ERRORTOKEN']

In [269]:
freq = pd.Series.value_counts(tokens[tokens.type == 'OP'].value, sort=True)
# print(freq[:50])

In [268]:
freq = pd.Series.value_counts(tokens[tokens.type == 'NAME'].value, sort=True)
# print(freq[:100])

In [299]:
import StringIO
def tokenize_string(string):
    tokens_dict = {
        'line': [],
        'line_se': [],
        'type': [],
        'value': [],
    }
    tokens_list = []
    buf = StringIO.StringIO(string)
    token_generator = tokenize.generate_tokens(buf.readline)
    for type, token, (srow, scol), (erow, ecol), line in token_generator:
        tokens_dict['line'].append(srow)        
        tokens_dict['line_se'].append(((srow, scol), (erow, ecol)))    
        tokens_dict['type'].append(tokenize.tok_name[type])        
        tokens_dict['value'].append(repr(token))
        tokens_list.append([srow, tokenize.tok_name[type], repr(token)])
            
    tokens_dict['type'] = ['NL' if x == 'NEWLINE' else x for x in tokens_dict['type']]

    return tokens_dict


def tokenize_string_from_row(row):
#     filename = get_filename_for_row(row)
#     print filename
    tokens_dict = tokenize_string(row['src'])
    row['lines'] = tokens_dict['line_se']
    row['types'] = tokens_dict['type']
    row['values'] = tokens_dict['value']
    return row   

In [302]:

# strips pragma statements and comments
def read_src_nocomments(file_name, return_only_comments=False):
    with open(file_name, 'r') as f:
        src_list = []
        comments_list = []
        for line in f.readlines():
            # skip pragma
            if re.match('pragma solidity .*;', line.strip()):
                continue
            # single or multiline comments
            if re.match('/', line.strip()) or re.match('\*', line.strip()):
                comments_list.append(line)
                continue
            # inline comments
#             if re.search('//', line):
#                 inline_comment = re.findall('[\s]*//.*\n', line)
#                 comments_list.extend(inline_comment)
#                 line = re.sub('[\s]*//.*\n', '\n', line)
#                 continue
                
            # add current src line
            src_list.append(line)
            
        if return_only_comments:
            return ''.join(comments_list)
        else:
            return ''.join(src_list)

def get_file_src(row):
    return read_src(get_filename_for_row(row))

def get_file_src_nocomments(row):
    return read_src_nocomments(get_filename_for_row(row))

def get_file_src_only_comments(row):
    return read_src_nocomments(get_filename_for_row(row), return_only_comments=True)
    
df_files['src'] = df_files.apply(get_file_src_nocomments, axis=1)
df_files

Unnamed: 0,file_name,root,class,company,src
3,PullPayInterface.sol,,notICO,AceBusters,\ncontract PullPayInterface {\n function asyn...
4,ERC20Basic.sol,,notICO,AceBusters,\n\ncontract ERC20Basic {\n function totalSup...
6,Migrations.sol,,notICO,AceBusters,\ncontract Migrations {\n address public owne...
7,ERC223Basic.sol,,notICO,AceBusters,\nimport './ERC20Basic.sol';\n\ncontract ERC22...
11,ERC20.sol,,notICO,AceBusters,\n\nimport './ERC223Basic.sol';\n\n\ncontract ...
12,ERC223ReceivingContract.sol,,notICO,AceBusters,\n \ncontract ERC223ReceivingContract {\n f...
13,SafeMath.sol,,notICO,AceBusters,\n\nlibrary SafeMath {\n function mul(uint256...
14,PullPayment.sol,satelites,notICO,AceBusters,"\n\nimport '../SafeMath.sol';\nimport ""../owne..."
15,Storage.sol,satelites,notICO,AceBusters,"\nimport ""../ownership/Ownable.sol"";\n\ncontra..."
19,Nutz.sol,satelites,notICO,AceBusters,"\nimport ""../ERC20.sol"";\nimport ""../ownership..."


In [303]:
df_files = df_files.apply(tokenize_string_from_row, axis=1)
df_files

Unnamed: 0,file_name,root,class,company,src,lines,types,values
3,PullPayInterface.sol,,notICO,AceBusters,\ncontract PullPayInterface {\n function asyn...,"[((1, 0), (1, 1)), ((2, 0), (2, 8)), ((2, 9), ...","[NL, NAME, NAME, OP, NL, NAME, NAME, OP, NAME,...","['\n', 'contract', 'PullPayInterface', '{', '\..."
4,ERC20Basic.sol,,notICO,AceBusters,\n\ncontract ERC20Basic {\n function totalSup...,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, O...","['\n', '\n', 'contract', 'ERC20Basic', '{', '\..."
6,Migrations.sol,,notICO,AceBusters,\ncontract Migrations {\n address public owne...,"[((1, 0), (1, 1)), ((2, 0), (2, 8)), ((2, 9), ...","[NL, NAME, NAME, OP, NL, NAME, NAME, NAME, OP,...","['\n', 'contract', 'Migrations', '{', '\n', 'a..."
7,ERC223Basic.sol,,notICO,AceBusters,\nimport './ERC20Basic.sol';\n\ncontract ERC22...,"[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NL, NL, NAME, NAME, NAM...","['\n', 'import', ""'./ERC20Basic.sol'"", ';', '\..."
11,ERC20.sol,,notICO,AceBusters,\n\nimport './ERC223Basic.sol';\n\n\ncontract ...,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, STRING, OP, NL, NL, NL, NAME, N...","['\n', '\n', 'import', ""'./ERC223Basic.sol'"", ..."
12,ERC223ReceivingContract.sol,,notICO,AceBusters,\n \ncontract ERC223ReceivingContract {\n f...,"[((1, 0), (1, 1)), ((2, 1), (2, 2)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, N...","['\n', '\n', 'contract', 'ERC223ReceivingContr..."
13,SafeMath.sol,,notICO,AceBusters,\n\nlibrary SafeMath {\n function mul(uint256...,"[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, NAME, OP, NL, NAME, NAME, OP, N...","['\n', '\n', 'library', 'SafeMath', '{', '\n',..."
14,PullPayment.sol,satelites,notICO,AceBusters,"\n\nimport '../SafeMath.sol';\nimport ""../owne...","[((1, 0), (1, 1)), ((2, 0), (2, 1)), ((3, 0), ...","[NL, NL, NAME, STRING, OP, NL, NAME, STRING, O...","['\n', '\n', 'import', ""'../SafeMath.sol'"", ';..."
15,Storage.sol,satelites,notICO,AceBusters,"\nimport ""../ownership/Ownable.sol"";\n\ncontra...","[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NL, NL, NAME, NAME, NAM...","['\n', 'import', '""../ownership/Ownable.sol""',..."
19,Nutz.sol,satelites,notICO,AceBusters,"\nimport ""../ERC20.sol"";\nimport ""../ownership...","[((1, 0), (1, 1)), ((2, 0), (2, 6)), ((2, 7), ...","[NL, NAME, STRING, OP, NL, NAME, STRING, OP, N...","['\n', 'import', '""../ERC20.sol""', ';', '\n', ..."


In [304]:
tokens = expand_df(df_files)
len(tokens)

219914

In [329]:
def token_frequencies(tokens):
    df = tokens.groupby(["type", "value"]).size().reset_index(name="num")
    num_tokens = len(tokens)
    df['num'] = df['num'].apply(lambda x: x / (1.0 * 522))
    return df.sort_values(by=['num'], ascending=False)

In [351]:
token_freq = token_frequencies(tokens)
token_freq_200 = token_freq[:200]
token_freq_200[:50]

Unnamed: 0,type,value,num
5120,NL,'\n',67.452107
5455,OP,'(',32.482759
5456,OP,')',32.482759
5470,OP,';',27.270115
5462,OP,"','",16.657088
5465,OP,'.',12.051724
5483,OP,'{',9.772031
5485,OP,'}',9.772031
5474,OP,'=',9.578544
2721,NAME,'function',6.417625


In [325]:
# NOTE: better dump to json!!
token_freq_200.to_csv(os.path.join(out_path, 'token_freq_200.csv'))


In [362]:
token_freq_200[token_freq_200['value'].apply(lambda x: bool(re.match("'\\\\n'", x)))]

Unnamed: 0,type,value,num
5120,NL,'\n',67.452107


In [358]:
token_freq_200.filter(like="'true'", axis=0)


Unnamed: 0,type,value,num


In [357]:
 token_freq_200['value'].values

array(["'\\n'", "'('", "')'", "';'", "','", "'.'", "'{'", "'}'", "'='",
       "'function'", "'address'", "'['", "']'", "'public'", "'uint'",
       "'uint256'", "'returns'", "'return'", "'0'", "'if'", "'msg'",
       "'sender'", "'i'", "'bool'", "'require'", "'import'", "'+'", "'=='",
       "'constant'", "'internal'", "'db'", "'>'", "'&'", "'bytes32'",
       "'true'", "'value'", "'contract'", "'_value'", "'<'", "''",
       "'length'", "'owner'", "'1'", "'string'", "'a'", "'event'",
       "'_amount'", "'throw'", "'b'", "'|'", "'!='", "'_to'", "'for'",
       "'is'", "'indexed'", "'balances'", "'amount'", "'transactionId'",
       "'add'", "'_from'", "'view'", "'>='", "'-'", "'onlyOwner'",
       "'uint8'", "'*'", "'/'", "':'", "'modifier'", "'_'", "'token'",
       "'_owner'", "'bytes'", "'!'", "'false'", "'else'", "'this'",
       "'jobId'", "'totalSupply'", "'sha3'", "'_spender'", "'now'",
       "'EthlanceDB'", "'mapping'", "'transfer'", "'args'", "'_market'",
       "'new'", "'