In [10]:
# Read raw CSV files one by one and generate dataframes for further analysis.
# (Aggregating all data into one large DF won't scale, since the dataset is too large.)

import pandas as pd
import numpy as np
import glob
import os.path
from tqdm.notebook import tqdm

# Root directory of the dataset; an empty string makes the 'data/...' paths
# below relative to the current working directory.
DATASET_DIR = ""

# Sub-datasets to process, in processing order.
DATASETS = [
    'f-droid',
    'google',
    'malware',
]

# Decompilers under comparison; the first letter of each name is the
# column name used for that decompiler in the per-app CSV files.
DECOMPILERS = [
    'CFR',
    'Fernflower',
    'Jadx',
    'Procyon',
]

# Categorical dtypes shared by all per-app frames.
# (The identifier `DatsetCategory` is kept as spelled because later code refers to it.)
DatsetCategory = pd.CategoricalDtype(categories=DATASETS)
OutcomeCategory = pd.CategoricalDtype(categories=['S', 'F', 'T', 'N'])

# Accumulators filled while iterating over the apps:
# one row per app for the main DF ...
main_entries = []
# ... and one single-row DF per app for the size-bin distributions.
bin_entries_jadx = []
bin_entries_methods = []

# Decompiler co-failure DF: one row per decompiler, all counters start at zero.
decomp_df = pd.DataFrame(0, columns=['Total', '>0', '>1', 'All'], index=DECOMPILERS)

# Load the entries listed in data/ads.txt (one per line, surrounding
# whitespace removed) into a set for constant-time membership tests.
with open(os.path.join(DATASET_DIR, 'data', 'ads.txt'), 'r') as f:
    ad_supported = {entry.strip() for entry in f}


# Walk every per-app '.ecsv' file of every dataset and collect:
#   * one row per app for the main DF (main_entries),
#   * per-app method-size / jadx-failure histograms (bin_entries_*),
#   * for f-droid only, the decompiler co-failure counters (decomp_df).
for dataset in DATASETS:
    # Number of apps of this dataset that were skipped because their file
    # reports an error (or carries no header at all).
    n_failed = 0
    for path in tqdm(glob.glob(os.path.join(DATASET_DIR, 'data', dataset, '*.ecsv')),
                     desc=("{} dataset".format(dataset))):
        # Read (at most) the first five lines of the file: the metadata header.
        with open(path, 'r') as f:
            header = [line for _, line in zip(range(5), f)]

        # The first header line starts with "ERROR:" when the app could not be
        # processed. An empty file or a blank first line is treated the same
        # way instead of crashing the whole run with an IndexError.
        status = header[0].split() if header else []
        if not status or status[0] == "ERROR:":
            n_failed += 1
            continue

        # Second token of the first header line is "None" for unpacked apps.
        packed = (status[1] != "None")
        # For malware samples the fifth header line carries the family name
        # (everything after the first token).
        family = ' '.join(header[4].split()[1:]) if dataset == 'malware' else None

        # File name without its last extension.
        name = '.'.join(os.path.basename(path).split('.')[:-1])

        # Ad-support information is only available for the google dataset.
        ads = (name in ad_supported) if dataset == 'google' else None

        # Per-method table: method size plus one outcome column per decompiler
        # (C, F, J, P). Column 0 is not loaded here.
        # NOTE(review): header=6 relies on pandas skipping blank lines when
        # locating the header row -- confirm against the .ecsv layout.
        df = pd.read_csv(path,
                         engine='c',
                         sep=';',
                         header=6,
                         usecols=range(1,6),
                         dtype={'size': np.int32,
                                'C': OutcomeCategory,
                                'F': OutcomeCategory,
                                'J': OutcomeCategory,
                                'P': OutcomeCategory})

        # Does outcome 'N' occur for a decompiler in this app?
        cfr_N        = 'N' in df.C.values
        fernflower_N = 'N' in df.F.values
        jadx_N       = 'N' in df.J.values
        procyon_N    = 'N' in df.P.values

        # Per-method failure masks (outcome 'F').
        mask_cfr_F        = df.C == 'F'
        mask_fernflower_F = df.F == 'F'
        mask_jadx_F       = df.J == 'F'
        mask_procyon_F    = df.P == 'F'

        # Does outcome 'T' occur for a decompiler in this app?
        cfr_T        = 'T' in df.C.values
        fernflower_T = 'T' in df.F.values
        jadx_T       = 'T' in df.J.values
        procyon_T    = 'T' in df.P.values

        # Methods that at least one decompiler handled successfully ('S').
        tot_successful = (df[['C','F','J','P']] == 'S').any(axis=1).sum()

        # Main DF row. Per-decompiler values are None when that decompiler
        # has an 'N' outcome for this app.
        main_entries.append([name,
                             dataset,
                             packed,
                             family,
                             ads,
                             df.shape[0],
                             mask_cfr_F.sum() if not cfr_N else None,
                             mask_fernflower_F.sum() if not fernflower_N else None,
                             mask_jadx_F.sum() if not jadx_N else None,
                             mask_procyon_F.sum() if not procyon_N else None,
                             cfr_T if not cfr_N else None,
                             fernflower_T if not fernflower_N else None,
                             jadx_T if not jadx_N else None,
                             procyon_T if not procyon_N else None,
                             tot_successful])

        # Generate DFs with method size and failure rate distributions.
        # Methods are binned by floor(log2(size)).
        df['logbin'] = np.log2(df['size']).astype(np.int8)
        counts_methods = df['logbin'].value_counts()
        counts_jadx = df.loc[mask_jadx_F, 'logbin'].value_counts()

        # jadx failure distribution across different method sizes
        # (only bins 4..17 are kept; missing bins become 0)
        df_jadx = pd.DataFrame(counts_jadx).reindex(range(4, 18), fill_value=0).T

        # Method size distribution (same bins)
        df_methods = pd.DataFrame(counts_methods).reindex(range(4, 18), fill_value=0).T

        # Turn integer indexes into strings
        df_jadx.columns = df_jadx.columns.map(str)
        df_methods.columns = df_methods.columns.map(str)

        bin_entries_jadx.append(df_jadx)
        bin_entries_methods.append(df_methods)

        # Only record fail distributions for f-droid, since we may get unreliable results
        # due to dex2jar replacing heavily obfuscated methods with "stubs"
        if dataset == 'f-droid':
            # Per method: how many decompilers failed on it. A decompiler with
            # a 'T' or 'N' outcome anywhere in the app counts as having failed
            # on every method. (Builtin int replaces the np.int alias, which
            # was removed from NumPy.)
            all_failed = pd.Series(1, index=df.index)
            num_failed = ((mask_cfr_F.astype(int) if not (cfr_T or cfr_N) else all_failed) +
                          (mask_fernflower_F.astype(int) if not (fernflower_T or fernflower_N) else all_failed) +
                          (mask_jadx_F.astype(int) if not (jadx_T or jadx_N) else all_failed) +
                          (mask_procyon_F.astype(int) if not (procyon_T or procyon_N) else all_failed))
            num_failed.name = 'n_failed'
            failed_df = df.join(num_failed)

            # Counters contributed by this single app.
            new_df = pd.DataFrame(0, columns=decomp_df.columns, index=decomp_df.index)

            for d in DECOMPILERS:
                shorthand = d[0]
                # Get number of failed decompilers apart from current one
                failed_s = failed_df.loc[failed_df[shorthand] != 'S', 'n_failed'] - 1

                # Get distribution of number of other failing decompilers
                occurrences = failed_s.value_counts().reindex(range(len(DECOMPILERS)), fill_value=0)

                # rev_cumsum.loc[k] = methods where at least k other decompilers failed
                rev_cumsum = occurrences[::-1].cumsum()[::-1]
                total = len(failed_df[(failed_df[shorthand] == 'F') | (failed_df[shorthand] == 'T')])
                new_df.loc[d,:] = [total, rev_cumsum.loc[1], rev_cumsum.loc[2], rev_cumsum.loc[3]]

            # Accumulate once per app, after all rows are filled. Adding inside
            # the loop above would count the earlier decompilers' rows several
            # times (CFR four times, Procyon once).
            decomp_df += new_df

    if n_failed:
        print("NOTE: ignored {} failed apps".format(n_failed))

# Column names of the main DF, in the order the per-app rows are built.
COLUMNS = [
    # App identity and metadata
    "Name", "Dataset", "Packed", "Family", "Ads",
    # Number of method rows in the app's file
    "Methods",
    # Per-decompiler count of methods with outcome 'F'
    "CFR_F", "Fernflower_F", "Jadx_F", "Procyon_F",
    # Per-decompiler flag: does outcome 'T' occur in the app
    "CFR_T", "Fernflower_T", "Jadx_T", "Procyon_T",
    # Methods handled successfully by at least one decompiler
    "Tot_S",
]

# Main DF: one row per app, with the dataset label stored as a categorical.
apps_df = pd.DataFrame(main_entries, columns=COLUMNS).astype({"Dataset": DatsetCategory})

# Method-size and jadx-failure distribution DFs: the per-app single-row frames
# are stacked in the order they were collected, which is the row order of the
# main DF, so all three frames can be joined on their index.
jadx_bins_df, method_bins_df = (
    pd.concat(per_app_frames, ignore_index=True)
    for per_app_frames in (bin_entries_jadx, bin_entries_methods)
)

HBox(children=(HTML(value='f-droid dataset'), FloatProgress(value=0.0, max=3018.0), HTML(value='')))




HBox(children=(HTML(value='google dataset'), FloatProgress(value=0.0, max=13601.0), HTML(value='')))


NOTE: ignored 7 failed apps


HBox(children=(HTML(value='malware dataset'), FloatProgress(value=0.0, max=24553.0), HTML(value='')))


NOTE: ignored 1220 failed apps


In [52]:
# Write each result frame to its own Parquet file in the working directory.
for parquet_name, frame in (
    ('main_decompilation_df.parquet', apps_df),
    ('jadx_bins_df.parquet', jadx_bins_df),
    ('method_bins_df.parquet', method_bins_df),
    ('decomp_fail_dist_df.parquet', decomp_df),
):
    frame.to_parquet(parquet_name)

In [53]:
# Read complementary CSV with inaccuracies due to inner classes and generics.
# (this is used to correct main DF)

import pandas as pd
import numpy as np
import os.path

# Root directory of the dataset; an empty string makes the 'data/...' paths
# relative to the current working directory.
DATASET_DIR = ""

# Sub-datasets for which a failures_<dataset>.csv file is read.
DATASETS = [
    'f-droid',
    'google',
    'malware',
]

# Decompiler names, used as column prefixes once the columns are capitalised.
DECOMPILERS = [
    'CFR',
    'Fernflower',
    'Jadx',
    'Procyon',
]

# Per-dataset frames waiting to be concatenated.
to_concat = []

# Column dtypes of the failures_<dataset>.csv files: the app name plus, for
# each decompiler prefix, three float columns. np.float64 is used because the
# np.float alias (plain Python float, i.e. float64) was removed from NumPy.
match_dtypes = {'App': 'string'}
for prefix in ('CFR', 'fernflower', 'jadx', 'procyon'):
    for suffix in ('no_match', 'multi_match', 'multi_match_average'):
        match_dtypes['{}_{}'.format(prefix, suffix)] = np.float64

# Read one CSV per dataset, tag its rows with the dataset name and collect it.
for dataset in DATASETS:
    df = pd.read_csv(os.path.join(DATASET_DIR, 'data', 'failures_{}.csv'.format(dataset)),
                     engine='c',
                     sep=';',
                     usecols=range(13),
                     dtype=match_dtypes,
                     # Cells holding 'T' or 'N' are loaded as missing values.
                     na_values=['T','N'])
    df.insert(0, 'Dataset', dataset)
    to_concat.append(df)
match_fail_df = pd.concat(to_concat, ignore_index=True)

# Capitalise the first letter of every column so that the prefixes match the
# names in DECOMPILERS (e.g. 'jadx_no_match' -> 'Jadx_no_match').
match_fail_df.columns = [s[0].upper()+s[1:] for s in match_fail_df.columns]
match_fail_df = match_fail_df.rename(columns={'App':'Name'})

# <Decompiler>_extra = round(multi_match * multi_match_average) - multi_match
for d in DECOMPILERS:
    multi_match = match_fail_df[d+'_multi_match']
    match_fail_df[d+'_extra'] = (multi_match * match_fail_df[d+'_multi_match_average']).round() - multi_match

In [55]:
# Write the match-failure frame to a Parquet file in the working directory.
match_fail_df.to_parquet(path='match_fail_df.parquet')

In [57]:
# Record data for analysis of tokens associated with decompilation failures

import pandas as pd
import numpy as np
import glob
import os.path
from tqdm.notebook import tqdm

# Root directory of the dataset; an empty string makes the 'data/...' paths
# relative to the current working directory.
DATASET_DIR = ""

# Sub-datasets to scan.
DATASETS = [
    'f-droid',
    'google',
    'malware',
]

# Three independent {dataset: {token: count}} tables, one empty inner
# dictionary per dataset:
#   token_method_freq - number of method rows whose signature contains the token
#   token_fail_freq   - number of those rows that are marked as failed
#   token_app_freq    - number of apps in which the token occurs
token_method_freq = dict((ds, {}) for ds in DATASETS)
token_fail_freq = dict((ds, {}) for ds in DATASETS)
token_app_freq = dict((ds, {}) for ds in DATASETS)

# For every app file, split the class/package part of each method signature
# into tokens and update the three per-dataset token counters.
for dataset in DATASETS:
    # Bind this dataset's counters once instead of looking them up per token.
    method_freq = token_method_freq[dataset]
    fail_freq = token_fail_freq[dataset]
    app_freq = token_app_freq[dataset]

    for path in tqdm(glob.glob(os.path.join(DATASET_DIR, 'data', dataset, '*.ecsv')),
                     desc=("{} dataset".format(dataset))):
        # The file is opened once: the first line is inspected, the remaining
        # preamble is skipped and the method rows are streamed.
        with open(path, 'r') as file:
            # Skip apps whose file starts with "ERROR:". An empty file or a
            # blank first line is skipped too instead of raising IndexError.
            first_line = file.readline().split()
            if not first_line or first_line[0] == "ERROR:":
                continue

            # Skip header: 8 lines in total, the first one is already consumed.
            for _ in range(7):
                file.readline()

            # Tokens seen anywhere in this app (each counted once per app).
            app_tokens = set()

            for l in file:
                fields = l.split(';')
                signature = fields[0].split(' ')[0] # Get class/package part
                # This is 5-10x faster than using regular expressions...
                signature = signature.replace('$','.')
                # A set, so a token repeated within one signature counts once.
                tokens = set(signature.split('.'))
                # Outcome of the fifth ';'-separated field, evaluated once per row.
                # NOTE(review): presumably the Jadx outcome column -- confirm
                # against the .ecsv column order.
                method_failed = (fields[4] == 'F')
                app_tokens.update(tokens)
                for t in tokens:
                    if t not in method_freq:
                        # First sighting: create the token in all three tables
                        # so that they always share the same key set.
                        method_freq[t] = 0
                        fail_freq[t] = 0
                        app_freq[t] = 0
                    method_freq[t] += 1
                    if method_failed:
                        fail_freq[t] += 1
            for t in app_tokens:
                app_freq[t] += 1

HBox(children=(HTML(value='f-droid dataset'), FloatProgress(value=0.0, max=3018.0), HTML(value='')))




HBox(children=(HTML(value='google dataset'), FloatProgress(value=0.0, max=13601.0), HTML(value='')))




HBox(children=(HTML(value='malware dataset'), FloatProgress(value=0.0, max=24553.0), HTML(value='')))




In [58]:
# Persist data about tokens as pickled Python dictionaries

import lzma
import pickle

# Bundle the three token tables into one tuple and store it as an
# LZMA-compressed pickle in the working directory.
data = token_method_freq, token_fail_freq, token_app_freq
with lzma.open(filename='token_frequencies.pickle.lzma', mode='wb') as f:
    pickle.dump(data, f)