In [2]:
import pandas as pd
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import sys
import math

In [3]:
existing_data_source = 'results/cross/Project_specific_2_5/'
existing_projects = [f for f in listdir(existing_data_source) if isfile(join(existing_data_source, f))]
len(existing_projects)

391

# Functions

In [22]:
def get_stats_all(existing_projects, data_source, project_names): 
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        if project not in project_names:
            continue
        try:
            df = pd.read_pickle(data_source + project)
            for metric in results.keys():
                for model in df.keys():
#                     if model == 'coForest':
#                         continue
                    if model not in results[metric].keys():
                        results[metric][model] = []
                    if metric != 'pf':
                        if np.nanmedian(df[model][metric]) > 0.0:
                            results[metric][model].append(np.nanmedian(df[model][metric]))
                    else:
                        results[metric][model].append(np.nanmedian(df[model][metric]))
        except Exception as e:
            print(project, e)
            continue
    return results,df

def get_stats_selected(existing_projects, models, data_source, project_names): 
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        if project not in project_names:
            continue
        try:
            df = pd.read_pickle(data_source + project)
            for metric in results.keys():
                for model in df.keys():
                    if model in models:
                        if model not in results[metric].keys():
                            results[metric][model] = []
                        results[metric][model].append(np.median(df[model][metric]))
        except Exception as e:
            print(project, e)
            continue
    return results

def get_stats_one(existing_projects, model_names, data_source, project_names): 
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        if project not in project_names:
            continue
        try:
            df = pd.read_pickle(data_source + project)
            for metric in results.keys():
                for model in df.keys():
                    for model_name in model_names:
                        if model_name in model:
                            if model not in results[metric].keys():
                                results[metric][model] = []
                            results[metric][model].append(np.median(df[model][metric]))
        except Exception as e:
            print(project, e)
            continue
    return results

def get_stats_lower_projects(existing_projects, projects, models, data_source, project_names): 
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        if project not in project_names:
            continue
        if project not in projects:
            try:
                df = pd.read_pickle(data_source + project)
                for metric in results.keys():
                    for model in df.keys():
                        if model in models:
                            if model not in results[metric].keys():
                                results[metric][model] = []
                            results[metric][model].append(np.median(df[model][metric]))
            except Exception as e:
                print(project, e)
                continue
    return results

def get_stats_selected_varying_size(existing_projects, models, data_source, project_names): 
    data_source = data_source.rsplit('_',1)[0]
    data_sources = [20]
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        if project not in project_names:
            continue
        try:
            for i in data_sources:
                try:
                    df = pd.read_pickle(data_source + '_' + str(i) + '/' + project)
                    for metric in results.keys():
                        for model in df.keys():
                            if model in models:
                                if model + '_' + str(i) not in results[metric].keys():
                                    results[metric][model + '_' + str(i)] = []
                                results[metric][model + '_' + str(i)].append(np.mean(df[model][metric]))
                except:
                    continue
        except Exception as e:
            print(project, e)
            continue
    return results

In [5]:
def get_stats_all_names(existing_projects, data_source): 
    results = {'f1':{}, 
               'precision':{}, 
               'recall':{}, 
               'g-score':{}, 
               'd2h':{}, 
               'pci_20':{}, 
               'ifa':{}, 
               'pd':{}, 
               'pf':{}
              }
    for project in existing_projects:
        try:
            df = pd.read_pickle(data_source + project)
            for metric in results.keys():
                for model in df.keys():
#                     if model == 'coForest':
#                         continue
                    if model not in results[metric].keys():
                        results[metric][model] = {}
                    results[metric][model][project] = np.median(df[model][metric])
        except Exception as e:
            print(project, e)
            continue
    return results

In [6]:
def write_stats_file(results):
    for metric in results.keys():
        with open('results/Stats/' + metric + '.txt', 'w') as f:
            for key in results[metric].keys():
                f.write("%s \n" % key)
                for item in results[metric][key]:
                    try:
                        if item == 'nan':
                            print(item)
                            continue
                        f.write("%s " % round(item,2))
                    except:
                        print('error')
                        continue
                f.write("\n\n")

In [7]:
def get_lower_performing_projects(existing_projects, metric, selected_models):
    lower_projects = []
    for project in existing_projects:
        try:
            df = pd.read_pickle(existing_data_source + project)
            for metric in results.keys():
                for model in df.keys():
                    if model in selected_models:
                        if metric == 'f1':
                            if np.median(df[model][metric]) <= 0.5:
                                lower_projects.append(project)
        except Exception as e:
            print(project, e)
            continue
    return lower_projects

In [8]:
def norm(x,df):
    lo = df.min()
    hi = df.max()
#     print(x,lo,hi)
    return (x - lo) / (hi - lo +0.00000001)

def dominate(_df,t,row_model_name,goals):
    wins = 0
    for i in range(_df.shape[0]):
        model_name = _df.iloc[i].model
        row = _df.iloc[i].tolist()
        if model_name != row_model_name:
            if dominationCompare(row, t,goals,_df):
                wins += 1
    return wins

def dominationCompare(other_row, t,goals,df):
    cols = df.columns.values.tolist()
    pos = []
    for goal in goals:
        pos.append(cols.index(goal))
    cols.index('recall')
    n = len(goals)
    weight = {'recall':3,'precision':1,'pf':-3.5}
    sum1, sum2 = 0,0
    for i in range(len(goals)):
        _df = df[goals[i]]
        w = weight[goals[i]]
        x = t[pos[i]]
        y = other_row[pos[i]]
        x = norm(x,_df)
        y = norm(y,_df)
        sum1 = sum1 - math.e**(w * (x-y)/n)
        sum2 = sum2 - math.e**(w * (y-x)/n)
    return sum1/n < sum2/n

# Projects

In [14]:
all_results = get_stats_all_names(existing_projects, existing_data_source)
result_df = pd.DataFrame.from_dict(all_results['f1'], orient = 'columns')
med = result_df.median(axis = 0)
q3 = result_df.quantile(0.75, axis = 1)
q2 = result_df.quantile(0.5, axis = 1)
q1 = result_df.quantile(0.25, axis = 1)
iqr = q3 -q1
iqr.sort_values(ascending = False)
project_names = q2[q2 > q2.quantile(0.2)].index

def get_projects(existing_projects, existing_data_source):
    results = get_stats_all_names(existing_projects, existing_data_source)
    result_df = pd.DataFrame.from_dict(results['f1'], orient = 'columns')
    med = result_df.median(axis = 0)
    q3 = result_df.quantile(0.75, axis = 1)
    q2 = result_df.quantile(0.5, axis = 1)
    q1 = result_df.quantile(0.25, axis = 1)
    iqr = q3 -q1
    iqr.sort_values(ascending = False)
    project_names = q2[q2 > q2.quantile(0.25)].index
    return project_names

In [10]:
q2.quantile(0.5)

0.485

# Get all projects all models

In [70]:
new_dict = {}
for metric in results.keys():
    new_dict[metric] = {}
    for model in results[metric].keys():
        new_dict[metric][model] = np.nanmedian(results[metric][model])
new_df = pd.DataFrame.from_dict(new_dict, orient = 'columns')
new_df.reset_index(inplace = True)
new_df.columns = ['model', 'f1', 'precision', 
                  'recall', 'g-score', 'd2h', 
                  'pci_20', 'ifa','pd', 'pf']

In [71]:
goals = ['recall','precision','pf']
dom_score = []
for row_id in range(new_df.shape[0]):
    model_name = new_df.iloc[row_id].model
    row = new_df.iloc[row_id].tolist()
    wins = dominate(new_df,row,model_name,goals)
    dom_score.append(wins)
new_df['wins'] = dom_score

In [72]:
new_df.sort_values('wins', ascending=False)

Unnamed: 0,model,f1,precision,recall,g-score,d2h,pci_20,ifa,pd,pf,wins
14,Semi_GMM,0.485,0.66,0.51,0.25,0.88,0.71,0.0,0.51,0.07,53
49,boosting_RF,0.5,0.48,0.565,0.345,0.79,0.74,0.0,0.565,0.12,52
47,boosting_LR,0.47,0.44,0.565,0.2275,0.95,0.71,0.0,0.565,0.12,51
51,boosting_SVM,0.47,0.445,0.57,0.2775,0.925,0.7075,0.0,0.57,0.15,50
52,boosting_KNN,0.48,0.43,0.635,0.31,0.8925,0.71,0.0,0.635,0.36,49
39,co_training_mv_RF_GNB,0.39,0.35,0.5,0.025,1.0,0.62,0.0,0.5,0.0925,48
9,self_training_GNB,0.39,0.33,0.535,0.0,1.0,0.61,0.0,0.535,0.17,47
17,co_training_sv_LR_GNB,0.37,0.38,0.47,0.01,1.0,0.63,0.0,0.47,0.08,46
43,co_training_mv_GNB_KNN,0.44,0.43,0.56,0.17,0.965,0.71,0.0,0.56,0.2925,45
27,co_training_sv_GNB_SVM,0.36,0.38,0.47,0.01,1.0,0.62,0.0,0.47,0.095,44


In [47]:
final_result = new_df.sort_values('wins', ascending=False)
final_result.to_csv('ranks_2_5_new.csv')

# Get Specific Models

In [177]:
# models = ['LR', 'DT', 'RF', 'GNB', 'SVM', 'KNN',
#           'self_training_SVM', 
#           'LabelPropagation', 'LabelSpreading', 
#           'Semi_GMM', 'co_training_sv_LR_SVM',
#           'co_training_mv_LR_SVM', 'EATT', 
#           'coForest', 'boosting_KNN', 'S3VM']
models = ['co_training_sv_RF_SVM','SVM','EATT',
         'co_training_mv_RF_SVM','self_training_SVM','RF',
         'LR','LabelPropagation','LabelSpreading','boosting_SVM',
         'GNB','KNN','DT','S3VM','Semi_GMM']
results = get_stats_selected(existing_projects, models, existing_data_source, project_names)
write_stats_file(results)

# Get One Model

In [182]:
model_name = ['co_training_mv','co_training_sv']
results = get_stats_one(existing_projects, model_name, existing_data_source, project_names)
write_stats_file(results)

# Project Specific

In [7]:
selected_models = ['LR']#, 'DT', 'RF', 'GNB', 'SVM', 'KNN']
models = ['LR', 'self_training_LR', 
          'LabelPropagation', 'LabelSpreading', 
          'Semi_GMM', 'co_training_sv_LR_SVM',
          'co_training_mv_LR_SVM', 'EATT', 
          'coForest', 'boosting_KNN', 'S3VM','DT', 'RF', 'GNB', 'SVM', 'KNN']
metric = 'f1'
lower_projects = get_lower_performing_projects(existing_projects, metric, selected_models)
results = get_stats_lower_projects(existing_projects, lower_projects, models, project_names)
write_stats_file(results)

graphicsfuzz.pkl name 'results' is not defined
xmrwallet.pkl name 'results' is not defined
GroupedRecyclerViewAdapter.pkl name 'results' is not defined
WaniKani-for-Android.pkl name 'results' is not defined
cm_api.pkl name 'results' is not defined
SimpleCropView.pkl name 'results' is not defined
jenkins-gitlab-merge-request-builder-plugin.pkl name 'results' is not defined
owner.pkl name 'results' is not defined
streamex.pkl name 'results' is not defined
SimianArmy.pkl name 'results' is not defined
DebugDrawer.pkl name 'results' is not defined
AndroidPicker.pkl name 'results' is not defined
android-kline.pkl name 'results' is not defined
AppiumTestDistribution.pkl name 'results' is not defined
LicensesDialog.pkl name 'results' is not defined
Android-SlideExpandableListView.pkl name 'results' is not defined
ActionBarSherlock.pkl name 'results' is not defined
flinkStreamSQL.pkl name 'results' is not defined
Discord4J.pkl name 'results' is not defined
DragonProxy.pkl name 'results' is not 

failsafe.pkl name 'results' is not defined
SelectorInjection.pkl name 'results' is not defined
Android-ReactiveLocation.pkl name 'results' is not defined
programming.pkl name 'results' is not defined
facebook-java-business-sdk.pkl name 'results' is not defined
spring-cloud-zuul-ratelimit.pkl name 'results' is not defined
s3proxy.pkl name 'results' is not defined
dynamic-load-apk.pkl name 'results' is not defined
AndroidTraining.pkl name 'results' is not defined
FizzBuzzEnterpriseEdition.pkl name 'results' is not defined
extentreports-java.pkl name 'results' is not defined
seata-samples.pkl name 'results' is not defined
cordova-plugin-googleplus.pkl name 'results' is not defined
RxActivityResult.pkl name 'results' is not defined
TextFieldBoxes.pkl name 'results' is not defined
android-flowlayout.pkl name 'results' is not defined
XmlToJson.pkl name 'results' is not defined
CtCI-6th-Edition.pkl name 'results' is not defined
BaseRecyclerViewAdapterHelper.pkl name 'results' is not defined
t

TypeError: get_stats_lower_projects() missing 1 required positional argument: 'data_source'

# Multiple Sizes

In [8]:
models = ['LR', 'DT', 'GNB', 'SVM', 'KNN',
          'self_training_SVM', 
          'LabelPropagation', 'LabelSpreading', 
          'Semi_GMM', 'co_training_sv_LR_SVM',
          'co_training_mv_LR_SVM', 'EATT', 
          'coForest', 'boosting_KNN', 'S3VM']
results = get_stats_selected_varying_size(existing_projects, models, existing_data_source, project_names)
write_stats_file(results)