In [33]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocess
import os
warnings.filterwarnings("ignore")

In [34]:
project_list = ['rails.csv', 'gradle.csv', 'jruby.csv', 'metasploit-framework.csv', 'cloudify.csv', 'vagrant.csv', 'rubinius.csv', 'open-build-service.csv', 'sonarqube.csv', 'loomio.csv', 'fog.csv', 'opal.csv', 'cloud_controller_ng.csv', 'puppet.csv', 'concerto.csv', 'sufia.csv', 'geoserver.csv', 'orbeon-forms.csv', 'graylog2-server.csv']


In [35]:
fi_dict = {}

In [36]:
for p in project_list:
    p_name = p.split('.')[0]
    fi_dict[p] = {}
    path = 'dump_data/' + p_name + '_models/'
    contents = os.listdir(path)
    
    for f in contents:
        model_file = path + f
        end_p = int(f.split('.csv')[1].split('_')[1])
        print(f, end_p)
        with open(model_file, 'rb') as file:
            model = pickle.load(file)
        
        #print(p, ver)
        
        
        important_features_dict = {}
        for idx, val in enumerate(model.feature_importances_):
            important_features_dict[idx] = val
        
        important_features_list = sorted(important_features_dict,
                                 key=important_features_dict.get,
                                 reverse=True)
        
        #print(important_features_list)
        
        fi_dict[p][end_p] = {}
        fi_dict[p][end_p]['list'] = important_features_list
        fi_dict[p][end_p]['value'] = important_features_dict

rq2_rails.csv_8478_best_model.pkl 8478
rq2_rails.csv_12717_best_model.pkl 12717
rq2_rails.csv_14130_best_model.pkl 14130
rq2_rails.csv_4239_best_model.pkl 4239
rq2_rails.csv_5652_best_model.pkl 5652
rq2_rails.csv_9891_best_model.pkl 9891
rq2_rails.csv_11304_best_model.pkl 11304
rq2_rails.csv_7065_best_model.pkl 7065
rq2_rails.csv_2826_best_model.pkl 2826
rq2_gradle.csv_2493_best_model.pkl 2493
rq2_gradle.csv_3561_best_model.pkl 3561
rq2_gradle.csv_1069_best_model.pkl 1069
rq2_gradle.csv_2849_best_model.pkl 2849
rq2_gradle.csv_1781_best_model.pkl 1781
rq2_gradle.csv_713_best_model.pkl 713
rq2_gradle.csv_1425_best_model.pkl 1425
rq2_gradle.csv_2137_best_model.pkl 2137
rq2_gradle.csv_3205_best_model.pkl 3205
rq2_jruby.csv_8271_best_model.pkl 8271
rq2_jruby.csv_1655_best_model.pkl 1655
rq2_jruby.csv_2482_best_model.pkl 2482
rq2_jruby.csv_4136_best_model.pkl 4136
rq2_jruby.csv_3309_best_model.pkl 3309
rq2_jruby.csv_7444_best_model.pkl 7444
rq2_jruby.csv_6617_best_model.pkl 6617
rq2_jruby.cs

In [28]:
fi_dict

{'rails.csv': {8478: {'list': [8, 7, 0, 2, 5, 6, 1, 3, 4],
   'value': {0: 0.12644561961041068,
    1: 0.016702082369563796,
    2: 0.09805919835648268,
    3: 0.009024229978987582,
    4: 0.0024918670257588283,
    5: 0.051998380890863456,
    6: 0.03203068060830203,
    7: 0.24171957601278887,
    8: 0.4215283651468421}},
  12717: {'list': [7, 8, 0, 2, 5, 6, 3, 1, 4],
   'value': {0: 0.15761852883559888,
    1: 0.021387605329666413,
    2: 0.1018481808385284,
    3: 0.02491511597545472,
    4: 0.007218732421967146,
    5: 0.06954808145917532,
    6: 0.03404893299980024,
    7: 0.3095682120975595,
    8: 0.2738466100422493}},
  14130: {'list': [7, 8, 0, 2, 5, 6, 1, 3, 4],
   'value': {0: 0.1718201044299063,
    1: 0.018104388781072272,
    2: 0.12142344094898505,
    3: 0.012683369715696181,
    4: 0.00623214148239064,
    5: 0.0767590689122818,
    6: 0.02548892700531606,
    7: 0.34494784451658156,
    8: 0.22254071420777005}},
  4239: {'list': [8, 7, 0, 2, 5, 6, 3, 4, 1],
   'value

In [25]:
feature_names = ['git_diff_src_churn', 'gh_num_commit_comments', 'git_diff_test_churn', 'gh_diff_files_added', 'gh_diff_files_deleted', 'gh_diff_files_modified', 'git_num_all_built_commits', 'gh_num_commits_on_files_touched', 'time_out']
combined_for_project = {}
values_for_projects = {}


In [37]:
for p in project_list:
    data = fi_dict[p]
    flist = [[], [], [], [], [], [], [], [], []]
    vals = [[], [], [], [], [], [], [], [], []]
    fi = 0
    
    for ver in data:
        for x in range(0,9):
            flist[x].append(data[ver]['list'][x])
            
    for ver in data:  
        for x in range(0,9):
            vals[x].append(data[ver]['value'][x])
    
    final = [0,0,0,0,0,0,0,0,0]
    final_value = [0,0,0,0,0,0,0,0,0]
    
    for i in range(0, 9):
        final[i] = max(set(flist[i]), key = flist[i].count)
    
    for i in final:
        final_value[i] = median(vals[i])
    
    values_for_projects[p] = final_value
    combined_for_project[p] = final

In [38]:
combined_for_project

{'rails.csv': [8, 7, 0, 2, 5, 6, 1, 3, 4],
 'gradle.csv': [8, 0, 0, 5, 6, 3, 3, 4, 1],
 'jruby.csv': [8, 7, 0, 5, 2, 2, 3, 4, 1],
 'metasploit-framework.csv': [7, 8, 0, 5, 3, 6, 2, 4, 1],
 'cloudify.csv': [8, 7, 0, 5, 2, 6, 3, 4, 1],
 'vagrant.csv': [8, 7, 0, 2, 5, 6, 3, 1, 4],
 'rubinius.csv': [7, 0, 8, 2, 2, 6, 3, 4, 1],
 'open-build-service.csv': [7, 8, 0, 5, 5, 6, 3, 4, 1],
 'sonarqube.csv': [8, 7, 0, 2, 5, 3, 3, 4, 1],
 'loomio.csv': [7, 8, 0, 5, 2, 3, 2, 4, 1],
 'fog.csv': [8, 7, 0, 5, 2, 3, 6, 4, 1],
 'opal.csv': [7, 8, 0, 5, 2, 5, 6, 1, 1],
 'cloud_controller_ng.csv': [7, 8, 0, 0, 5, 3, 6, 4, 1],
 'puppet.csv': [8, 7, 2, 0, 5, 6, 3, 4, 1],
 'concerto.csv': [8, 7, 0, 5, 2, 2, 4, 3, 1],
 'sufia.csv': [8, 7, 0, 2, 5, 3, 4, 6, 1],
 'geoserver.csv': [7, 8, 0, 2, 5, 3, 6, 4, 1],
 'orbeon-forms.csv': [8, 7, 0, 5, 6, 3, 4, 2, 1],
 'graylog2-server.csv': [8, 7, 0, 5, 6, 2, 3, 1, 4]}

In [31]:
#after manual interference
combined_for_project['vagrant.csv'][-1] = 1
combined_for_project['vagrant.csv'][-2] = 4
combined_for_project['graylog2-server.csv'][2] = 0
combined_for_project['graylog2-server.csv'][-1] = 1

In [39]:
final_fi = [0, 0, 0, 0, 0, 0, 0, 0, 0]

In [40]:
flist = [[], [], [], [], [], [], [], [], []]
for p in project_list:
    data = combined_for_project[p]
    for x in range(0,9):
        flist[x].append(data[x])

In [41]:
for i in range(0, 9):
    final_fi[i] = max(set(flist[i]), key = flist[i].count)

In [42]:
#after manual interference
final_fi[4] = 2

In [43]:
final_fi

[8, 7, 0, 5, 2, 3, 3, 4, 1]

In [44]:
final_fi_values = [0, 0, 0, 0, 0, 0, 0, 0, 0]
fvalue = [[], [], [], [], [], [], [], [], []]

for p in values_for_projects:
    for x in range(0,9):
        fvalue[x].append(values_for_projects[p][x])

for i in range(0, 9):
    final_fi_values[i] = median(fvalue[i])

In [45]:
for i in range(0, 9):
    print('{:>5} {:>40} {:>30}'.format(final_fi[i], feature_names[final_fi[i]], final_fi_values[final_fi[i]]))

    8                                 time_out             0.2586862255345642
    7          gh_num_commits_on_files_touched            0.25235010333078517
    0                       git_diff_src_churn            0.16524873461422038
    5                   gh_diff_files_modified            0.08991025041433495
    2                      git_diff_test_churn            0.06946100505509234
    3                      gh_diff_files_added            0.03668756095402584
    3                      gh_diff_files_added            0.03668756095402584
    4                    gh_diff_files_deleted            0.01367876028697946
    1                   gh_num_commit_comments           0.001598304267195297
