In [1]:
import pandas as pd
import numpy as np
import math
import pickle

from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
from scipy.io import loadmat

import matlab.engine as engi
import matlab as mat

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from pyearth import Earth

from src import SMOTE
from src import CFS
from src import metrices_V2 as metrices

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

import matplotlib.pyplot as plt

In [6]:
def load_data(project):
    """Build the scaled feature matrix and bug labels for one project.

    Reads ``data/understand_files_all/<project>_understand.csv`` and
    ``data/commit_guru/<project>.csv``, joins them on ``commit_hash``, drops
    the identifier columns and any row containing a missing value, writes the
    merged table to ``data/converted/<project>_understand.csv`` and min-max
    scales the remaining feature columns.

    Args:
        project: Project name used to build the input and output file paths.

    Returns:
        tuple: ``(X, y)`` where ``X`` is a DataFrame of scaled features and
        ``y`` is the ``Bugs`` column of the merged table.

    Raises:
        Whatever ``pd.read_csv`` raises for a missing or empty file, and
        ``KeyError`` if an expected commit-guru column is absent.
    """
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    commit_guru_path = 'data/commit_guru/' + project + '.csv'
    id_cols = ['Kind', 'Name', 'commit_hash', 'Bugs']

    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis=1, how='all')

    # Move the identifier/label columns to the front (resulting order:
    # Bugs, commit_hash, Name, Kind, <metrics...>).
    cols_list = understand_df.columns.values.tolist()
    for item in id_cols:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0, item)
    understand_df = understand_df[cols_list]

    commit_guru_df = pd.read_csv(commit_guru_path)
    # Commit-guru columns that are not used as features.
    commit_guru_df = commit_guru_df.drop(
        labels=['parent_hashes', 'author_name', 'author_email', 'fileschanged',
                'author_date', 'author_date_unix_timestamp', 'commit_message',
                'classification', 'fix', 'contains_bug', 'fixes'],
        axis=1)

    # De-duplicate on the metric columns only. Selecting them by name (rather
    # than by the fixed offset 4) keeps the key correct when one of the
    # identifier columns is absent from the file.
    metric_cols = [col for col in cols_list if col not in id_cols]
    understand_df = understand_df.drop_duplicates(metric_cols)

    df = understand_df.merge(commit_guru_df, on='commit_hash')

    # Rotate the first column (the label, given the ordering above) to the end.
    cols = df.columns.tolist()
    cols = cols[1:] + [cols[0]]
    df = df[cols]

    for item in ['Kind', 'Name', 'commit_hash']:
        if item in cols:
            df = df.drop(labels=[item], axis=1)
    df = df.dropna()
    df = df.reset_index(drop=True)
    df.to_csv('data/converted/' + project + '_understand.csv', index=False)

    y = df.Bugs
    X = df.drop('Bugs', axis=1)
    feature_names = X.columns
    # NOTE(review): the scaler is fitted on the whole table, i.e. before any
    # train/test split performed by callers.
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)
    return X, y

def apply_smote(df):
    """Run the project's ``SMOTE.smote`` on ``df`` and restore its column labels.

    Args:
        df: DataFrame handed to ``SMOTE.smote``.

    Returns:
        The frame produced by ``SMOTE.smote(df).run()``, relabelled with the
        column labels of the input frame.
    """
    original_columns = df.columns
    resampled = SMOTE.smote(df).run()
    # The resampler's output is relabelled so downstream code can keep
    # addressing columns (e.g. ``Bugs``) by name.
    resampled.columns = original_columns
    return resampled

def apply_cfs(df):
    """Keep only the feature columns chosen by ``CFS.cfs``, plus ``Bugs``.

    Args:
        df: DataFrame containing a ``Bugs`` label column and feature columns.

    Returns:
        tuple: ``(reduced_df, cols)`` where ``cols`` is the list of selected
        feature names followed by ``'Bugs'`` and ``reduced_df`` is ``df[cols]``.
    """
    y = df.Bugs.values
    features = df.drop(labels=['Bugs'], axis=1)
    selected_cols = CFS.cfs(features.values, y)
    # Flatten to a 1-D integer indexer: wrapping the indices in an extra list
    # (``columns[[selected_cols]]``) is multi-dimensional indexing, which
    # current numpy/pandas no longer accept.
    selected_idx = np.asarray(selected_cols).ravel().astype(int)
    # The indices refer to positions in the feature matrix passed to CFS, so
    # they are resolved against the feature columns rather than ``df.columns``
    # (identical when ``Bugs`` is the last column).
    cols = features.columns[selected_idx].tolist()
    cols.append('Bugs')
    return df[cols], cols

In [7]:
def run_self(project):
    """Train and evaluate a within-project random-forest model on all features.

    Loads the project's data, holds out 40% of the rows for testing, resamples
    the training rows with ``apply_smote``, fits a ``RandomForestClassifier``
    and scores its predictions with ``metrices.measures``. The scikit-learn
    classification report is printed as a side effect.

    Args:
        project: Project name forwarded to ``load_data``.

    Returns:
        tuple: ``(recall, precision, pf, f1, g_score, auc, pci_20, ifa,
        importance)`` where ``importance`` is the fitted forest's
        ``feature_importances_``.
    """
    X, y = load_data(project)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=18)
    # Lines of code of the held-out rows, handed to the metrics helper.
    loc = X_test.CountLineCode

    # Only the training portion is resampled; the test rows are left as-is.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    predicted = clf.predict(X_test)

    abcd = metrices.measures(y_test, predicted, loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()

    # AUC is computed from the hard class predictions. roc_auc_score raises
    # ValueError when it cannot be computed (e.g. a single class in y_test);
    # that case is recorded as 0. Only ValueError is caught so interrupts and
    # unrelated failures are not silently swallowed.
    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        auc = 0
    print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance

In [8]:
def run_self_CFS(project):
    """Train and evaluate a within-project random-forest model on CFS features.

    Loads the project's data, holds out 40% of the rows for testing, resamples
    the training rows with ``apply_smote``, reduces them to the columns chosen
    by ``apply_cfs``, fits a ``RandomForestClassifier`` on those columns and
    scores its predictions with ``metrices.measures``. The scikit-learn
    classification report is printed as a side effect.

    Args:
        project: Project name forwarded to ``load_data``.

    Returns:
        tuple: ``(recall, precision, pf, f1, g_score, auc, pci_20, ifa,
        importance)`` where ``importance`` is the fitted forest's
        ``feature_importances_`` (one entry per selected feature).
    """
    X, y = load_data(project)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=18)
    # Lines of code of the held-out rows, handed to the metrics helper.
    loc = X_test.CountLineCode

    # Resampling and feature selection use the training portion only.
    df_smote = pd.concat([X_train, y_train], axis=1)
    df_smote = apply_smote(df_smote)
    df_smote, cols = apply_cfs(df_smote)
    y_train = df_smote.Bugs
    X_train = df_smote.drop('Bugs', axis=1)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    importance = clf.feature_importances_
    # ``cols`` ends with 'Bugs'; everything before it is a selected feature.
    predicted = clf.predict(X_test[cols[:-1]])

    abcd = metrices.measures(y_test, predicted, loc)
    pf = abcd.get_pf()
    recall = abcd.calculate_recall()
    precision = abcd.calculate_precision()
    f1 = abcd.calculate_f1_score()
    g_score = abcd.get_g_score()
    pci_20 = abcd.get_pci_20()
    ifa = abcd.get_ifa()

    # AUC is computed from the hard class predictions. roc_auc_score raises
    # ValueError when it cannot be computed (e.g. a single class in y_test);
    # that case is recorded as 0. Only ValueError is caught so interrupts and
    # unrelated failures are not silently swallowed.
    try:
        auc = roc_auc_score(y_test, predicted)
    except ValueError:
        auc = 0
    print(classification_report(y_test, predicted))
    return recall, precision, pf, f1, g_score, auc, pci_20, ifa, importance

In [9]:
# Names of the projects to evaluate, taken from the ``repo_name`` column.
proj_df = pd.read_csv('projects.csv')
projects = proj_df['repo_name'].tolist()

In [None]:
# Per-project results, each keyed by project name.
precision_list = {}
recall_list = {}
pf_list = {}
f1_list = {}
g_list = {}
auc_list = {}
pci_20_list = {}
ifa_list = {}
featue_importance = {}  # (sic) kept in memory only; not written to the pickle

results_path = 'results/Performance/process+product_CFS.pkl'
# Create the output directory before the long-running loop so a missing
# folder cannot discard every computed result at the final dump.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

for project in projects:
    try:
        if project == '.DS_Store':
            continue
        print("+++++++++++++++++   "  + project + "  +++++++++++++++++")
        recall,precision,pf,f1,g_score,auc,pci_20,ifa,importance = run_self_CFS(project)
    except Exception as e:
        # Best effort: a project that fails is reported and skipped so the
        # remaining projects are still evaluated.
        print(e)
        continue
    else:
        recall_list[project] = recall
        precision_list[project] = precision
        pf_list[project] = pf
        f1_list[project] = f1
        g_list[project] = g_score
        auc_list[project] = auc
        pci_20_list[project] = pci_20
        ifa_list[project] = ifa
        featue_importance[project] = importance

# Metric name -> {project -> value}; key order matches the original layout.
final_result = {
    'precision': precision_list,
    'recall': recall_list,
    'pf': pf_list,
    'f1': f1_list,
    'g': g_list,
    'auc': auc_list,
    'pci_20': pci_20_list,
    'ifa': ifa_list,
}
with open(results_path, 'wb') as handle:
    pickle.dump(final_result, handle, protocol=pickle.HIGHEST_PROTOCOL)

+++++++++++++++++   org.alloytools.alloy  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       138
           1       0.94      0.91      0.93       168

    accuracy                           0.92       306
   macro avg       0.92      0.92      0.92       306
weighted avg       0.92      0.92      0.92       306

+++++++++++++++++   qpython  +++++++++++++++++
float division by zero
+++++++++++++++++   friendlychat-android  +++++++++++++++++
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         7

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

+++++++++++++++++   paho.mqtt.android  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.95      0.95      0.95 

              precision    recall  f1-score   support

           0       1.00      0.90      0.95        30
           1       0.40      1.00      0.57         2

    accuracy                           0.91        32
   macro avg       0.70      0.95      0.76        32
weighted avg       0.96      0.91      0.92        32

+++++++++++++++++   amazon-kinesis-connectors  +++++++++++++++++
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00        30

    accuracy                           1.00        33
   macro avg       1.00      1.00      1.00        33
weighted avg       1.00      1.00      1.00        33

+++++++++++++++++   Android-ExpandIcon  +++++++++++++++++
index 1 is out of bounds for axis 0 with size 1
+++++++++++++++++   MyLittleCanvas  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.93      0.96      0.95        28
           1

              precision    recall  f1-score   support

           0       0.77      0.83      0.80        12
           1       0.33      0.25      0.29         4

    accuracy                           0.69        16
   macro avg       0.55      0.54      0.54        16
weighted avg       0.66      0.69      0.67        16

+++++++++++++++++   restcountries  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4

+++++++++++++++++   DataFixerUpper  +++++++++++++++++
index 1 is out of bounds for axis 0 with size 1
+++++++++++++++++   apk-dependency-graph  +++++++++++++++++
index 1 is out of bounds for axis 0 with size 1
+++++++++++++++++   PDFLayoutTextStripper  +++++++++++++++++
index 1 is out of boun

              precision    recall  f1-score   support

           0       0.83      0.71      0.77         7
           1       0.75      0.86      0.80         7

    accuracy                           0.79        14
   macro avg       0.79      0.79      0.78        14
weighted avg       0.79      0.79      0.78        14

+++++++++++++++++   SHSegmentControl  +++++++++++++++++
              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.40      1.00      0.57         2

    accuracy                           0.50         6
   macro avg       0.70      0.62      0.49         6
weighted avg       0.80      0.50      0.46         6

+++++++++++++++++   MaterialAbout  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.54      0.70      0.61        10
           1       0.50      0.33      0.40         9

    accuracy                           0.53        19
   macro avg       

              precision    recall  f1-score   support

           0       0.40      0.29      0.33         7
           1       0.44      0.57      0.50         7

    accuracy                           0.43        14
   macro avg       0.42      0.43      0.42        14
weighted avg       0.42      0.43      0.42        14

+++++++++++++++++   spring-cloud-kubernetes  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.74      0.70      0.72        20
           1       0.50      0.55      0.52        11

    accuracy                           0.65        31
   macro avg       0.62      0.62      0.62        31
weighted avg       0.65      0.65      0.65        31

+++++++++++++++++   cordova-imagePicker  +++++++++++++++++
index 1 is out of bounds for axis 0 with size 1
+++++++++++++++++   Android-SlideExpandableListView  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       

              precision    recall  f1-score   support

           0       0.81      0.78      0.79       114
           1       0.68      0.71      0.69        73

    accuracy                           0.75       187
   macro avg       0.74      0.75      0.74       187
weighted avg       0.76      0.75      0.76       187

+++++++++++++++++   aws-apigateway-importer  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.86      0.92      0.89        13

    accuracy                           0.83        18
   macro avg       0.80      0.76      0.78        18
weighted avg       0.83      0.83      0.83        18

+++++++++++++++++   Long-Shadows  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.60      0.46      0.52        13
           1       0.50      0.64      0.56        11

    accuracy                           0.54        24
   macro avg 

              precision    recall  f1-score   support

           0       0.65      0.65      0.65        17
           1       0.54      0.54      0.54        13

    accuracy                           0.60        30
   macro avg       0.59      0.59      0.59        30
weighted avg       0.60      0.60      0.60        30

+++++++++++++++++   smart-show  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.89      0.90      0.89        98
           1       0.23      0.21      0.22        14

    accuracy                           0.81       112
   macro avg       0.56      0.56      0.56       112
weighted avg       0.81      0.81      0.81       112

+++++++++++++++++   PatternLock  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        18
           1       0.86      0.86      0.86        21

    accuracy                           0.85        39
   macro avg       0.85    

              precision    recall  f1-score   support

           0       0.88      0.90      0.89        70
           1       0.68      0.62      0.65        24

    accuracy                           0.83        94
   macro avg       0.78      0.76      0.77        94
weighted avg       0.83      0.83      0.83        94

+++++++++++++++++   android-app  +++++++++++++++++
No columns to parse from file
+++++++++++++++++   Android-RateThisApp  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.64      0.90      0.75        10
           1       0.00      0.00      0.00         5

    accuracy                           0.60        15
   macro avg       0.32      0.45      0.38        15
weighted avg       0.43      0.60      0.50        15

+++++++++++++++++   RESTMock  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        32
           1       1.00      0.56      0.72      

              precision    recall  f1-score   support

           0       0.95      0.77      0.85        78
           1       0.67      0.93      0.78        40

    accuracy                           0.82       118
   macro avg       0.81      0.85      0.82       118
weighted avg       0.86      0.82      0.83       118

+++++++++++++++++   HomeMirror  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.79      0.76      0.77        29
           1       0.42      0.45      0.43        11

    accuracy                           0.68        40
   macro avg       0.60      0.61      0.60        40
weighted avg       0.68      0.68      0.68        40

+++++++++++++++++   emojicon  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.61      0.93      0.74        15
           1       0.86      0.40      0.55        15

    accuracy                           0.67        30
   macro avg       0.73      0

              precision    recall  f1-score   support

           0       0.71      0.67      0.69        18
           1       0.50      0.55      0.52        11

    accuracy                           0.62        29
   macro avg       0.60      0.61      0.60        29
weighted avg       0.63      0.62      0.62        29

+++++++++++++++++   Space-Navigation-View  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.78      0.67      0.72        21
           1       0.65      0.76      0.70        17

    accuracy                           0.71        38
   macro avg       0.71      0.72      0.71        38
weighted avg       0.72      0.71      0.71        38

+++++++++++++++++   mqtt-client  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.98      0.81      0.89        77
           1       0.61      0.96      0.74        24

    accuracy                           0.84       101
   macro avg    

              precision    recall  f1-score   support

           0       0.73      0.83      0.77        29
           1       0.79      0.68      0.73        28

    accuracy                           0.75        57
   macro avg       0.76      0.75      0.75        57
weighted avg       0.76      0.75      0.75        57

+++++++++++++++++   Qigsaw  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.76      0.70      0.73        93
           1       0.74      0.79      0.77       102

    accuracy                           0.75       195
   macro avg       0.75      0.75      0.75       195
weighted avg       0.75      0.75      0.75       195

+++++++++++++++++   SuperAdapter  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.77      0.89      0.82        37
           1       0.50      0.29      0.36        14

    accuracy                           0.73        51
   macro avg       0.63      0

              precision    recall  f1-score   support

           0       0.71      0.69      0.70        32
           1       0.60      0.62      0.61        24

    accuracy                           0.66        56
   macro avg       0.65      0.66      0.66        56
weighted avg       0.66      0.66      0.66        56

+++++++++++++++++   tut-spring-boot-oauth2  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         3
           1       1.00      0.73      0.84        11

    accuracy                           0.79        14
   macro avg       0.75      0.86      0.75        14
weighted avg       0.89      0.79      0.80        14

+++++++++++++++++   android-viewflow  +++++++++++++++++
              precision    recall  f1-score   support

           0       1.00      0.25      0.40        12
           1       0.57      1.00      0.73        12

    accuracy                           0.62        24
   macro a

              precision    recall  f1-score   support

           0       0.91      0.76      0.83        51
           1       0.71      0.88      0.79        34

    accuracy                           0.81        85
   macro avg       0.81      0.82      0.81        85
weighted avg       0.83      0.81      0.81        85

+++++++++++++++++   AndroidTraining  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       273
           1       0.43      0.75      0.55        12

    accuracy                           0.95       285
   macro avg       0.71      0.85      0.76       285
weighted avg       0.97      0.95      0.95       285

+++++++++++++++++   browsermob-proxy  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.93      0.98      0.96       252
           1       0.62      0.31      0.41        26

    accuracy                           0.92       278
   macro avg     

              precision    recall  f1-score   support

           0       0.73      0.73      0.73        67
           1       0.84      0.84      0.84       110

    accuracy                           0.80       177
   macro avg       0.78      0.78      0.78       177
weighted avg       0.80      0.80      0.80       177

+++++++++++++++++   RxJavaFX  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.79      0.87      0.83        47
           1       0.57      0.42      0.48        19

    accuracy                           0.74        66
   macro avg       0.68      0.65      0.66        66
weighted avg       0.73      0.74      0.73        66

+++++++++++++++++   google-oauth-java-client  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.74      0.84      0.78       106
           1       0.74      0.60      0.67        81

    accuracy                           0.74       187
   macro avg    

              precision    recall  f1-score   support

           0       0.83      0.85      0.84       190
           1       0.88      0.86      0.87       238

    accuracy                           0.86       428
   macro avg       0.85      0.85      0.85       428
weighted avg       0.86      0.86      0.86       428

+++++++++++++++++   Paper  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.73      0.67      0.70        33
           1       0.73      0.79      0.76        38

    accuracy                           0.73        71
   macro avg       0.73      0.73      0.73        71
weighted avg       0.73      0.73      0.73        71

+++++++++++++++++   OpenNoteScanner  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.76      0.78      0.77        50
           1       0.61      0.59      0.60        29

    accuracy                           0.71        79
   macro avg       0.69     

              precision    recall  f1-score   support

           0       0.72      0.81      0.76        16
           1       0.77      0.67      0.71        15

    accuracy                           0.74        31
   macro avg       0.75      0.74      0.74        31
weighted avg       0.74      0.74      0.74        31

+++++++++++++++++   UsbSerial  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.84      0.87      0.86        99
           1       0.54      0.48      0.51        31

    accuracy                           0.78       130
   macro avg       0.69      0.68      0.68       130
weighted avg       0.77      0.78      0.77       130

+++++++++++++++++   cordova-crosswalk-engine  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        40
           1       0.40      0.67      0.50         3

    accuracy                           0.91        43
   macro avg   

              precision    recall  f1-score   support

           0       0.66      0.71      0.69        69
           1       0.72      0.68      0.70        77

    accuracy                           0.69       146
   macro avg       0.69      0.69      0.69       146
weighted avg       0.69      0.69      0.69       146

+++++++++++++++++   AndroidJSCore  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.90      0.96      0.93       224
           1       0.77      0.60      0.67        57

    accuracy                           0.88       281
   macro avg       0.84      0.78      0.80       281
weighted avg       0.88      0.88      0.88       281

+++++++++++++++++   LicensesDialog  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.68      0.88      0.77        34
           1       0.73      0.44      0.55        25

    accuracy                           0.69        59
   macro avg       0.

              precision    recall  f1-score   support

           0       0.70      0.70      0.70        46
           1       0.71      0.71      0.71        49

    accuracy                           0.71        95
   macro avg       0.70      0.70      0.70        95
weighted avg       0.71      0.71      0.71        95

+++++++++++++++++   neurolab-android  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.75      0.88      0.81        90
           1       0.35      0.19      0.24        32

    accuracy                           0.70       122
   macro avg       0.55      0.53      0.53       122
weighted avg       0.65      0.70      0.66       122

+++++++++++++++++   soul  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       263
           1       0.59      0.60      0.59        94

    accuracy                           0.78       357
   macro avg       0.72     

              precision    recall  f1-score   support

           0       0.68      0.74      0.71        78
           1       0.51      0.44      0.47        48

    accuracy                           0.63       126
   macro avg       0.60      0.59      0.59       126
weighted avg       0.62      0.63      0.62       126

+++++++++++++++++   js-graphql-intellij-plugin  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.89      0.86      0.88       114
           1       0.88      0.90      0.89       126

    accuracy                           0.88       240
   macro avg       0.88      0.88      0.88       240
weighted avg       0.88      0.88      0.88       240

+++++++++++++++++   merlin  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.78      0.94      0.85       132
           1       0.71      0.36      0.48        56

    accuracy                           0.77       188
   macro avg    

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       110
           1       0.73      0.82      0.77        33

    accuracy                           0.89       143
   macro avg       0.84      0.86      0.85       143
weighted avg       0.89      0.89      0.89       143

+++++++++++++++++   flutter_webview_plugin  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.80      0.57      0.67        14
           1       0.54      0.78      0.64         9

    accuracy                           0.65        23
   macro avg       0.67      0.67      0.65        23
weighted avg       0.70      0.65      0.65        23

+++++++++++++++++   android-basic-samples  +++++++++++++++++
              precision    recall  f1-score   support

           0       0.79      0.77      0.78        65
           1       0.62      0.65      0.63        37

    accuracy                           0.73       102
   ma