In [3]:
import pandas as pd
import numpy as np
import math
import pickle

from scipy import stats
import scipy.io
from scipy.spatial.distance import pdist
from scipy.linalg import cholesky
from scipy.io import loadmat

import matlab.engine as engi
import matlab as mat

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,roc_auc_score,recall_score,precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from pyearth import Earth

from src import SMOTE
from src import CFS
from src import metrices_V2 as metrices

import platform
from os import listdir
from os.path import isfile, join
from glob import glob
from pathlib import Path
import sys
import os
import copy
import traceback
from pathlib import Path

import matplotlib.pyplot as plt

# Data Load and other util function

In [12]:
def load_data(project):
    understand_path = 'data/understand_files_all/' + project + '_understand.csv'
    commit_guru_path = 'data/commit_guru/' + project + '.csv'
    understand_df = pd.read_csv(understand_path)
    understand_df = understand_df.dropna(axis = 1,how='all')
    cols_list = understand_df.columns.values.tolist()
    for item in ['Kind', 'Name','commit_hash', 'Bugs']:
        if item in cols_list:
            cols_list.remove(item)
            cols_list.insert(0,item)
    understand_df = understand_df[cols_list]
    commit_guru_df = pd.read_csv(commit_guru_path)
    cols = understand_df.columns.tolist()
    
    commit_guru_df = commit_guru_df.drop(labels = ['parent_hashes','author_name','author_name',
                                                   'author_email','fileschanged','author_date',
                                                   'author_date_unix_timestamp', 'commit_message',
                                                  'classification', 'fix', 'contains_bug','fixes',],axis=1)

#     understand_df = understand_df.drop_duplicates(cols[4:len(cols)])
    df = understand_df.merge(commit_guru_df,on='commit_hash')
    cols = df.columns.tolist()
    cols = cols[1:] + [cols[0]]
    df = df[cols]
    for item in ['Kind','commit_hash']:
        if item in cols:
            df = df.drop(labels = [item],axis=1)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def apply_smote(df):
    cols = df.columns
    smt = SMOTE.smote(df)
    df = smt.run()
    df.columns = cols
    return df

def apply_cfs(df):
        y = df.Bugs.values
        X = df.drop(labels = ['Bugs'],axis = 1)
        X = X.values
        selected_cols = CFS.cfs(X,y)
        cols = df.columns[[selected_cols]].tolist()
        cols.append('Bugs')
        return df[cols],cols

def get_spearmanr(df):
    df = df.drop('Bugs',axis = 1)
    file_corr = []
    for name in df.Name.unique():
        sub_df = df[df['Name'] == name]
        if sub_df.shape[0] < 2:
            continue
        sub_df = sub_df.drop('Name',axis = 1)
        rho, pval = stats.spearmanr(sub_df.values,sub_df.values,axis = 1)
        file_corr.append(np.median(rho[:,:int(len(rho)/2)]))
    return file_corr

# Getting List of projects

In [13]:
proj_df = pd.read_csv('projects.csv')
projects = proj_df.repo_name.tolist()

# Run the experiment

In [None]:
project_corr = {}
for project in projects:
    try:
        if project == '.DS_Store':
            continue
#         if project != 'Pokemap':
#             continue
        print("+++++++++++++++++   "  + project + "  +++++++++++++++++")
        df = load_data(project)
        corr = get_spearmanr(df)
        project_corr[project] = corr
        print(np.median(corr))
    except Exception as e:
        print(e)
        continue

+++++++++++++++++   org.alloytools.alloy  +++++++++++++++++
0.9955580965627484
+++++++++++++++++   qpython  +++++++++++++++++
0.9911648810899507
+++++++++++++++++   friendlychat-android  +++++++++++++++++
0.9766941737696968
+++++++++++++++++   paho.mqtt.android  +++++++++++++++++
0.9023924001702086
+++++++++++++++++   paho.mqtt.java  +++++++++++++++++
0.8466614843548863
+++++++++++++++++   android-docs-samples  +++++++++++++++++
0.9935521231951169
+++++++++++++++++   Toasty  +++++++++++++++++
0.9546751561956854
+++++++++++++++++   android-mvvm-architecture  +++++++++++++++++
0.9859389324740899
+++++++++++++++++   recipes-rss  +++++++++++++++++
0.9246761697769359
+++++++++++++++++   HoloGraphLibrary  +++++++++++++++++
nan
+++++++++++++++++   HorizontalWheelView  +++++++++++++++++
0.9783142731584092
+++++++++++++++++   OpenMemories-Tweak  +++++++++++++++++
0.9784581289284191
+++++++++++++++++   HoldingButton  +++++++++++++++++
0.9984040162761393
+++++++++++++++++   ExpandableLayout  ++++

0.9639147397379615
+++++++++++++++++   chips-input-layout  +++++++++++++++++
0.954001215504768
+++++++++++++++++   elasticsearch-analysis-mmseg  +++++++++++++++++
0.9571527919084715
+++++++++++++++++   XUpdate  +++++++++++++++++
0.9355424535503449
+++++++++++++++++   jenkins-hipchat-plugin  +++++++++++++++++
0.9184235760246265
+++++++++++++++++   spring-cloud-kubernetes  +++++++++++++++++
0.9554972636640695
+++++++++++++++++   cordova-imagePicker  +++++++++++++++++
0.8921268236179061
+++++++++++++++++   Android-SlideExpandableListView  +++++++++++++++++
0.9033707318032367
+++++++++++++++++   smarthome  +++++++++++++++++
0.9139098995968351
+++++++++++++++++   richeditor-android  +++++++++++++++++
0.8919273875372774
+++++++++++++++++   CtCI-6th-Edition  +++++++++++++++++
0.8433316851123235
+++++++++++++++++   spring-mvc-showcase  +++++++++++++++++
0.9602645323504402
+++++++++++++++++   RMSwitch  +++++++++++++++++
0.9630350586101841
+++++++++++++++++   easy-adapter  +++++++++++++++++
0.88

0.9081500843309156
+++++++++++++++++   gs-uploading-files  +++++++++++++++++
0.93129764436074
+++++++++++++++++   android-flowlayout  +++++++++++++++++
0.9551806218719094
+++++++++++++++++   dbeaver  +++++++++++++++++
0.8955960761764424
+++++++++++++++++   android-mvp-architecture  +++++++++++++++++
0.9398794474537855
+++++++++++++++++   otter  +++++++++++++++++
0.8994925250955181
+++++++++++++++++   AlphabetIndex-Fast-Scroll-RecyclerView  +++++++++++++++++
0.8887962954111012
+++++++++++++++++   CalendarFX  +++++++++++++++++
0.9247377378908221
+++++++++++++++++   knife  +++++++++++++++++
0.9576644191618195
+++++++++++++++++   StepView  +++++++++++++++++
0.9598499765776842
+++++++++++++++++   Space-Navigation-View  +++++++++++++++++
0.9441966486478068
+++++++++++++++++   mqtt-client  +++++++++++++++++
0.908739710447864
+++++++++++++++++   emoji-java  +++++++++++++++++
0.904033888183748
+++++++++++++++++   reflections  +++++++++++++++++
0.8982110160754657
+++++++++++++++++   databus  +++

0.8833093610961252
+++++++++++++++++   gpmall  +++++++++++++++++
0.9252855251130184
+++++++++++++++++   openhab1-addons  +++++++++++++++++
0.8911704441796868
+++++++++++++++++   re2j  +++++++++++++++++
0.8666324469081994
+++++++++++++++++   mortar  +++++++++++++++++
0.9387399176225943
+++++++++++++++++   vertx-sql-client  +++++++++++++++++
0.9674795576214584
+++++++++++++++++   groupie  +++++++++++++++++
0.8892245595252295
+++++++++++++++++   UrlImageViewHelper  +++++++++++++++++
0.9092561193614803
+++++++++++++++++   jgit-cookbook  +++++++++++++++++
0.8688326033827147
+++++++++++++++++   ice  +++++++++++++++++
0.8792312464253333
+++++++++++++++++   dashboard-demo  +++++++++++++++++
0.9403822422708107
+++++++++++++++++   qmq  +++++++++++++++++
0.8956955581503949
+++++++++++++++++   Paper  +++++++++++++++++
0.9374005227919646
+++++++++++++++++   OpenNoteScanner  +++++++++++++++++
0.9382456206780827
+++++++++++++++++   plantuml-server  +++++++++++++++++
0.9376837745439588
+++++++++++++++

In [92]:
project_corr

{'Pokemap': 1.0}

In [87]:
df = df.drop('Bugs',axis = 1)
for name in df.Name.unique():
    sub_df = df[df['Name'] == name]
    sub_df = sub_df.drop('Name',axis = 1)
    rho, pval = stats.spearmanr(sub_df.values,sub_df.values,axis = 1)
    print(np.median(rho))

1.0
1.0
0.982977093922288
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
1.0
1.0
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
1.0
0.9999999999999999
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
1.0
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999999
1.0
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
1.0
1.0
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999999999999
0.9999999

In [51]:
sub_df.values.shape

(7, 45)

In [43]:
x2n=np.random.randn(100,60)
y2n=np.random.randn(100,60)

In [82]:
rho, pval = stats.spearmanr(sub_df.values,sub_df.values,axis = 1)

In [109]:
rho[:,:int(len(rho)/2)]

array([[1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.]])