# This notebook presents the data processing and survival analysis used to investigate the impact of multi-language design smells on software fault-proneness

In [42]:
import os
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
import datetime
import seaborn as sb
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

In [2]:
 # The parent of the current working directory is used as the root from which
 # the data and figure paths in this notebook are built (see data_path_root).
 path_root = os.getcwd()
 data_path_root = Path(path_root).parent

In [3]:
def load_csv(path):
    """Read the CSV at ``path`` with pandas' default options and return the DataFrame."""
    return pd.read_csv(path)

In [4]:
# Names of the systems under study; they are used to build the per-system CSV
# file names and as keys of the snapshot tables below.
sys_list = ['conscrypt', 'frostwire','javacpp','jna','OpenDDS','pljava','realm-java','rocksdb'] 

# Per-system commit hash and date of snapshot 0.
# NOTE(review): presumably the first analysed snapshot of each system — confirm.
snapshot_0 ={'conscrypt':{'commit':'10da3cb', 'date': '2008-10-21'},
             'frostwire':{'commit':'d922745', 'date': '2015-11-03'},
             'javacpp': {'commit':'c3248e6', 'date': '2012-04-08'},
             'jna': {'commit':'9813273', 'date': '1998-10-01'},
             'OpenDDS': {'commit':'3b2e748', 'date': '2005-06-10'},
             'pljava': {'commit':'92cd3f7', 'date': '2000-01-29'},
             'realm-java': {'commit':'b03c621', 'date': '2012-04-20'},
             'rocksdb': {'commit':'54f1fd7', 'date': '2011-03-02'}
            }

# Per-system commit hash and timestamp of the last snapshot. date_diff_in_hours
# uses the 'date' entry as the end date for rows without inducing dates.
# NOTE(review): the javacpp commit hash is an empty string — confirm whether it is missing.
snapshot_last ={'conscrypt':{'commit':'b1220d7', 'date': '2020-01-27 00:00:01'},
             'frostwire':{'commit':'16aca17', 'date': '2019-11-20 00:00:01'},
             'javacpp': {'commit':'', 'date': '2019-06-12 00:00:01'},
             'jna': {'commit':'424fc00', 'date': '2020-02-10 00:00:01'},
             'OpenDDS': {'commit':'e3a2193', 'date': '2020-02-04 00:00:01'},
             'pljava': {'commit':'485cb54', 'date': '2020-01-11 00:00:01'},
             'realm-java': {'commit':'b96e28a', 'date': '2019-11-01 00:00:01'},
             'rocksdb': {'commit':'debc4ef', 'date': '2020-02-10 00:00:01'}
            }

In [5]:
# Quick check: print the snapshot_last date stored for one system.
sys = 'conscrypt'
print(snapshot_last[sys]['date'])

2020-01-27 00:00:01


In [6]:
def reformat_renamed_n_removed_date(rdates):
    """Trim every entry of a '/'-separated date string to its first 19 characters.

    Empty entries (for example those produced by a leading or trailing '/')
    are dropped; the trimmed entries are joined back together with '/'.
    An input without any non-empty entry yields the empty string.
    """
    trimmed = [entry[:19] for entry in rdates.split('/') if entry]
    return '/'.join(trimmed)

def reformat_inducing_n_fixing_date(ind_dates):
    """Normalise a '/'-separated string of timestamps.

    For every non-empty entry, each 'T' is replaced by a space and the result is
    cut to its first 19 characters; the entries are then re-joined with '/'.
    An input without any non-empty entry yields the empty string.
    """
    normalised = []
    for stamp in ind_dates.split('/'):
        if stamp:
            normalised.append(stamp.replace('T', ' ')[:19])
    return '/'.join(normalised)

In [7]:
def date_diff_in_hours(row, snapshots=None):
    """Return the whole hours between a row's creation date and its end date.

    The end date is the last entry of the '/'-separated ``row.InducingDates``.
    When the row has no inducing date (NaN, or a string with no non-empty
    entry) the 'date' of the row's system in ``snapshots`` is used instead.

    Parameters
    ----------
    row : object with ``CreatedAt``, ``System`` and ``InducingDates`` attributes
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.
        Dates are strings whose first 19 characters are '%Y-%m-%d %H:%M:%S'.
    snapshots : dict, optional
        Mapping ``system -> {'date': ...}``. Defaults to the notebook-level
        ``snapshot_last`` table, which is only looked up when it is needed.

    Returns
    -------
    int
        Hours from creation to the end date, rounded down. A negative value is
        also reported on stdout.

    Raises
    ------
    ValueError
        If a date does not match the expected format.
    KeyError
        If the fallback is needed and the system is missing from ``snapshots``.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    system = row.System

    raw_inducing = str(row.InducingDates)
    # Empty entries (e.g. a trailing '/') are ignored; previously an empty last
    # entry left diff_hour unassigned and raised UnboundLocalError.
    inducing_dates = [] if raw_inducing == 'nan' else [d for d in raw_inducing.split('/') if d]

    if inducing_dates:
        # The last listed date is used.
        # NOTE(review): this assumes the list is ordered so that the last entry
        # is the earliest bug — confirm against the data export.
        end_text = inducing_dates[-1]
    else:
        if snapshots is None:
            snapshots = snapshot_last
        end_text = snapshots[system]['date']

    created_at = datetime.datetime.strptime(row.CreatedAt[:19], fmt)
    end_at = datetime.datetime.strptime(end_text[:19], fmt)

    # timedelta keeps 0 <= seconds < 86400 and lets days go negative, so this is
    # the floor of the difference in hours (same value the string parsing gave).
    delta = end_at - created_at
    diff_hour = delta.days * 24 + delta.seconds // 3600

    if diff_hour < 0:
        print('System: {} Hour: {} Creation: {} inducing:{}'.format(row.System, diff_hour, created_at, end_at))
    return diff_hour

In [15]:
# The 15 smell columns of the per-system CSVs; also used to derive the
# 'Smelly_<name>' flag columns.
smell_list = ['ExcessiveInterlangCommunication', 'Toomuchclustring',
       'ToomuchScattering', 'UnusedMethodDeclaration',
       'UnusedMethodImplementation', 'UnusedParameter',
       'AssumingSafeReturnValue', 'ExcessiveObjects', 'NotHandlingExceptions',
       'NotCachingObjects', 'NotSecuringLibraries', 'HardCodingLibraries',
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse']

# All smell columns plus the event flag, the overall smell flag, the duration
# and the two non-smell covariates (LOC, prev_fixing).
selected_columns = ['ExcessiveInterlangCommunication', 'Toomuchclustring',
       'ToomuchScattering', 'UnusedMethodDeclaration',
       'UnusedMethodImplementation', 'UnusedParameter',
       'AssumingSafeReturnValue', 'ExcessiveObjects', 'NotHandlingExceptions',
       'NotCachingObjects', 'NotSecuringLibraries', 'HardCodingLibraries',
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse', 
       'inducingflag','Smelly','SurvivalTime','LOC','prev_fixing']

# Same as selected_columns without the five smells listed in the comment at
# the end of this cell.
selected_columns1 = ['ExcessiveInterlangCommunication', 'Toomuchclustring',
       'UnusedMethodDeclaration',
       'UnusedParameter',
       'AssumingSafeReturnValue', 'NotHandlingExceptions',
       'NotSecuringLibraries', 
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse', 
       'inducingflag','Smelly','SurvivalTime','LOC','prev_fixing']


# Eight smell columns plus LOC and prev_fixing; no event flag, smell flag or
# duration column.
selected_columns2 = ['ExcessiveInterlangCommunication', 
       'UnusedParameter',
       'AssumingSafeReturnValue', 'NotHandlingExceptions',
       'NotSecuringLibraries', 
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse', 
       'LOC','prev_fixing']

# Only the two non-smell covariates.
selected_columns3 = ['LOC','prev_fixing']
# Smells left out of selected_columns1:
# ['ToomuchScattering', 'UnusedMethodImplementation', 'ExcessiveObjects', 'NotCachingObjects', 'HardCodingLibraries']

In [9]:
# Test 
# print(reformat_renamed_n_removed_date('/2010-05-03 12:57:15-07:00'))

In [10]:
# print(reformat_inducing_n_fixing_date('2009-04-03T06:50:03Z/2009-03-04T03:28:47Z/'))

In [None]:
# Clean the raw merged CSV of one system: normalise the date columns, patch a
# few release dates, compute SurvivalTime and write the cleaned CSV.
# The system is chosen by editing sys_name before running the cell.
sys_name = 'OpenDDS'
path_root = os.getcwd()
data_path = os.path.join(data_path_root, 'data', 'survival', sys_name + '_merged2.csv')
cleaned_data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned.csv')
data_df = load_csv(data_path)

# Tag every row with its system; date_diff_in_hours reads row.System.
data_df['System'] = sys_name

# Release date: replace 'T' with a space and keep the first 19 characters
data_df['Release'] = data_df['Release'].apply(lambda x: str(x).replace('T',' ')[:19])

# Creation date: keep the first 19 characters
data_df['CreatedAt'] = data_df['CreatedAt'].apply(lambda x: str(x)[:19])

# Reformat renaming and removal date lists ('/'-separated)
data_df['RenamedAt'] = data_df['RenamedAt'].apply(lambda x: reformat_renamed_n_removed_date(str(x)))
data_df['RemovedDate'] = data_df['RemovedDate'].apply(lambda x: reformat_renamed_n_removed_date(str(x)))

# Reformat inducing and fixing date lists ('/'-separated).
# str(x) turns a missing value into the string 'nan', which date_diff_in_hours checks for.
data_df['InducingDates'] = data_df['InducingDates'].apply(lambda x: reformat_inducing_n_fixing_date(str(x)))
data_df['FixingDates'] = data_df['FixingDates'].apply(lambda x: reformat_inducing_n_fixing_date(str(x)))

# Hard-coded release dates for specific versions of conscrypt and jna.
if sys_name == 'conscrypt':
    data_df.loc[data_df['Version']=='conscrypt_0','Release']= '2008-10-21 00:00:00'

if sys_name == 'jna':
    data_df.loc[data_df['Version']=='jna_6','Release']= '2002-12-06 01:28:03'
    data_df.loc[data_df['Version']=='jna_7','Release']= '2003-11-04 06:09:08'
    data_df.loc[data_df['Version']=='jna_8','Release']= '2004-05-30 01:51:57'
    data_df.loc[data_df['Version']=='jna_9','Release']= '2006-06-04 23:22:24'

# Hours from creation to the end date; rows with a negative value are dropped
# before the cleaned CSV is written.
data_df['SurvivalTime'] = data_df.apply(date_diff_in_hours, axis=1)
data_df = data_df[data_df.SurvivalTime >= 0]    
data_df.to_csv(cleaned_data_path, index = False) 


In [None]:
data_df['SurvivalTime'].min()

In [None]:
# Add flags for smells: for every system, read '<system>_merged2_cleaned.csv',
# add one 'Smelly_<smell>' column per entry of smell_list (1 when the smell
# value is greater than 0, else 0) and write '<system>_merged2_cleaned_time.csv'.
# NOTE(review): int(x) raises on a missing value — assumes the smell columns have none.
path_root = os.getcwd()
for sys in sys_list:   
    data_path = os.path.join(data_path_root, 'data', 'survival', 'cleaned', sys + '_merged2_cleaned.csv')
    data_path1 = os.path.join(data_path_root, 'data', 'survival', 'cleaned', sys + '_merged2_cleaned_time.csv')
    data_df = load_csv(data_path)
    for sml in smell_list:
        col_name = 'Smelly_'+sml
        data_df[col_name]= data_df[sml].apply(lambda x: 1 if int(x)>0 else 0)
    data_df.to_csv(data_path1, index = False)

In [11]:
# Load the '_merged2_cleaned_time.csv' file of one system and list its columns.
sys_name = 'OpenDDS'
cleaned_data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time.csv')
data_cl = load_csv(cleaned_data_path)
data_cl.columns

  if (await self.run_code(code, result,  async_=asy)):


Index(['Id_db', 'File', 'System', 'Version', 'Package', 'Release', 'Class',
       'ExcessiveInterlangCommunication', 'Toomuchclustring',
       'ToomuchScattering', 'UnusedMethodDeclaration',
       'UnusedMethodImplementation', 'UnusedParameter',
       'AssumingSafeReturnValue', 'ExcessiveObjects', 'NotHandlingExceptions',
       'NotCachingObjects', 'NotSecuringLibraries', 'HardCodingLibraries',
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse', 'FilePath', 'inducing', 'fixing',
       'inducingflag', 'fixingFlag', 'Smelly', 'LOC', 'Time', 'CLOC',
       'LOC_inducing', 'prev_inducing', 'prev_fixing', 'cum_inducing',
       'cum_fixing', 'Message_induce', 'Message_fix', 'dev-inducing',
       'dev-fixing', 'CreatedAt', 'RenamedFrom', 'RenamedTo', 'RenamedAt',
       'RemovedDate', 'InducingDates', 'FixingDates', 'status', 'SurvivalTime',
       'Smelly_ExcessiveInterlangCommunication', 'Smelly_Toomuchclustring',
       'Smelly_ToomuchScatterin

In [None]:
# Reload the cleaned CSV of the system currently held in sys_name (set in an
# earlier cell) and recompute SurvivalTime for every row.
cleaned_data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned.csv')
data_cl = load_csv(cleaned_data_path)
data_cl['SurvivalTime'] = data_cl.apply(date_diff_in_hours, axis=1)

In [None]:
# Merge all smell data: recompute SurvivalTime for every system, drop rows with
# a negative survival time, save the per-system result and concatenate all
# systems into one CSV.
# NOTE(review): this writes '<system>_merged2_cleaned_time.csv', the same file the
# "Add flags for smells" cell writes, so whichever cell runs last decides whether
# the 'Smelly_<smell>' columns are present — confirm the intended order.
cleaned_data_path_merged = os.path.join(data_path_root, 'data', 'survival','cleaned', 'merged_cleaned_smell_data_time.csv')
per_system_frames = []
for sys_name in sys_list:
    print('Merging smell data from:{}'.format(sys_name))
    cleaned_data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned.csv')
    # Keep the DataFrame itself: wrapping it in list() yields only the column
    # names and makes the column accesses below fail.
    data_cl = load_csv(cleaned_data_path)
    data_cl['SurvivalTime'] = data_cl.apply(date_diff_in_hours, axis=1)
    data_cl = data_cl[data_cl.SurvivalTime >= 0]
    cleaned_data_path_time = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time.csv')
    data_cl.to_csv(cleaned_data_path_time, index = False)
    per_system_frames.append(data_cl)
# Concatenate once instead of growing merged_df inside the loop.
merged_df = pd.concat(per_system_frames)
merged_df.to_csv(cleaned_data_path_merged, index=False)
print('Merged data saved to: {}'.format(cleaned_data_path_merged))

In [None]:
merged_df.head(5)

In [None]:
merged_df.tail(5)

In [None]:
merged_df.columns

In [None]:
# Recompute SurvivalTime on analysis_df and save it.
# NOTE(review): analysis_df is first assigned in a later cell (the merged_df.query
# cell), so on a fresh top-to-bottom run this cell raises NameError; it also
# repeats the recompute-and-save cell further down.
# NOTE(review): cleand_file is not read anywhere in the visible notebook.
cleand_file = os.path.join(data_path_root, 'data', 'survival','cleaned', 'merged_cleaned_smell_data_s_time2.csv')
analysis_df['SurvivalTime'] = analysis_df.apply(date_diff_in_hours, axis=1)
cleaned_data_path_merged_time = os.path.join(data_path_root, 'data', 'survival','cleaned', 'merged_cleaned_smell_data_s_time2.csv')
analysis_df.to_csv(cleaned_data_path_merged_time,index =False)
analysis_df.head(5)

In [None]:
# Restrict the merged data to a single system. .copy() makes analysis_df an
# independent frame, so assigning the SurvivalTime column to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
analysis_df = merged_df.query ('System==\"conscrypt\"').copy()
# analysis_df = analysis_df[analysis_df['Release']=='']
# analysis_df = analysis_df[analysis_df['Release'].isna()]
analysis_df.shape

In [None]:
# Recompute SurvivalTime on analysis_df, write the result to
# 'merged_cleaned_smell_data_s_time2.csv' and preview the first rows.
# analysis_df = merged_df
analysis_df['SurvivalTime'] = analysis_df.apply(date_diff_in_hours, axis=1)
cleaned_data_path_merged_time = os.path.join(data_path_root, 'data', 'survival','cleaned', 'merged_cleaned_smell_data_s_time2.csv')
analysis_df.to_csv(cleaned_data_path_merged_time,index =False)
analysis_df.head(5)

In [None]:
analysis_df['SurvivalTime'].max()

In [None]:
analysis_df['SurvivalTime'].min()

In [None]:
# Select the rows of one system from analysis_df and show the resulting shape.
sys_name = 'conscrypt'
analysis_df_sys = analysis_df.query('System==\"'+ sys_name+'\"')
analysis_df_sys.shape

In [None]:
analysis_df_sys.columns

In [None]:
# Keep only the selected_columns3 columns of the per-system frame.
# NOTE(review): this overwrites analysis_df_sys, so a cell that later selects other
# columns from it needs the frame to be re-created first.
analysis_df_sys= analysis_df_sys[selected_columns3]
# analysis_df_sys= analysis_df_sys.query('inducingflag==1')
print(analysis_df_sys.head(5))

In [None]:
# Spearman rank-correlation heatmap of the selected_columns2 columns.
# NOTE(review): when the preceding cell has already reduced analysis_df_sys to
# selected_columns3, this column selection raises KeyError — confirm the
# intended execution order.
corr_data = analysis_df_sys[selected_columns2]
plt.figure(figsize=(16, 16))
sb.heatmap(corr_data.corr(method='spearman'))

In [None]:
# Columns for the survival analysis: the base covariates plus the event flag,
# the smell flag and the duration. A new list is built on purpose: list.extend
# returns None (leaving sa_columns empty of meaning) and would append the same
# three names to selected_columns3 again on every re-run of this cell.
sa_columns = selected_columns3 + ['inducingflag','Smelly','SurvivalTime']
print(sa_columns)

In [None]:
# analysis_df_sys = analysis_df_sys[sa_columns] 
analysis_df_sys.head(5)

In [None]:
# sys_list = ['conscrypt', 'frostwire','javacpp','jna','OpenDDS','pljava','realm-java','rocksdb'] 
# Load the '_merged2_cleaned_time.csv' file of one system into analysis_df_sys
# and list its columns.
sys = 'rocksdb'
data_path1 = os.path.join(data_path_root, 'data', 'survival', 'cleaned', sys + '_merged2_cleaned_time.csv')
analysis_df_sys= load_csv(data_path1)
# analysis_df_sys= analysis_df_sys.query('inducingflag==1')
analysis_df_sys.columns

In [None]:
analysis_df_sys['SurvivalTime'].min()

In [None]:
# Kaplan-Meier curves of analysis_df_sys, one for rows with Smelly == 1 and one
# for rows with Smelly == 0, drawn on the same axes.
kmf = KaplanMeierFitter() ## instantiate the class to create an object

T = analysis_df_sys['SurvivalTime']     ## time to event
E = analysis_df_sys['inducingflag']     ## passed to fit() as the second argument (event indicator)

groups = analysis_df_sys['Smelly']   
i1 = (groups == 1)      ## group i1 , smelly
i2 = (groups == 0)     ## group i2 , non-smelly


## fit the model for smelly group
kmf.fit(T[i1], E[i1], label='Smelly')
a1 = kmf.plot()

## fit the model for non-smelly group (the same fitter object is re-fitted)
kmf.fit(T[i2], E[i2], label='Non-smelly')
kmf.plot(ax=a1)

In [None]:
# Kaplan-Meier curves restricted to rows with inducingflag == 1, again split
# into smelly and non-smelly rows.
# The source frame is analysis_df_sys; the name previously used on this line
# (analysis_df_) is not defined anywhere in the notebook and raised NameError.
analysis_df_sys= analysis_df_sys[selected_columns1]
# After this filter every remaining row has inducingflag == 1.
analysis_df_sys= analysis_df_sys.query('inducingflag==1')

kmf = KaplanMeierFitter() ## instantiate the class to create an object

T = analysis_df_sys['SurvivalTime']     ## time to event
E = analysis_df_sys['inducingflag']     ## event indicator passed to fit()

groups = analysis_df_sys['Smelly']   
i1 = (groups == 1)      ## group i1 , smelly
i2 = (groups == 0)     ## group i2 , non-smelly


## fit the model for smelly group
kmf.fit(T[i1], E[i1], label='Smelly')
a1 = kmf.plot()

## fit the model for non-smelly group
kmf.fit(T[i2], E[i2], label='Non-smelly')
kmf.plot(ax=a1)


In [None]:
# Keep only the selected_columns1 columns as input for the model fitted below.
analysis_df_sys= analysis_df_sys[selected_columns1]
# analysis_df_sys= analysis_df_sys.query('inducingflag==1')

In [None]:
analysis_df_sys.head() ## have a look at the data

In [None]:
# Fit a Cox proportional-hazards model on analysis_df_sys with SurvivalTime as
# the duration column and inducingflag as the event column, then print the
# fitted model's summary.
# NOTE(review): this import sits in the middle of the notebook rather than in the
# import cell at the top.
from lifelines import CoxPHFitter
# Using Cox Proportional Hazards model
cph = CoxPHFitter()   ## Instantiate the class to create a cph object
cph.fit(analysis_df_sys, 'SurvivalTime', event_col='inducingflag',show_progress=True)   ## Fit the data to train the model
cph.print_summary()    ## Have a look at the significance of the features

In [None]:
import decimal

# Dedicated decimal context, so the precision chosen here stays local to this helper
ctx = decimal.Context()

# Conversions through ctx keep 10 significant digits
ctx.prec = 10

def float_to_str(f):
    """
    Render the float ``f`` as a string in fixed-point notation (no exponent),
    rounded to the precision configured on ``ctx``.
    """
    return format(ctx.create_decimal(repr(f)), 'f')

In [None]:
# Kaplan-Meier curves (smelly vs non-smelly) for one system, saved as a PDF.
sys_name = sys_list[7]
data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time.csv')
fig_path = os.path.join(data_path_root, 'results', 'figures', sys_name + '_survival_allsmell.pdf')
data_df = load_csv(data_path)

analysis_df = data_df[selected_columns]

kmf1 = KaplanMeierFitter() ## instantiate the class to create an object

T = analysis_df['SurvivalTime']     ## time to event
E = analysis_df['inducingflag']     ## event indicator passed to fit()

groups = analysis_df['Smelly']   
i1 = (groups == 1)      ## group i1 , smelly
i2 = (groups == 0)     ## group i2 , non-smelly

## fit the model for smelly group
kmf1.fit(T[i1], E[i1], label='Smelly')
a1 = kmf1.plot()

## fit the model for non-smelly group and draw it on the same axes
kmf1.fit(T[i2], E[i2], label='Non-smelly')
# The returned Axes is bound to `ax`, not `plt`: reusing `plt` would replace the
# matplotlib.pyplot module alias for every cell that runs afterwards.
ax = kmf1.plot(ax=a1)

ax.set_xlabel("Time (in Hours)")
ax.set_ylabel("Survival Probability")

fig = ax.get_figure()

fig.savefig(fig_path)

In [None]:
# Kaplan-Meier curves of the files affected by the first two smells of
# smell_list, drawn on shared axes for one system.
# The former `for sml in smell_list` wrapper never used its loop variable for
# the analysis, so it reloaded the CSV and redrew the same two curves once per
# smell; the comparison is now done once.
sys_name = sys_list[0]
data_path = os.path.join(data_path_root, 'data', 'survival','cleaned',  sys_name + '_merged2_cleaned_time.csv')
data_df = load_csv(data_path)

sml1 = smell_list[0]
sml2 = smell_list[1]
smell_type_flag1 = 'Smelly_'+ sml1
smell_type_flag2 = 'Smelly_'+ sml2

analysis_df = data_df
print(analysis_df.shape)

kmf1 = KaplanMeierFitter() ## instantiate the class to create an object

T = analysis_df['SurvivalTime']     ## time to event
E = analysis_df['inducingflag']     ## event indicator passed to fit()

## rows flagged with the first smell
i1 = (analysis_df[smell_type_flag1] == 1)
kmf1.fit(T[i1], E[i1], label=sml1)
a1 = kmf1.plot()

## rows flagged with the second smell, drawn on the same axes
i1 = (analysis_df[smell_type_flag2] == 1)
kmf1.fit(T[i1], E[i1], label = sml2)
# Bind the Axes to `ax` so the matplotlib.pyplot alias `plt` is left intact.
ax = kmf1.plot(ax= a1)

ax.set_xlabel("Time (in Hours)")
ax.set_ylabel("Survival Probability")

fig = ax.get_figure()


In [40]:
# Count, for one system, how many rows carry each 'Smelly_<smell>' flag and
# save the counts as a one-row CSV with one column per flag.
sys_name = sys_list[2]
data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time.csv')
data_path_flag = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_flag_count.csv')
data_df = load_csv(data_path)
sm_lst =[]
data_count =[]
for sm in smell_list:
    col = 'Smelly_' + str(sm)
    flagged_rows = data_df[col].sum()
    sm_lst.append(col)
    data_count.append(flagged_rows)
    print('{} : {}'.format(col, flagged_rows))

# A single-row frame built straight from the list follows the length of
# smell_list, so no hard-coded (1, 15) reshape is needed.
count_df = pd.DataFrame([data_count], columns = sm_lst)
count_df.to_csv(data_path_flag, index = False)

Smelly_ExcessiveInterlangCommunication : 254
Smelly_Toomuchclustring : 67
Smelly_ToomuchScattering : 342
Smelly_UnusedMethodDeclaration : 123
Smelly_UnusedMethodImplementation : 0
Smelly_UnusedParameter : 9
Smelly_AssumingSafeReturnValue : 0
Smelly_ExcessiveObjects : 0
Smelly_NotHandlingExceptions : 0
Smelly_NotCachingObjects : 0
Smelly_NotSecuringLibraries : 30
Smelly_HardCodingLibraries : 7
Smelly_NotUsingRelativePath : 0
Smelly_MemoryManagementMismatch : 0
Smelly_LocalReferencesAbuse : 0


In [52]:
# For one system, save one Kaplan-Meier figure per smell type comparing rows
# with and without that smell. Smells whose stored flag count is 0 are skipped.
sys_name = sys_list[7]
data_path = os.path.join(data_path_root, 'data', 'survival','cleaned',  sys_name + '_merged2_cleaned_time.csv')
data_df = load_csv(data_path)
# The per-flag counts are read from the '_merged2_cleaned_flag_count.csv' file of the same system.
data_count_path = os.path.join(data_path_root, 'data', 'survival','cleaned',  sys_name + '_merged2_cleaned_flag_count.csv')
count_df = load_csv(data_count_path)

for sml in smell_list:
    fig_path = os.path.join(data_path_root, 'results', 'figures', sys_name, sys_name + '_survival_'+ sml +'.pdf')

    smell_type_flag = 'Smelly_'+ sml

    smelly_count = count_df[smell_type_flag].max()
    print('{} : {}'.format(smell_type_flag,smelly_count))

    if smelly_count > 0 :
        analysis_df = data_df

        kmf1 = KaplanMeierFitter() ## instantiate the class to create an object

        T = analysis_df['SurvivalTime']     ## time to event
        E = analysis_df['inducingflag']     ## event indicator passed to fit()

        groups = analysis_df[smell_type_flag]   
        i1 = (groups == 1)      ## group i1 , smelly
        i2 = (groups == 0)     ## group i2 , non-smelly

        ## fit the model for the smelly group
        kmf1.fit(T[i1], E[i1], label= 'Smelly')
        a1 = kmf1.plot()

        ## fit the model for the non-smelly group on the same axes
        kmf1.fit(T[i2], E[i2], label = 'Non-smelly')
        # Bind the Axes to `ax`; assigning it to `plt` would replace the
        # matplotlib.pyplot module alias for the rest of the session.
        ax = kmf1.plot(ax= a1)

        ax.set_xlabel("Time (in Hours)")
        ax.set_ylabel("Survival Probability")

        fig = ax.get_figure()

        fig.savefig(fig_path)
        # Empty the figure so the next smell starts from a blank canvas.
        fig.clear()


Smelly_ExcessiveInterlangCommunication : 287
Smelly_Toomuchclustring : 480
Smelly_ToomuchScattering : 798
Smelly_UnusedMethodDeclaration : 138
Smelly_UnusedMethodImplementation : 0
Smelly_UnusedParameter : 1485
Smelly_AssumingSafeReturnValue : 63
Smelly_ExcessiveObjects : 0
Smelly_NotHandlingExceptions : 63
Smelly_NotCachingObjects : 0
Smelly_NotSecuringLibraries : 68
Smelly_HardCodingLibraries : 21
Smelly_NotUsingRelativePath : 27
Smelly_MemoryManagementMismatch : 28
Smelly_LocalReferencesAbuse : 19


<Figure size 432x288 with 0 Axes>

In [None]:
# Columns exported to the '_filtered.csv' file: the 15 smell columns plus the
# event flag, the overall smell flag, LOC, prev_fixing and SurvivalTime.
data_cols = ['ExcessiveInterlangCommunication', 'Toomuchclustring',
       'ToomuchScattering', 'UnusedMethodDeclaration',
       'UnusedMethodImplementation', 'UnusedParameter',
       'AssumingSafeReturnValue', 'ExcessiveObjects', 'NotHandlingExceptions',
       'NotCachingObjects', 'NotSecuringLibraries', 'HardCodingLibraries',
       'NotUsingRelativePath', 'MemoryManagementMismatch',
       'LocalReferencesAbuse', 
       'inducingflag', 'Smelly', 'LOC', 
       'prev_fixing', 'SurvivalTime'
       ]

# Names of the per-smell 0/1 flag columns ('Smelly_' + smell name), spelled out
# in the same order as smell_list.
smell_flags = ['Smelly_ExcessiveInterlangCommunication', 'Smelly_Toomuchclustring',
       'Smelly_ToomuchScattering', 'Smelly_UnusedMethodDeclaration',
       'Smelly_UnusedMethodImplementation', 'Smelly_UnusedParameter',
       'Smelly_AssumingSafeReturnValue', 'Smelly_ExcessiveObjects',
       'Smelly_NotHandlingExceptions', 'Smelly_NotCachingObjects',
       'Smelly_NotSecuringLibraries', 'Smelly_HardCodingLibraries',
       'Smelly_NotUsingRelativePath', 'Smelly_MemoryManagementMismatch',
       'Smelly_LocalReferencesAbuse']

In [None]:
# Export the data_cols columns of one system to a '_filtered.csv' file and save
# the Kaplan-Meier curves (smelly vs non-smelly) as a PDF.
sys_name = sys_list[0]
data_path = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time.csv')
data_path_exp = os.path.join(data_path_root, 'data', 'survival','cleaned', sys_name + '_merged2_cleaned_time_filtered.csv')
fig_path = os.path.join(data_path_root, 'results', 'figures', sys_name + '_survival_allsmell_new.pdf')
data_df = load_csv(data_path)

analysis_df = data_df[data_cols]
analysis_df.to_csv(data_path_exp, index = False)

kmf1 = KaplanMeierFitter() ## instantiate the class to create an object

T = analysis_df['SurvivalTime']     ## time to event
E = analysis_df['inducingflag']     ## event indicator passed to fit()

groups = analysis_df['Smelly']   
i1 = (groups == 1)      ## group i1 , smelly
i2 = (groups == 0)     ## group i2 , non-smelly

## fit the model for smelly group
kmf1.fit(T[i1], E[i1], label='Smelly')
a1 = kmf1.plot()

## fit the model for non-smelly group and draw it on the same axes
kmf1.fit(T[i2], E[i2], label='Non-smelly')
# The returned Axes is bound to `ax`, not `plt`, so the matplotlib.pyplot module
# alias stays usable in the cells that follow.
ax = kmf1.plot(ax=a1)

ax.set_xlabel("Time (in Hours)")
ax.set_ylabel("Survival Probability")

fig = ax.get_figure()

fig.savefig(fig_path)

In [None]:
# Spearman rank-correlation heatmap of the selected_columns columns of data_df.
# Both modules are imported again here; for plt this restores the pyplot module
# in case an earlier cell bound the name `plt` to something else.
import seaborn as sb
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 16))
sb.heatmap(data_df[selected_columns].corr(method='spearman'))

In [None]:
# NOTE(review): this rebinds sys_list to a different set of systems in a different
# order, so sys_list[i] lookups and loops over sys_list in cells run after this
# one resolve to other systems than with the list defined near the top — confirm intended.
sys_list = ['rocksdb', 'pljava', 'realm-java', 'jpype', 'javacpp', 'zstd-jni', 'java-smt','vlc-android', 'conscrypt'] #'vlc-android', , 'conscrypt'

In [None]:
# Load the raw (uncleaned) '_merged2.csv' file of one system into `data`.
sys_name = 'conscrypt'
path_root = os.getcwd()
data_path = os.path.join(data_path_root, 'data', 'survival', sys_name + '_merged2.csv')
data = load_csv(data_path)

In [None]:
def reformat_inducing_commits(rcommit):
    """Shorten every entry of a '/'-separated commit string to 7 characters.

    Empty entries are dropped and the shortened entries are re-joined with '/'.
    An input without any non-empty entry yields the empty string.
    """
    short_hashes = [commit[:7] for commit in rcommit.split('/') if commit]
    return '/'.join(short_hashes)

def reformat_inducing_n_fixing_date(ind_dates):
    """Normalise a '/'-separated string of timestamps.

    For every non-empty entry, each 'T' is replaced by a space and the result is
    cut to its first 19 characters; the entries are then re-joined with '/'.
    An input without any non-empty entry yields the empty string.

    This repeats the definition of the same name given earlier in the notebook.
    """
    normalised = []
    for stamp in ind_dates.split('/'):
        if stamp:
            normalised.append(stamp.replace('T', ' ')[:19])
    return '/'.join(normalised)

In [None]:
# Bug data cleaning: load the bug CSV of one system, tag it with the project
# name, drop the row labelled 0 and shorten/normalise the commit and date columns.
sys_name = 'conscrypt'
data_path = os.path.join(data_path_root, 'data', 'survival','bug-data', sys_name + '_bug.csv')
data_df = load_csv(data_path)
data_df['Project'] = sys_name
data_df=data_df.drop(0)
# NOTE(review): reindex() without arguments leaves the index labels as they are
# (they still start after the dropped label 0); if renumbering from 0 was
# intended, reset_index(drop=True) would be needed — confirm, since later
# label-based lookups depend on it.
data_df=data_df.reindex()


# Shorten the fixing commit hash to 7 characters
data_df['BugFixing'] = data_df['BugFixing'].apply(lambda x: str(x)[:7])


# Keep the first 19 characters of the fixing date
data_df['FixingDates'] = data_df['FixingDates'].apply(lambda x: str(x)[:19])


# Shorten every inducing commit hash in the '/'-separated list to 7 characters
data_df['BugInducing'] = data_df['BugInducing'].apply(lambda x: reformat_inducing_commits(str(x)))

# Reformat the '/'-separated inducing dates
data_df['InducingDate'] = data_df['InducingDate'].apply(lambda x: reformat_inducing_n_fixing_date(str(x)))

data_df.head(50)


# #Reformat Renaming and Removed Date(s)
# data_df['RenamedAt'] = data_df['RenamedAt'].apply(lambda x: reformat_renamed_n_removed_date(str(x)))
# data_df['RemovedDate'] = data_df['RemovedDate'].apply(lambda x: reformat_renamed_n_removed_date(str(x)))

# #Reformat Inducing and Fixing Date(s)
# data_df['InducingDates'] = data_df['InducingDates'].apply(lambda x: reformat_inducing_n_fixing_date(str(x)))
# data_df['FixingDates'] = data_df['FixingDates'].apply(lambda x: reformat_inducing_n_fixing_date(str(x)))

# # condi = data_df.query('Version == \'conscrypt_0\' & Release == \'nan\'')
# # print(condi.shape)

# # Updating snapshot date for snapshot_0
# data_df.loc[data_df['Version']=='conscrypt_0','Release']= '2008-10-21 00:00:00'


In [None]:
data_df['InducingDate'][24]