In [27]:
import pandas as pd
import numpy as np
from sklearn.externals.joblib import Parallel, delayed
from tqdm import tqdm

from utils import aux_functions

testcases = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/TC/testcases.csv')
print('Test Cases Shape: {}'.format(testcases.shape))

bugreports_p1 = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/BR/all_bugs_p1.csv', sep="|")
bugreports_p2 = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/BR/all_bugs_p2.csv', sep='|')
bugreports_p3 = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/BR/all_bugs_p3.csv', sep='|')

bugreports = pd.concat([bugreports_p1, bugreports_p2, bugreports_p3])
print('Bug Reports shape: {}'.format(bugreports.shape))

Test Cases Shape: (207, 8)
Bug Reports shape: (35955, 10)


#### Removing Duplicate Bugs

In [28]:
print('BR previous shape: {}'.format(bugreports.shape))
bugreports.drop_duplicates('Bug_Number', keep=False, inplace=True)
print('BR shape: {}'.format(bugreports.shape))

BR previous shape: (35955, 10)
BR shape: (35314, 10)


#### Bug Reports Names and Descriptions

In [29]:
bugreports['br_name'] = bugreports.apply(lambda row : 'BR_' + str(row['Bug_Number']) + '_SRC', axis=1)
bugreports['br_desc'] = bugreports.apply(lambda row : ' '.join([str(el) for el in row]), axis=1) 
bugreports.head()

Unnamed: 0,Bug_Number,Summary,Platform,Component,Version,Creation_Time,Whiteboard,QA_Whiteboard,First_Comment_Text,First_Comment_Creation_Time,br_name,br_desc
0,506297,Livemarks with null site/feed uris cause sync ...,All,Sync,unspecified,2009-07-24T17:08:43Z,,,2009-07-24 09:54:28 FaultTolerance D...,2009-07-24T17:08:43Z,BR_506297_SRC,506297 Livemarks with null site/feed uris caus...
1,506338,Enhance Crash Recovery to better help the user,All,Session Restore,Trunk,2009-07-24T19:17:21Z,[crashkill][crashkill-metrics],,When our users crash they are pretty much in t...,2009-07-24T19:17:21Z,BR_506338_SRC,506338 Enhance Crash Recovery to better help t...
2,506507,Dragging multiple bookmarks in the bookmarks s...,x86,Bookmarks & History,Trunk,2009-07-26T06:16:02Z,,,User-Agent: Mozilla/5.0 (Windows; U; Win...,2009-07-26T06:16:02Z,BR_506507_SRC,506507 Dragging multiple bookmarks in the book...
3,506550,Unreliable Back Button navigating nytimes.com,x86,Extension Compatibility,3.5 Branch,2009-07-26T16:12:10Z,[caused by adblock plus][platform-rel-NYTimes],,User-Agent: Mozilla/5.0 (Windows; U; Win...,2009-07-26T16:12:10Z,BR_506550_SRC,506550 Unreliable Back Button navigating nytim...
4,506575,ALT + F4 when dropdown of autocomplete is open...,x86,Address Bar,3.5 Branch,2009-07-26T20:14:54Z,,,Pressing ALT + F4 when the autocomplete dropdo...,2009-07-26T20:14:54Z,BR_506575_SRC,506575 ALT + F4 when dropdown of autocomplete ...


#### Test Cases Names and Descriptions

In [30]:
testcases['tc_name'] = testcases.apply(lambda row : 'TC_' + str(row[0]) + '_TRG', axis=1)
testcases['tc_desc'] = testcases.apply(lambda row : ' '.join([str(el) for el in row]), axis=1)
testcases.head()

Unnamed: 0,TC_Number,TestDay,Gen_Title,Crt_Nr,Title,Preconditions,Steps,Expected_Result,tc_name,tc_desc
0,1,20181221,<notificationbox> \nand\n <notification>\n cha...,1,Notification - Popup Block,,1. Launch Firefox\n2. Navigate to http://www.p...,1. Firefox is successfully launched\n9. The al...,TC_1_TRG,1 20181221 <notificationbox> \nand\n <notifica...
1,2,20181221,<notificationbox> \nand\n <notification>\n cha...,2,Notification - Process Hang,,"1. Launch Firefox\n2. In the URL bar, navigate...",1. Firefox is successfully launched\n2. Firefo...,TC_2_TRG,2 20181221 <notificationbox> \nand\n <notifica...
2,3,20181221,<notificationbox> \nand\n <notification>\n cha...,3,Verify Notifications appear in RTL Mode,,"1. Launch Firefox\n2. In about:config, change ...",1. Firefox is successfully launched\n2.The for...,TC_3_TRG,3 20181221 <notificationbox> \nand\n <notifica...
3,4,20181221,<notificationbox> \nand\n <notification>\n cha...,4,Verify Notifications appear in High Contrast M...,,"1. While the browser is in High Contrast Mode,...",1. Firefox has been launched.\n2. Firefox begi...,TC_4_TRG,4 20181221 <notificationbox> \nand\n <notifica...
4,5,20181221,<notificationbox> \nand\n <notification>\n cha...,5,Verify notifications react to differing Zoom l...,,"1. While the browser is in High Contrast Mode,...",1. Firefox has been launched.\n2. Firefox begi...,TC_5_TRG,5 20181221 <notificationbox> \nand\n <notifica...


In [31]:
print('Expected instances amount: {}'.format(len(bugreports) * len(testcases)))

print('Num BRs 48 Branch: {}'.format(len(bugreports[bugreports.Version == '48 Branch'])))
print('Num BRs 49 Branch: {}'.format(len(bugreports[bugreports.Version == '49 Branch'])))
print('Num BRs 50 Branch: {}'.format(len(bugreports[bugreports.Version == '50 Branch'])))
print('Num BRs 51 Branch: {}'.format(len(bugreports[bugreports.Version == '51 Branch'])))

print('Num TCs: {}'.format(len(testcases)))

Expected instances amount: 7309998
Num BRs 48 Branch: 412
Num BRs 49 Branch: 353
Num BRs 50 Branch: 518
Num BRs 51 Branch: 461
Num TCs: 207


#### Estimating Oracle Memory Size

In [32]:
ex_df = pd.DataFrame(index=testcases.tc_name, columns=bugreports.br_name, data=0, dtype='int8')
print(ex_df.shape)
print(ex_df.info())
ex_df.head()

(207, 35314)
<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, TC_1_TRG to TC_208_TRG
Columns: 35314 entries, BR_506297_SRC to BR_1516895_SRC
dtypes: int8(35314)
memory usage: 7.0+ MB
None


br_name,BR_506297_SRC,BR_506338_SRC,BR_506507_SRC,BR_506550_SRC,BR_506575_SRC,BR_506729_SRC,BR_506768_SRC,BR_506795_SRC,BR_506820_SRC,BR_506831_SRC,...,BR_1516270_SRC,BR_1516329_SRC,BR_1516358_SRC,BR_1516416_SRC,BR_1516505_SRC,BR_1516547_SRC,BR_1516582_SRC,BR_1516749_SRC,BR_1516792_SRC,BR_1516895_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TC_1_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_2_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_3_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_4_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TC_5_TRG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Oracle Functions and Auxiliary Variables

In [34]:
list_fversion_to_testday = [('48 Branch','20160603'),('48 Branch','20160624'),('48 Branch','20160708'),
                            ('49 Branch','20160722'),('49 Branch','20160812'),('49 Branch','20160826'),
                            ('50 Branch','20160909'),('50 Branch','20160930'),('50 Branch','20161014'),
                            ('51 Branch','20161028'),('51 Branch','20161125'),('51 Branch','20170106')]

NUMBER_SUBSETS = 7

def check_link_condition(br, tc):
    for tup in [(br['Version'],tday) for tday in tc['TestDay'].split(' + ')]:
        if tup in list_fversion_to_testday:
            return True
    return False


def create_links(idx, tc_df, br_df):
    oracle_df = pd.DataFrame(columns=br_df.br_name, index=tc_df.tc_name, data=np.zeros(shape=(len(tc_df),len(br_df))), dtype='int8')
    for idx_1,br in tqdm(br_df.iterrows()):
        for idx_2,tc in tc_df.iterrows():
            if check_link_condition(br, tc):
                oracle_df.at[tc.tc_name, br.br_name] = 1
            else:
                oracle_df.at[tc.tc_name, br.br_name] = 0
    
    oracle_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/part/trace_matrix_{}.csv'.format(idx))

def create_br_dfs_list():
    list_br_dfs = []
    for i in range(0, 35315, 5045):   # 35315 / 5045 == NUMBER_SUBSETS
        list_br_dfs.append(bugreports.iloc[i:i+5045,:])
    return list_br_dfs

def create_tc_dfs_list():
    return [testcases.copy() for i in range(NUMBER_SUBSETS)]

#### Create Small Size Oracle

In [35]:
br_aux = bugreports[(bugreports.Version == '50 Branch') | (bugreports.Version == '60 Branch')].sample(15, random_state=42)
tc_aux = testcases[(testcases.TestDay.str.contains('20161014')) | (testcases.TestDay.str.contains('20161028'))].sample(10, random_state=1000)

br_aux[br_aux.Version == '50 Branch'].loc[:, ['Bug_Number','Version']].head(100)

Unnamed: 0,Bug_Number,Version
15072,1319983,50 Branch
14933,1318407,50 Branch
12654,1287109,50 Branch
15151,1320548,50 Branch
15537,1325288,50 Branch
12229,1280856,50 Branch


In [36]:
tc_aux[tc_aux.TestDay.str.contains('20161014')].loc[:,['TC_Number','TestDay']].head(100)

Unnamed: 0,TC_Number,TestDay
18,19,20160603 + 20160624 + 20161014
15,16,20160603 + 20160624 + 20161014
14,15,20160603 + 20160624 + 20161014


In [37]:
create_links('small', tc_aux, br_aux)

small_orc = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/part/trace_matrix_small.csv')
aux_functions.highlight_df(small_orc)

15it [00:00, 326.46it/s]


Unnamed: 0,tc_name,BR_1441532_SRC,BR_1319983_SRC,BR_1443343_SRC,BR_1464815_SRC,BR_1318407_SRC,BR_1468122_SRC,BR_1445895_SRC,BR_1459431_SRC,BR_1287109_SRC,BR_1320548_SRC,BR_1469153_SRC,BR_1325288_SRC,BR_1463768_SRC,BR_1469753_SRC,BR_1280856_SRC
0,TC_166_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,TC_19_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
2,TC_153_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,TC_16_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
4,TC_161_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,TC_170_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,TC_146_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,TC_15_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
8,TC_150_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,TC_168_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Create Entire Oracle

In [38]:
tasks = [(idx,tc_df, br_df) for idx,(tc_df,br_df) in enumerate(zip(create_tc_dfs_list(),create_br_dfs_list()))]
results = Parallel(n_jobs=7, verbose=3)(delayed(create_links)(idx,tc_df,br_df) for idx,tc_df,br_df in tasks)

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done   2 out of   7 | elapsed:  3.4min remaining:  8.5min
[Parallel(n_jobs=7)]: Done   7 out of   7 | elapsed:  3.5min finished


#### Analyze Oracle Parts Created

In [39]:
oo_df_2 = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/part/trace_matrix_2.csv')
oo_df_2.set_index('tc_name', inplace=True)

print(oo_df_2.loc['TC_15_TRG', 'BR_1319983_SRC'])
print(oo_df_2.loc['TC_16_TRG', 'BR_1319983_SRC'])
print(oo_df_2.loc['TC_19_TRG', 'BR_1319983_SRC'])

1
1
1


In [55]:
oo_dfs = []
for i in range(NUMBER_SUBSETS):
    df = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/part/trace_matrix_{}.csv'.format(i))
    oo_dfs.append(df)
    print(df.shape)

(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5046)
(207, 5045)


#### Join Oracle Parts

In [57]:
oo_df = pd.DataFrame(index=testcases.tc_name, dtype='int8')
for i in range(NUMBER_SUBSETS):
    aux_df = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/part/trace_matrix_{}.csv'.format(i))
    aux_df.set_index('tc_name', inplace=True)
    oo_df = oo_df.join(aux_df)

print(oo_df.shape)
print(oo_df.info())

(207, 35314)
<class 'pandas.core.frame.DataFrame'>
Index: 207 entries, TC_1_TRG to TC_208_TRG
Columns: 35314 entries, BR_506297_SRC to BR_1516895_SRC
dtypes: int64(35314)
memory usage: 55.8+ MB
None


#### Save DataFrames

In [58]:
oo_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/trace_matrix_final.csv')
bugreports.to_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/BR/bugreports_final.csv', index=False)
testcases.to_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/TC/testcases_final.csv', index=False)

#### Analyze Entire Oracle Created

In [60]:
oo_df_full = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/trace_matrix_final.csv')
oo_df_full.set_index('tc_name', inplace=True)

print(oo_df_full.loc['TC_15_TRG', 'BR_1319983_SRC'])
print(oo_df_full.loc['TC_16_TRG', 'BR_1319983_SRC'])
print(oo_df_full.loc['TC_19_TRG', 'BR_1319983_SRC'])

1
1
1


### -----

#### Checking Values - FVersion to TestDay

In [61]:
ck_df = pd.DataFrame(columns=['testday','f_version','features_released','testcases_list'])
ck_df.testday = ['20160603', '20160624', '20160708', 
                 '20160722', '20160812', '20160826', 
                 '20160909', '20160930', '20161014', 
                 '20161028', '20161125', '20170106']
ck_df.f_version = ['48 Branch', '48 Branch', '48 Branch', 
                  '49 Branch', '49 Branch', '49 Branch', 
                  '50 Branch', '50 Branch', '50 Branch', 
                  '51 Branch', '51 Branch', '51 Branch' ]
ck_df.features_released = [
    "Awesome Bar Search, Awesome Bar Icons - Left, Awesome Bar Icons - Right",
    "Awesome Bar Search, Awesome Bar Icons - Left, Awesome Bar Icons - Right",
    "apz, Scrolling using different devices (wired mouse, wireless mouse, trackpad/touchpad) - where available devices",
    'context menu - exploratory testing, context menu - full functional testing, pdf viewer, browser customization',
    'windows 10 compatibility, text to speech in reader mode, text to speech on desktop',
    'webgl compatibility, exploratory testing',
    '',
    'Pointer Lock API, WebM EME support for Widevine',
    'New Awesome Bar',
    'Zoom indicator, Downloads dropmaker',
    'WebGL2,  FLAC support,  Indicator for device permissions,  Zoom Indicator',
    'WebGL2, Zoom Indicator, Flash support']

ck_df.testcases_list = ""

for i,tc in testcases.iterrows():
    for j,row in ck_df.iterrows(): 
        if row['testday'] in tc['TestDay']:
            if ck_df.at[j,'testcases_list'] == "":
                ck_df.at[j,'testcases_list'] = str(tc.TC_Number)
            else:
                ck_df.at[j,'testcases_list'] = ck_df.at[j,'testcases_list'] + " " + str(tc.TC_Number)

ck_df.head(20)

Unnamed: 0,testday,f_version,features_released,testcases_list
0,20160603,48 Branch,"Awesome Bar Search, Awesome Bar Icons - Left, ...",13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 2...
1,20160624,48 Branch,"Awesome Bar Search, Awesome Bar Icons - Left, ...",13 14 15 16 17 18 19 20 21 22 23 24 25
2,20160708,48 Branch,"apz, Scrolling using different devices (wired ...",38 39 40 41 42 43 44 45 46 47 48
3,20160722,49 Branch,"context menu - exploratory testing, context me...",60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 7...
4,20160812,49 Branch,"windows 10 compatibility, text to speech in re...",105 106 107 108 109 110 111 112 113 114 115 11...
5,20160826,49 Branch,"webgl compatibility, exploratory testing",121 122 123 124 125
6,20160909,50 Branch,,
7,20160930,50 Branch,"Pointer Lock API, WebM EME support for Widevine",126 127 128 129 130 131 132 133 134 135 136 13...
8,20161014,50 Branch,New Awesome Bar,13 14 15 16 17 18 19 20 21
9,20161028,51 Branch,"Zoom indicator, Downloads dropmaker",143 144 145 146 147 148 149 150 151 152 153 15...


In [62]:
ck_df.to_csv('../data/mozilla_firefox_v2/firefoxDataset/docs_english/TD_2_FVersion/testday_to_fversion.csv')

#### Checking Values [2]

In [63]:
oracle = pd.read_csv('../data/mozilla_firefox_v2/firefoxDataset/oracle/output/trace_matrix_final.csv')
oracle.set_index('tc_name', inplace=True, drop=True)

print(oracle.shape)

(207, 35314)


In [64]:
bugreports_subset_df = bugreports[(bugreports.Version == '50 Branch') | (bugreports.Version == '60 Branch')].sample(15, random_state=42)
bugreports_subset_df[bugreports_subset_df.Version == '50 Branch'].loc[:, ['Bug_Number','Version']].head(100)

Unnamed: 0,Bug_Number,Version
15072,1319983,50 Branch
14933,1318407,50 Branch
12654,1287109,50 Branch
15151,1320548,50 Branch
15537,1325288,50 Branch
12229,1280856,50 Branch


In [65]:
testcases_subset_df = testcases[(testcases.TestDay.str.contains('20161014')) | (testcases.TestDay.str.contains('20161028'))].sample(10, random_state=1000)

selected_testcases = ['TC_{}_TRG'.format(tc_num) for tc_num in [13,14,15,16,17,18,19,20,21]]  # should link with 50 Branch
aux_tc = testcases[testcases.tc_name.isin(selected_testcases)]

tc_subset_df = testcases_subset_df.append(aux_tc)
tc_subset_df.drop_duplicates(inplace=True)

tc_subset_df[tc_subset_df.TestDay.str.contains('20161014')].loc[:,['TC_Number','TestDay']].head(100)

Unnamed: 0,TC_Number,TestDay
18,19,20160603 + 20160624 + 20161014
15,16,20160603 + 20160624 + 20161014
14,15,20160603 + 20160624 + 20161014
12,13,20160603 + 20160624 + 20161014
13,14,20160603 + 20160624 + 20161014
16,17,20160603 + 20160624 + 20161014
17,18,20160603 + 20160624 + 20161014
19,20,20160603 + 20160624 + 20161014
20,21,20160603 + 20160624 + 20161014


In [66]:
testcases_names_subset = tc_subset_df.tc_name
bug_reports_names_subset = bugreports_subset_df.br_name
orc_subset_df = oracle.loc[testcases_names_subset, bug_reports_names_subset]

print('TestCases Subset Shape: {}'.format(tc_subset_df.shape))
print('BugReports Subset Shape: {}'.format(bugreports_subset_df.shape))
print('Oracle Subset Shape: {}'.format(orc_subset_df.shape))

TestCases Subset Shape: (16, 10)
BugReports Subset Shape: (15, 12)
Oracle Subset Shape: (16, 15)


In [67]:
aux_functions.highlight_df(orc_subset_df)

Unnamed: 0_level_0,BR_1441532_SRC,BR_1319983_SRC,BR_1443343_SRC,BR_1464815_SRC,BR_1318407_SRC,BR_1468122_SRC,BR_1445895_SRC,BR_1459431_SRC,BR_1287109_SRC,BR_1320548_SRC,BR_1469153_SRC,BR_1325288_SRC,BR_1463768_SRC,BR_1469753_SRC,BR_1280856_SRC
tc_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
TC_166_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_19_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_153_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_16_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_161_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_170_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_146_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_15_TRG,0,1,0,0,1,0,0,0,1,1,0,1,0,0,1
TC_150_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
TC_168_TRG,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
