# Resample to the desired sample sizes

In [3]:
# import importlib
# importlib.reload(config)
# import utils.config as config

n_iter = 6 # 0, 1, 2

## Exclude notebooks that do not fit into any ML pipeline stages, e.g., tutoral notebooks

In [4]:
import pandas as pd
import utils.config as config
import numpy as np

# what have been labeled
df_mlerr_labels = pd.read_excel(config.path_default.joinpath('tmp/cluster_sampled_labeled_{}.xlsx'.format(n_iter)),
                                sheet_name = "Del-All",
                                keep_default_na=False)

df_mlerr_labels_g_sum = df_mlerr_labels[df_mlerr_labels.nb_source==config.NB_SOURCE["github"]]
df_mlerr_labels_k_sum = df_mlerr_labels[df_mlerr_labels.nb_source==config.NB_SOURCE["kaggle"]]

# before filtering
print(df_mlerr_labels_g_sum.eid.nunique())
print(df_mlerr_labels_k_sum.eid.nunique())

exclude_other = ['should exclude', "intentional"] #[should exclude']
exclude_ml_pipeline_sum = config.label_ML_pipeline["no ML pipeline"]

exclude_g_filenames = df_mlerr_labels_g_sum[(df_mlerr_labels_g_sum.label_ML_pipeline.isin(exclude_ml_pipeline_sum))|(df_mlerr_labels_g_sum.other.isin(exclude_other))].fname
exclude_k_filenames = df_mlerr_labels_k_sum[(df_mlerr_labels_k_sum.label_ML_pipeline.isin(exclude_ml_pipeline_sum))|(df_mlerr_labels_k_sum.other.isin(exclude_other))].fname

df_mlerr_labels_g_sum = df_mlerr_labels_g_sum[(~df_mlerr_labels_g_sum.label_ML_pipeline.isin(exclude_ml_pipeline_sum))&(~df_mlerr_labels_g_sum.other.isin(exclude_other))]
df_mlerr_labels_k_sum = df_mlerr_labels_k_sum[(~df_mlerr_labels_k_sum.label_ML_pipeline.isin(exclude_ml_pipeline_sum))&(~df_mlerr_labels_k_sum.other.isin(exclude_other))]

# after filtering
print(df_mlerr_labels_g_sum.eid.nunique())
print(df_mlerr_labels_k_sum.eid.nunique())

390
356
390
356


  warn(msg)


In [5]:

# until the 6th iteration, the resampling ends
print(df_mlerr_labels_g_sum.fname.nunique())
print(df_mlerr_labels_k_sum.fname.nunique())


388
336


In [3]:
my_file = open(config.path_default.joinpath('tmp/exclude_g_filenames.txt'), "r") 
exclude_g_filenames_exist = my_file.read() 
exclude_g_filenames_exist = exclude_g_filenames_exist.split("\n") 
my_file.close() 

my_file = open(config.path_default.joinpath('tmp/exclude_k_filenames.txt'), "r") 
exclude_k_filenames_exist = my_file.read() 
exclude_k_filenames_exist = exclude_k_filenames_exist.split("\n") 
my_file.close() 

for i in exclude_g_filenames:
    exclude_g_filenames_exist.append(i)
for i in exclude_k_filenames:
    exclude_k_filenames_exist.append(i)
    
exclude_g_filenames_exist = set(exclude_g_filenames_exist)
exclude_k_filenames_exist = set(exclude_k_filenames_exist)

print(len(exclude_g_filenames_exist))
print(len(exclude_k_filenames_exist))

with open(config.path_default.joinpath('tmp/exclude_g_filenames.txt'), 'w') as f:
    for line in exclude_g_filenames_exist:
        f.write(f"{line}\n")
with open(config.path_default.joinpath('tmp/exclude_k_filenames.txt'), 'w') as f:
    for line in exclude_k_filenames_exist:
        f.write(f"{line}\n")

179
44


In [4]:
print("need resample {} more for GitHub, and {} more for Kaggle.".format(len(exclude_g_filenames), len(exclude_k_filenames)))

need resample 1 more for GitHub, and 0 more for Kaggle.


## Get the overall population from GitHub and Kaggle

We sample for GitHub and Kaggle separately

In [5]:
df_err_grouped_k = pd.read_excel(config.path_default.joinpath('Clustering/clusters_Kaggle.xlsx'))
df_err_grouped_g = pd.read_excel(config.path_default.joinpath('Clustering/clusters_GitHub.xlsx'))

In [6]:
print((df_err_grouped_g.eid.nunique()))
print((df_err_grouped_g.fname.nunique()))

print((df_err_grouped_k.eid.nunique()))
print((df_err_grouped_k.fname.nunique()))

88667
61342
3875
2689


## Get "proportional sampling to cluster size" config

390 GH, 356 Kaggle

In [7]:
selected_clusters_g = pd.read_excel(config.path_default.joinpath('tmp/df_err_processed_pregroup_cluster_size_samples_g.xlsx'))
selected_clusters_k = pd.read_excel(config.path_default.joinpath('tmp/df_err_processed_pregroup_cluster_size_samples_k.xlsx'))

In [8]:
print(sum(selected_clusters_g.sample_size))
print(sum(selected_clusters_k.sample_size))

390
356


## Resample

In [17]:
sample_g_new = None
sample_k_new = None

for _, row in selected_clusters_g.iterrows():
    sc_id = row.cluster_id
    df_sample_size_g = selected_clusters_g[selected_clusters_g.cluster_id==sc_id]
    sample_size_g = df_sample_size_g.sample_size.iloc[0] if len(df_sample_size_g) > 0 else 0
    
    # github
    df_already_sampled_g = df_mlerr_labels_g_sum.loc[(df_mlerr_labels_g_sum.cluster_id == sc_id)]
    if len(df_already_sampled_g) < sample_size_g:
        # sample more, not repeat
        tmp_g = df_err_grouped_g[(df_err_grouped_g.cluster_id == sc_id)]
        tmp_g = tmp_g[~tmp_g["eid"].isin(df_already_sampled_g.eid.tolist())]
        tmp_g = tmp_g[~tmp_g["fname"].isin(exclude_g_filenames_exist)]
        sample_g = tmp_g.sample(n=(sample_size_g-len(df_already_sampled_g)), random_state=30)
    else:
        sample_g = None
    
    if sample_g is not None:
        sample_g_new = pd.concat([sample_g_new, sample_g], ignore_index=True)
    
for _, row in selected_clusters_k.iterrows():
    sc_id = row.cluster_id
    df_sample_size_k = selected_clusters_k[selected_clusters_k.cluster_id==sc_id]
    sample_size_k = df_sample_size_k.sample_size.iloc[0] if len(df_sample_size_k) > 0 else 0
    
    # kaggle
    df_already_sampled_k = df_mlerr_labels_k_sum.loc[(df_mlerr_labels_k_sum.cluster_id == sc_id)]
    if len(df_already_sampled_k) < sample_size_k:
        # sample more, not repeat
        tmp_k = df_err_grouped_k[(df_err_grouped_k.cluster_id == sc_id)]
        tmp_k = tmp_k[~tmp_k["eid"].isin(df_already_sampled_k.eid.tolist())]
        tmp_k = tmp_k[~tmp_k["fname"].isin(exclude_k_filenames_exist)]
        sample_k = tmp_k.sample(n=sample_size_k-len(df_already_sampled_k), random_state=30)
    else:
        sample_k = None
    
    if sample_k is not None:
        sample_k_new = pd.concat([sample_k_new, sample_k], ignore_index=True)
        
sample_new = pd.concat([sample_g_new, sample_k_new], ignore_index=True)

In [18]:
sample_all = pd.concat([df_mlerr_labels_g_sum, df_mlerr_labels_k_sum, sample_new], ignore_index=True)

In [19]:
assert(sample_all.cluster_id.nunique()==len(set(selected_clusters_g.cluster_id.values).union(set(selected_clusters_k.cluster_id.values))))
print(sample_all[sample_all.nb_source==config.NB_SOURCE["github"]].eid.nunique(), sum(selected_clusters_g.sample_size))
print(sample_all[sample_all.nb_source==config.NB_SOURCE["kaggle"]].eid.nunique(), sum(selected_clusters_k.sample_size))
# assert(sample_all.eid.nunique()==(sum(selected_clusters_g.sample_size)+sum(selected_clusters_k.sample_size)))

390 390
356 356


In [20]:
sample_all = sample_all.drop(columns=sample_all.columns[~sample_all.columns.isin(df_mlerr_labels_k_sum.columns)])

In [21]:
# sort based on cluster id for better manual labeling
sample_all["size_counts"] = sample_all["cluster_id"].map(sample_all["cluster_id"].value_counts())
assert(len(sample_all[sample_all["size_counts"]==1].cluster_id)==sample_all[sample_all["size_counts"]==1].cluster_id.nunique())
sample_all.sort_values(by=['size_counts',"cluster_id"], ascending=False, inplace=True) # by=['size_counts',"cluster_id"]

In [22]:
sample_all.columns

Index(['fname', 'eid', 'ename', 'evalue', 'traceback', 'ename_mapped',
       'nb_source', 'evalue_processed', 'pregroup_cluster', 'cluster_id',
       'label_ML_pipeline', 'label_if_ML_bug', 'label_refined_exp_type',
       'label_if_runinfo_help', 'label_if_code_error_align',
       'label_if_error_chain', 'label_root_cause', 'Comment', 'other',
       'Labeler', 'Reviewer', 'Review_res', 'Review_note',
       'Resolution comment', 'size_counts'],
      dtype='object')

In [23]:
# sample_new.to_excel(config.path_default.joinpath("tmp/resampled_new.xlsx"), index=False, engine='xlsxwriter')
sample_new.to_csv(config.path_default.joinpath("tmp/resampled_new_{}.csv".format(n_iter+1)), index=False) 

In [24]:
import pandas as pd

with pd.ExcelWriter(config.path_default.joinpath("tmp/cluster_resampled_{}.xlsx".format(n_iter+1))) as writer:
    sample_all = sample_all.drop(['size_counts'], axis=1)
    
    sample_all.to_excel(writer, sheet_name="Default", index=False)
    sample_all.to_excel(writer, sheet_name="Del-All", index=False)