In [14]:
from pathlib import Path
import logging
import numpy as np

log = logging.getLogger(__name__)

def file_len(filename: Path) -> int:
    try:
        with filename.open() as f:
            i = 0
            for i, _ in enumerate(f):
                pass
    except FileNotFoundError:
        log.exception()
    finally:
        return i

def read_as_str_list(filename: Path) -> list[str]:
    try:
        with filename.open('r') as f:
            lines = f.readlines()
    except FileNotFoundError:
        log.exception()
    finally:
        return lines

def write_lines(filename: Path, lines: list[str]) -> None:
    try:
        with filename.open('w') as f:
            f.writelines(lines)
    except FileNotFoundError:
        log.exception()


        
project_name = "SpringFramework"
data_folder = Path("./exp_data/origin/")
data_file = f"data--{project_name}.txt"
full_data_path = data_folder / data_file

label_file = f"label--{project_name}.txt"
full_label_path = data_folder / label_file

data_len = file_len(full_data_path)
label_len = file_len(full_label_path)


assert data_len == label_len



In [15]:
ke_data_folder = Path("./exp_data_ke/origin/")
full_ke_data_path = ke_data_folder / data_file
full_ke_label_path = ke_data_folder / label_file

projects_file = "projects"
full_projects_path = data_folder / projects_file

# TODO get indices from projects file 
# DO NOT have to do anything with data-- and label-- files. These are automatically generated by some methods like Pattern
# using the comments, labels and projects file
# use projects file indices for specific project name to get the right labels and comments
# then create new files for projects, labels and comments for only the specific project.
# TODO make sure to sample with root thing or whatever it is called so will sample same indices in labels as in data!



project_names = np.asarray(read_as_str_list(full_projects_path))
# project_names = np.array([x.replace("\n", "") for x in read_as_str_list(full_projects_path)])

# Don't mind the len being only 1 as the len 1 is because the 1 item is a array of idx
project_idx = np.where(project_names == project_name + "\n")
len(project_names)

118316

In [16]:
labels_file = "labels"
full_labels_path = data_folder / labels_file


labels = np.asarray(read_as_str_list(full_labels_path))

comments_file = "comments"
full_comments_path = data_folder / comments_file

comments = np.asarray(read_as_str_list(full_comments_path))
assert len(comments) == len(project_names)


In [17]:
assert len(project_names) == len(comments)
assert len(labels) == len(comments)

In [18]:
filt_labels = labels[project_idx]
filt_comments = comments[project_idx]
filt_project_names = project_names[project_idx]

full_ke_projects_path = ke_data_folder / projects_file
full_ke_comments_path = ke_data_folder / comments_file
full_ke_labels_path = ke_data_folder / labels_file

write_lines(full_ke_projects_path, filt_project_names)
write_lines(full_ke_comments_path, filt_comments)
write_lines(full_ke_labels_path, filt_labels)

In [19]:
assert file_len(full_ke_comments_path) == file_len(full_ke_labels_path)
assert file_len(ke_data_folder / "label--SpringFramework.txt") == file_len(full_ke_labels_path)
file_len(full_ke_comments_path)



7711

In [49]:
import random
import pandas as pd

# sample size for 90% CI
# 10% error margin
# https://www.qualtrics.com/blog/calculating-sample-size/
sample_size = 68
# simple random sampling without replacement
# for reproducability
random.seed(11)
simple_rand_sample_idx = np.asarray(random.sample(range(len(filt_labels)), sample_size))

df = pd.DataFrame(dict(
        LABELS=filt_labels,
        COMMENTS=filt_comments,
        PROJECTS=filt_project_names,
        IDX=range(len(filt_labels))
    ))

# stratified sample
sample_total_frac = sample_size / len(filt_labels) 
low_sample_df = pd.DataFrame(dict(
    LABELS=filt_labels[simple_rand_sample_idx],
    COMMENTS=filt_comments[simple_rand_sample_idx],
    PROJECTS=filt_project_names[simple_rand_sample_idx],
    IDX=simple_rand_sample_idx
))


# sample size for 90% CI
# 10% error margin
# https://www.qualtrics.com/blog/calculating-sample-size/
sample_size = 1831
# stratified sample
sample_total_frac = sample_size / len(filt_labels) 
better_margin_sample_df = df.groupby('LABELS', group_keys=False).apply(lambda x: x.sample(frac=sample_total_frac, random_state=20))

low_sample_df.groupby('LABELS').count()

better_margin_sample_df.groupby('LABELS').count()

# now need to write the labels, comments and projects columns to files again in a seperate exp_data folder with its own result folder
# for simple random sample still only have the idx so still have to use this idx to filter a sample from the filt_ arrays
# then we can get results by just running the MAT on each of these exp_data folders



Unnamed: 0_level_0,COMMENTS,PROJECTS,IDX
LABELS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SATD\n,23,23,23
WITHOUT_CLASSIFICATION\n,1808,1808,1808


In [37]:
# Small sample data writing


small_simple_random_ke_data_folder = Path("./exp_data_ke_simple_small/origin/")
full_ke_small_simple_comments_path = small_simple_random_ke_data_folder / comments_file
full_ke_small_simple_projects_path = small_simple_random_ke_data_folder / projects_file
full_ke_small_simple_labels_path = small_simple_random_ke_data_folder / labels_file


write_lines(full_ke_small_simple_projects_path, low_sample_df['PROJECTS'])
write_lines(full_ke_small_simple_comments_path, low_sample_df['COMMENTS'])
write_lines(full_ke_small_simple_labels_path, low_sample_df['LABELS'])

labels_dash_dash_sample = np.asarray(read_as_str_list(full_label_path))[low_sample_df['IDX']]
full_ke_small_simple_labels_dash_dash_path = small_simple_random_ke_data_folder / label_file
write_lines(full_ke_small_simple_labels_dash_dash_path, labels_dash_dash_sample)

full_ke_small_simple_data_dash_dash_path = small_simple_random_ke_data_folder / data_file
write_lines(full_ke_small_simple_data_dash_dash_path, low_sample_df['COMMENTS'])



# larger stratified sample data writing
larger_strat_random_ke_data_folder = Path("./exp_data_ke_strat_large/origin/")
full_ke_larger_strat_comments_path = larger_strat_random_ke_data_folder / comments_file
full_ke_larger_strat_projects_path = larger_strat_random_ke_data_folder / projects_file
full_ke_larger_strat_labels_path = larger_strat_random_ke_data_folder / labels_file


write_lines(full_ke_larger_strat_projects_path, better_margin_sample_df['PROJECTS'])
write_lines(full_ke_larger_strat_comments_path, better_margin_sample_df['COMMENTS'])
write_lines(full_ke_larger_strat_labels_path, better_margin_sample_df['LABELS'])

labels_dash_dash_larger_sample = np.asarray(read_as_str_list(full_label_path))[better_margin_sample_df['IDX']]
full_ke_larger_strat_labels_dash_dash_path = larger_strat_random_ke_data_folder / label_file
write_lines(full_ke_larger_strat_labels_dash_dash_path, labels_dash_dash_larger_sample)

full_ke_larger_strat_data_dash_dash_path = larger_strat_random_ke_data_folder / data_file
write_lines(full_ke_larger_strat_data_dash_dash_path, better_margin_sample_df['COMMENTS'])

labels_dash_dash_larger_sample

array(['positive\n', 'positive\n', 'positive\n', ..., 'negative\n',
       'negative\n', 'negative\n'], dtype='<U9')

In [45]:
# simple accuracy
(0+65)/(0+1+1+65)

# accuracy if everything is just always predicted as non-satd
(0+67)/(0+67+0+1)


0.9852941176470589

In [50]:
23/1808

0.012721238938053098