In [1]:
import pymongo
import random
import pandas as pd
import configparser
from pathlib import Path

# check average length of content
# check min and max lengths of content 

config = configparser.RawConfigParser()
config.read(Path("../application.properties"))
user = config.get('DatabaseSection', 'mongodb.user')
password = config.get('DatabaseSection', 'mongodb.password')
hostname = config.get('DatabaseSection', 'mongodb.hostname')
port = config.get('DatabaseSection', 'mongodb.port')
options = config.get('DatabaseSection', 'mongodb.options')
comments_db = config.get('DatabaseSection', 'mongodb.database.comments')

conn_str = f"mongodb://{user}:{password}@{hostname}:{port}/{options}"

client = pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=5000)

db = client[comments_db]


dedupl_non_filt_comments = list(db.deduplicated_nonfiltered_comments.find())





In [2]:
from math import ceil


def reprod_shuffle_on_groups(df: pd.DataFrame, sample_frac: float) -> pd.DataFrame:

   random.seed(1092022)
   random_reprod_seeds = random.sample(range(1,10000), len(df['PROJECT'].unique()))

   df_groups = [df[df['PROJECT']==project].sample(frac=1, random_state=random_reprod_seeds[i]) for i, project in enumerate(df['PROJECT'].unique())]
   return pd.concat(df_groups)

def stratify_prop_to_project_name(df: pd.DataFrame, sample_frac) -> pd.DataFrame:
   random_seed_strat = 28082022
   df['PROJECT'].unique()[0]

   return df.sample(max(1, int(sample_frac*len(df))), random_state=random_seed_strat)



# statistical sample size not needed as we do not use Cohens kappa on a sample of labeled data for agreement.
# check agreement with Cohens kappa and agreement level according to Fleiss (see duplicate SATD paper)
# Do need to get a statistical significant proportion of sample 

# C.I. = 5% and confidence level 95% then sample size 384..to get a statistically sign. sample for proportion p of SATD/non-SATD
# C.I. = 7% and confidence level 95% then sample size 196..to get a statistically sign. sample for proportion p of SATD/non-SATD
# C.I. = 8% and confidence level 95% then sample size 150..to get a statistically sign. sample for proportion p of SATD/non-SATD 
# C.I. = 10% and confidence level 95% then sample size 96..to get a statistically sign. sample for proportion p of SATD/non-SATD 
# start with 96 and see how it goes?
# can also compare p with proportions found in Guo et al and Maldonado datasets
stat_sign_sample_size = 98

# seed for reproducability. Can increase sample size to append extra comments to sample
random.seed(28082022)
# stat_sign_sample = random.sample(dedupl_non_filt_comments, stat_sign_sample_size)
contents = [comment_dict['_id']['content'] for comment_dict in dedupl_non_filt_comments]
project_names = [comment_dict['_id']['project_name'] for comment_dict in dedupl_non_filt_comments]
counts = [comment_dict['count'] for comment_dict in dedupl_non_filt_comments]

df = pd.DataFrame(dict(
   CONTENT=contents,
   PROJECT=project_names,
   DUPL_COUNT=counts 
))

random_seed_shuffle = 1092022
sample_frac = stat_sign_sample_size / len(df)
# shuffled_df = reprod_shuffle_on_groups(df)
shuffled_df = df.sample(frac=1, random_state=random_seed_shuffle)


stratified_df = shuffled_df.groupby('PROJECT', group_keys=False).apply(lambda x: stratify_prop_to_project_name(x, sample_frac))

original_project_count = len(shuffled_df['PROJECT'].unique())
print(f'amount of project included in original dedupl comments df: {original_project_count}')
strat_project_count = len(stratified_df['PROJECT'].unique())
print(f'amount of project included in stratified sample: {strat_project_count}')

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(stratified_df)

amount of project included in original dedupl comments df: 96
amount of project included in stratified sample: 96


Unnamed: 0,CONTENT,PROJECT,DUPL_COUNT
17092,"* If the area of interest covers the world, we...",activemq,3
28837,Remove wrapper from fragment,airavata,8
62683,* Test descriptions before CRS creation.\n ...,archiva,2
87500,Startpoint should delete after checkpoint commit,avro,4
111491,"strip start and end """,bigtop,5
128226,"* At this point, 'i' is at the beginning of th...",calcite,1
142317,Test: TestModule2_RenderURLCanBeClicked,cayenne,1
177345,Typo in GCOM-C version 1.00.,commons-bcel,3
180319,Check if the attributes are correct,commons-beanutils,1
182209,create cgroup container for JDs,commons-codec,1


In [None]:

project_dedupl_com_cnt_df = shuffled_df.groupby('PROJECT').sum()
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(project_dedupl_com_cnt_df)


In [65]:

sample_excel_path = Path(f'./data/sample_comments_size_{stat_sign_sample_size}.xlsx')
stratified_df.to_excel(sample_excel_path, index=False)



In [23]:
from cmath import nan
from typing import Any, Union
import krippendorff

def label_converter(label: str) -> int:
    if label == 'satd':
        return 1
    elif label == 'non-satd':
        return 0
    else:
        return nan

satd_labeled_sample_path = Path('./data/SATD data quality check.xlsx')
labeled_sample_df = pd.read_excel(satd_labeled_sample_path, sheet_name='Comments Labeled',
    usecols='D:F',
    na_values="...",
    converters={0: label_converter, 1: label_converter, 2: label_converter},
    nrows=len(stratified_df))

reliability_data = [labeled_sample_df[column].tolist() for column in labeled_sample_df]

# gives back nan if whole row of one labeler is nan
krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal')


0.12344113295286419

In [21]:
labeled_sample_df


Unnamed: 0,labeler 1 (Nathan),labeler 2 (Alexander),labeler 3 (Twan)
0,,non-satd,non-satd
1,,non-satd,non-satd
2,,satd,non-satd
3,,satd,non-satd
4,,non-satd,non-satd
...,...,...,...
91,,satd,satd
92,,non-satd,non-satd
93,,satd,non-satd
94,,non-satd,non-satd
