In [3]:
import pymongo
import random
import pandas as pd
import configparser
from pathlib import Path
from typing import Any

# check average length of content
# check min and max lengths of content 

config = configparser.RawConfigParser()
config.read(Path("../application.properties"))

def get_mongoclient(config: configparser.RawConfigParser) -> pymongo.MongoClient:

    user = config.get('DatabaseSection', 'mongodb.user')
    password = config.get('DatabaseSection', 'mongodb.password')
    hostname = config.get('DatabaseSection', 'mongodb.hostname')
    port = config.get('DatabaseSection', 'mongodb.port')
    options = config.get('DatabaseSection', 'mongodb.options')

    conn_str = f"mongodb://{user}:{password}@{hostname}:{port}/{options}"

    return pymongo.MongoClient(conn_str, serverSelectionTimeoutMS=5000, unicode_decode_error_handler='ignore')

comments_db = config.get('DatabaseSection', 'mongodb.database.comments')
db = get_mongoclient(config=config)[comments_db]
comments = list(db.comments.find({"filtered": False}))


In [2]:
from typing import Dict, List

def stratify_prop_to_project_name(df: pd.DataFrame, sample_frac: float) -> pd.DataFrame:
   random_seed_strat = 2802234
   df['PROJECT'].unique()[0]

#    return df.sample(max(1, int(sample_frac*len(df))), random_state=random_seed_strat)
   return df.sample(1, random_state=random_seed_strat)

def get_rand_item(comments: List[Dict]) -> Dict:
    random.seed(26225)
    return random.sample(comments, 1)

def get_comment_df(comments: Dict) -> pd.DataFrame:
    ids = [comment_dict['_id'] for comment_dict in comments]
    contents = [comment_dict['content'] for comment_dict in comments]
    project_names = [comment_dict['project_name'] for comment_dict in comments]
    comm_types = [comment_dict['type'] for comment_dict in comments]
    comm_dates = [comment_dict['committer_date'] for comment_dict in comments]
    comm_file_path = [comment_dict['file_path'] for comment_dict in comments]
    comm_vcs = ['{}/commit/{}'.format(comment_dict['vcs_url'].replace('.git', ''),
                                       comment_dict['commit_hash']) for comment_dict in comments]
    comm_hunk_old_line = [comment_dict['hunk_old_start'] for comment_dict in comments]
    comm_hunk_new_line = [comment_dict['hunk_new_start'] for comment_dict in comments]

    return pd.DataFrame(dict(
    ID=ids,
    FILE_PATH=comm_file_path,
    COMM_TYPE=comm_types,
    COMM_DATE=comm_dates,
    COMM_COMMIT_URL=comm_vcs,
    HUNK_OLD_LINE=comm_hunk_old_line,
    HUNK_NEW_LINE=comm_hunk_new_line,
    CONTENT=contents,
    PROJECT=project_names,

    
    ))

def get_strat_comment_df(comments: pd.DataFrame, sample_frac: float, seed: int) -> pd.DataFrame:
    shuffled_df = comments.sample(frac=1, random_state=seed)

    return shuffled_df.groupby('PROJECT', group_keys=False).apply(lambda x: stratify_prop_to_project_name(x, sample_frac))



# statistical sample size not needed as we do not use Cohens kappa on a sample of labeled data for agreement.
# check agreement with Cohens kappa and agreement level according to Fleiss (see duplicate SATD paper)
# Do need to get a statistical significant proportion of sample 

# C.I. = 5% and confidence level 95% then sample size 384..to get a statistically sign. sample for proportion p of SATD/non-SATD
# C.I. = 7% and confidence level 95% then sample size 196..to get a statistically sign. sample for proportion p of SATD/non-SATD
# C.I. = 8% and confidence level 95% then sample size 150..to get a statistically sign. sample for proportion p of SATD/non-SATD 
# C.I. = 10% and confidence level 95% then sample size 96..to get a statistically sign. sample for proportion p of SATD/non-SATD 
# start with 96 and see how it goes?
# can also compare p with proportions found in Guo et al and Maldonado datasets
stat_sign_sample_size = 95


# TODO get url to commit like this: <vcs_url without .git extension>/commit/commit_hash
# https://stackoverflow.com/questions/12214746/find-a-commit-on-github-given-the-commit-hash


# TODO create dataframe with rows for extra comment information to find context and understand it better
# use extra info rows and copy these to separate tab in google spreadsheets and link this extra info with comments through their ids (ID column)

# add file_path in the labeler's sheet as a column to more easily check the extension for a .java file!

comments_df = get_comment_df(comments)

random_seed_shuffle = 122324
sample_frac = stat_sign_sample_size / len(comments_df)
stratified_df = get_strat_comment_df(comments=comments_df, sample_frac=sample_frac, seed=random_seed_shuffle)

strat_project_count = len(stratified_df['PROJECT'].unique())
print(f'amount of project included in stratified sample: {strat_project_count}')

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(stratified_df)

ZeroDivisionError: division by zero

In [6]:
len(stratified_df)

95

In [7]:

sample_excel_path = Path(f'./data/adj_extractor_sample_comments_size_{stat_sign_sample_size}.xlsx')


stratified_df.to_excel(sample_excel_path, index=False)



In [23]:
from cmath import nan
from typing import Any, Union
import krippendorff

def label_converter(label: str) -> int:
    if label == 'satd':
        return 1
    elif label == 'non-satd':
        return 0
    else:
        return nan

satd_labeled_sample_path = Path('./data/SATD data quality check.xlsx')
labeled_sample_df = pd.read_excel(satd_labeled_sample_path, sheet_name='Comments Labeled',
    usecols='D:F',
    na_values="...",
    converters={0: label_converter, 1: label_converter, 2: label_converter},
    nrows=len(stratified_df))

reliability_data = [labeled_sample_df[column].tolist() for column in labeled_sample_df]

# gives back nan if whole row of one labeler is nan
krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal')


0.12344113295286419

In [21]:
labeled_sample_df


Unnamed: 0,labeler 1 (Nathan),labeler 2 (Alexander),labeler 3 (Twan)
0,,non-satd,non-satd
1,,non-satd,non-satd
2,,satd,non-satd
3,,satd,non-satd
4,,non-satd,non-satd
...,...,...,...
91,,satd,satd
92,,non-satd,non-satd
93,,satd,non-satd
94,,non-satd,non-satd
