In [4]:
import pymongo
import random
import pandas as pd
import configparser
from pathlib import Path
from typing import Any

# check average length of content
# check min and max lengths of content 

# Parse the shared properties file, resolved relative to the working directory.
# RawConfigParser is used, so values are returned without interpolation.
# NOTE: read() silently skips files it cannot open; a wrong path only shows up
# later as a NoSectionError on the first config.get().
config = configparser.RawConfigParser()
config.read([Path("../application.properties")])

def get_mongoclient(config: configparser.RawConfigParser) -> pymongo.MongoClient:
    """Create a MongoDB client from the ``DatabaseSection`` of the parsed properties.

    The user, password, hostname, port and options entries are read and
    assembled into a ``mongodb://`` connection string.

    Args:
        config: Parsed properties that contain a ``DatabaseSection`` section.

    Returns:
        A client configured with a 5000 ms server-selection timeout and the
        ``'ignore'`` unicode decode error handler.

    Raises:
        configparser.NoSectionError: If ``DatabaseSection`` is missing.
        configparser.NoOptionError: If one of the ``mongodb.*`` keys is missing.
    """
    section = 'DatabaseSection'
    setting_keys = (
        'mongodb.user',
        'mongodb.password',
        'mongodb.hostname',
        'mongodb.port',
        'mongodb.options',
    )
    user, password, hostname, port, options = [
        config.get(section, key) for key in setting_keys
    ]

    # NOTE(review): the credentials are inserted verbatim; pymongo expects user and
    # password inside a URI to be percent-escaped -- confirm the stored values are.
    conn_str = f"mongodb://{user}:{password}@{hostname}:{port}/{options}"

    return pymongo.MongoClient(
        conn_str,
        serverSelectionTimeoutMS=5000,
        unicode_decode_error_handler='ignore',
    )

# Look up the comments database and fetch every document whose "filtered" flag is False.
# The whole cursor is materialised in memory as a list of dicts.
comments_db = config.get('DatabaseSection', 'mongodb.database.comments')
db = get_mongoclient(config=config)[comments_db]
comments = [document for document in db["comments"].find({"filtered": False})]


In [7]:
# Number of comment documents with "filtered" == False that were loaded.
len(comments)

737005

In [5]:
from typing import Dict, List

def stratify_prop_to_project_name(df: pd.DataFrame, sample_frac: float) -> pd.DataFrame:
    """Draw the sample for one project's group of comments.

    Exactly one row is drawn with a fixed seed, so the same input always
    yields the same row.

    Args:
        df: Rows belonging to a single project.
        sample_frac: Currently unused; kept so existing callers keep working.
            A proportional draw would take ``max(1, int(sample_frac * len(df)))``
            rows instead of one.

    Returns:
        A one-row DataFrame sampled from ``df``.

    Raises:
        ValueError: If ``df`` is empty (raised by ``DataFrame.sample``).
    """
    random_seed_strat = 2802234

    # No lookup of df['PROJECT'] is needed here: the function must also work when
    # the grouping column is absent from the group frame it receives.
    return df.sample(1, random_state=random_seed_strat)

def get_rand_item(comments: List[Dict]) -> List[Dict]:
    """Pick one comment reproducibly.

    Args:
        comments: Sequence of comment documents to choose from.

    Returns:
        A list holding the single chosen comment (the shape ``random.sample``
        produces); the pick is identical on every call for the same input.

    Raises:
        ValueError: If ``comments`` is empty.
    """
    # A private generator gives the same pick as seeding the module-level one,
    # without resetting the global random state other code may rely on.
    return random.Random(26225).sample(comments, 1)

def get_comment_df(comments: List[Dict]) -> pd.DataFrame:
    """Flatten comment documents into a DataFrame for labelling.

    Args:
        comments: Comment documents; each must provide the keys ``_id``,
            ``content``, ``project_name``, ``type``, ``committer_date``,
            ``file_path``, ``vcs_url``, ``commit_hash``, ``hunk_old_start``
            and ``hunk_new_start``.

    Returns:
        One row per comment with the columns ID, FILE_PATH, COMM_TYPE,
        COMM_DATE, COMM_COMMIT_URL, HUNK_OLD_LINE, HUNK_NEW_LINE, CONTENT and
        PROJECT. COMM_COMMIT_URL has the form
        ``<vcs_url without trailing .git>/commit/<commit_hash>``.

    Raises:
        KeyError: If a document lacks one of the keys listed above.
    """
    git_suffix = '.git'

    def commit_url(comment: Dict) -> str:
        # Strip only a trailing ".git": removing every occurrence would corrupt
        # repository names that contain it elsewhere (e.g. "foo.github.io.git").
        repo_url = comment['vcs_url']
        if repo_url.endswith(git_suffix):
            repo_url = repo_url[:-len(git_suffix)]
        return '{}/commit/{}'.format(repo_url, comment['commit_hash'])

    return pd.DataFrame({
        'ID': [comment['_id'] for comment in comments],
        'FILE_PATH': [comment['file_path'] for comment in comments],
        'COMM_TYPE': [comment['type'] for comment in comments],
        'COMM_DATE': [comment['committer_date'] for comment in comments],
        'COMM_COMMIT_URL': [commit_url(comment) for comment in comments],
        'HUNK_OLD_LINE': [comment['hunk_old_start'] for comment in comments],
        'HUNK_NEW_LINE': [comment['hunk_new_start'] for comment in comments],
        'CONTENT': [comment['content'] for comment in comments],
        'PROJECT': [comment['project_name'] for comment in comments],
    })

def get_strat_comment_df(comments: pd.DataFrame, sample_frac: float, seed: int) -> pd.DataFrame:
    """Shuffle the comments and sample each project's group.

    Args:
        comments: Comment frame with a ``PROJECT`` column.
        sample_frac: Forwarded unchanged to the per-project sampler.
        seed: Random state used for the initial full shuffle.

    Returns:
        The combined per-project samples, without group keys added to the index.
    """
    def sample_one_project(project_rows: pd.DataFrame) -> pd.DataFrame:
        # Delegate to the per-project sampler with the caller's fraction.
        return stratify_prop_to_project_name(project_rows, sample_frac)

    shuffled = comments.sample(frac=1, random_state=seed)
    per_project = shuffled.groupby('PROJECT', group_keys=False)
    return per_project.apply(sample_one_project)



# A statistically derived sample size is not needed for the agreement check, as we do not use Cohen's kappa on a sample of labeled data for agreement.
# Check agreement with Cohen's kappa and the agreement level according to Fleiss (see duplicate SATD paper).
# We do need a statistically significant sample to estimate the proportion p of SATD/non-SATD.

# Sample sizes below follow n = z^2 * p(1-p) / e^2 with z = 1.96 (95% confidence), p = 0.5 and e the confidence interval (margin of error):
# C.I. = 5% and confidence level 95% -> sample size 384 for a statistically significant estimate of the proportion p of SATD/non-SATD
# C.I. = 7% and confidence level 95% -> sample size 196 for a statistically significant estimate of the proportion p of SATD/non-SATD
# C.I. = 8% and confidence level 95% -> sample size 150 for a statistically significant estimate of the proportion p of SATD/non-SATD
# C.I. = 10% and confidence level 95% -> sample size 96 for a statistically significant estimate of the proportion p of SATD/non-SATD
# Start with 96 and see how it goes?
# We can also compare p with the proportions found in the Guo et al. and Maldonado datasets.
# Target sample size and the seed for the full shuffle.
# NOTE(review): the sample-size notes suggest 96 while this is 95. In the visible
# cells the value only feeds sample_frac and the Excel file name -- confirm.
stat_sign_sample_size = 95
random_seed_shuffle = 122324


# TODO get url to commit like this: <vcs_url without .git extension>/commit/commit_hash
# https://stackoverflow.com/questions/12214746/find-a-commit-on-github-given-the-commit-hash


# TODO create dataframe with rows for extra comment information to find context and understand it better
# use extra info rows and copy these to separate tab in google spreadsheets and link this extra info with comments through their ids (ID column)

# add file_path in the labeler's sheet as a column to more easily check the extension for a .java file!

# Flatten the loaded documents into a frame, then shuffle and sample per project.
comments_df = get_comment_df(comments)

sample_frac = stat_sign_sample_size / comments_df.shape[0]
stratified_df = get_strat_comment_df(comments_df, sample_frac, random_seed_shuffle)

# Count the distinct projects in the sample (a missing value counts as one project).
strat_project_count = stratified_df['PROJECT'].nunique(dropna=False)
print(f'amount of project included in stratified sample: {strat_project_count}')

# Lift the row/column display limits for this one output so the whole sample is shown.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(stratified_df)

amount of project included in stratified sample: 95


Unnamed: 0,ID,FILE_PATH,COMM_TYPE,COMM_DATE,COMM_COMMIT_URL,HUNK_OLD_LINE,HUNK_NEW_LINE,CONTENT,PROJECT
431322,63fbac50ceddb47ddd8f69fa,activemq-unit-tests/src/test/java/org/apache/a...,GROUPED_LINE,2019-11-18 05:50:37,https://github.com/apache/activemq/commit/a59e...,1328,1420,Setup the producer and send the message.,activemq
611997,63fbaf8dceddb47ddd9420e6,modules/sharing-registry/sharing-registry-stub...,GROUPED_LINE,2017-04-03 11:50:05,https://github.com/apache/airavata/commit/5100...,0,1,required,airavata
20845,63fba9beceddb47ddd8561e1,archiva-modules/archiva-base/archiva-repositor...,GROUPED_LINE,2017-08-19 13:37:35,https://github.com/apache/archiva/commit/7a271...,25,25,www.springframework.org/schema/beans/spring-b...,archiva
730158,63fbb164ceddb47ddd97e711,lang/java/tools/src/main/resources/META-INF/NO...,GROUPED_LINE,2019-06-20 19:39:11,https://github.com/apache/avro/commit/70260919...,76,76,www.apache.org/).,avro
21906,63fba9beceddb47ddd8567fe,bigtop-packages/src/common/ambari/ODPi/1.0/ser...,BLOCK,2017-03-22 07:08:08,https://github.com/apache/bigtop/commit/ba8d7f...,0,1,!40101 SET character_set_client = utf8,bigtop
16866,63fba9b9ceddb47ddd853fa7,core/src/test/java/org/apache/calcite/test/Jdb...,GROUPED_LINE,2017-07-25 05:19:12,https://github.com/apache/calcite/commit/0cbd2...,5536,5537,"as previous, but implicit schema",calcite
18902,63fba9b9ceddb47ddd854f04,docs/asciidoc/cayenne-guide/src/docs/asciidoc/...,GROUPED_LINE,2018-01-19 13:21:30,https://github.com/apache/cayenne/commit/3fa68...,0,1,add Cayenne Gradle Plugin,cayenne
22079,63fba9beceddb47ddd85691f,README.md,GROUPED_LINE,2019-01-23 20:51:26,https://github.com/apache/commons-bcel/commit/...,46,46,travis-ci.org/apache/commons-bcel.svg)](https...,commons-bcel
22489,63fba9beceddb47ddd856c86,src/test/java/org/apache/commons/beanutils2/lo...,GROUPED_LINE,2017-12-27 21:27:35,https://github.com/apache/commons-beanutils/co...,0,1,Perform Tests,commons-beanutils
22674,63fba9beceddb47ddd856d91,src/main/java/org/apache/commons/codec/languag...,GROUPED_LINE,2017-04-02 22:41:29,https://github.com/apache/commons-codec/commit...,234,322,these are ignored completely,commons-codec


In [6]:
# Number of rows in the stratified sample.
len(stratified_df)

95

In [None]:

# Export the stratified sample to Excel; the sample size is part of the file name.
sample_excel_path = Path(f'./data/adj_extractor_sample_comments_size_{stat_sign_sample_size}.xlsx')

# to_excel does not create missing directories, so make sure ./data exists first.
sample_excel_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame index out of the sheet.
stratified_df.to_excel(sample_excel_path, index=False)



In [23]:
from cmath import nan
from typing import Any, Union
import krippendorff

def label_converter(label: Any) -> Union[int, float]:
    """Map a labeler's spreadsheet entry to a numeric class.

    Matching ignores surrounding whitespace and letter case, so entries such as
    ``' SATD'`` or ``'Non-SATD'`` are not silently treated as missing.

    Args:
        label: Raw cell content; anything that is not a string counts as missing.

    Returns:
        ``1`` for ``'satd'``, ``0`` for ``'non-satd'``, otherwise NaN (a float).
    """
    # float('nan') is used for "missing" so the function needs no extra import.
    missing = float('nan')
    if not isinstance(label, str):
        return missing

    codes = {'satd': 1, 'non-satd': 0}
    return codes.get(label.strip().lower(), missing)

# Read the labeler columns (D:F) of the "Comments Labeled" sheet, limited to as many
# rows as the stratified sample has; "..." cells are treated as missing.
satd_labeled_sample_path = Path('./data/SATD data quality check.xlsx')
# NOTE(review): the converter keys are positional (0, 1, 2); confirm that together with
# usecols they address the selected columns D:F and not sheet columns A:C.
labeled_sample_df = pd.read_excel(
    satd_labeled_sample_path,
    sheet_name='Comments Labeled',
    usecols='D:F',
    na_values="...",
    converters=dict.fromkeys(range(3), label_converter),
    nrows=stratified_df.shape[0],
)

# One list of ratings per column of the frame.
reliability_data = [ratings.tolist() for _, ratings in labeled_sample_df.items()]

# gives back nan if whole row of one labeler is nan
krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='nominal')


0.12344113295286419

In [21]:
# Display the labels that were read from the spreadsheet (one column per labeler).
labeled_sample_df


Unnamed: 0,labeler 1 (Nathan),labeler 2 (Alexander),labeler 3 (Twan)
0,,non-satd,non-satd
1,,non-satd,non-satd
2,,satd,non-satd
3,,satd,non-satd
4,,non-satd,non-satd
...,...,...,...
91,,satd,satd
92,,non-satd,non-satd
93,,satd,non-satd
94,,non-satd,non-satd


In [22]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# from selenium import webdriver
# from selenium.webdriver.common.keys import Keys

import requests
from bs4 import BeautifulSoup
import time

# Set the GitHub repository and commit hash
repo = "apache/commons-rdf"
commit_hash = "92998b329182724789f44272f60538dc195b8df3"

# Changed file whose diff we want to link to, and the line number appended to the link.
target_file_path = "api/src/test/java/org/apache/commons/rdf/api/AbstractGraphTest.java"
target_line = 495

# Set the string to search for within the diff
search_string = "Check with Triple"

# Web page of the commit on github.com (this is not the GitHub REST API).
url = f"https://github.com/{repo}/commit/{commit_hash}"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(url)

    # Wait (up to 10 s) until the link carrying the target file path is present.
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.presence_of_element_located((By.LINK_TEXT, target_file_path)))

    html = driver.page_source
finally:
    # Always close the browser, also when the wait times out or the page fails to load.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")
diff_anchors = soup.find_all("a")

for anchor in diff_anchors:
    if anchor.text == target_file_path:
        # The anchor's href is appended to the commit URL, followed by "R<line>".
        # NOTE(review): "R" presumably selects the new (right-hand) side of the diff -- confirm.
        print(f'https://github.com/{repo}/commit/{commit_hash}{anchor["href"]}R{target_line}')
        break
# TODO check what happens if a commit has a lot of changed lines and some diffs are collapsed:
# can this method then still bring you to the desired line number?

# TODO some commits still seem to have the wrong link connected to them?

# In Studio 3T the hunk retrieval aggregate cannot be run when the date is in 2016 and the in-between match statements are present!
# If $out is not activated it still works for some reason.
# It seems to be working again now as well, for no apparent reason.


https://github.com/apache/commons-rdf/commit/92998b329182724789f44272f60538dc195b8df3#diff-dac69938e4093eccc15dd1907659090c2b72963a19e747a55d0f38d90e46e556R495
