In [1]:
# Display the path of the Python interpreter backing this kernel, so the reader
# can confirm which environment the notebook is running in.
import sys
sys.executable

'/home/jovyan/conda-envs/SATD_empirical_py3.10/bin/python'

# Download Datasets

In [39]:
# Uncomment to install gdown into the kernel's own environment (prefer %pip over !pip):
# %pip install gdown

Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting filelock (from gdown)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting requests[socks] (from gdown)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests[socks]->gdown)
  Downloading charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting urllib3<3,>=1.21.1 (from requests[socks]->gdown)
  Downloading urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Using cached PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Using ca

In [1]:
import gdown
import os

def download_from_google_drive(fileId, filePath):
    """Download a Google Drive file to a local path using gdown.

    Args:
        fileId: Google Drive file identifier; it is inserted into the
            ``https://drive.google.com/uc?id=...`` download URL.
        filePath: Destination path handed to ``gdown.download`` as the output.
    """
    gdown.download(
        f"https://drive.google.com/uc?id={fileId}",
        filePath,
        quiet=False,  # do not silence gdown's own output
    )

In [2]:
# Fetch the Python dataset files, skipping any that are already on disk

python_dir = "Datasets/Python"
os.makedirs(python_dir, exist_ok=True)

# Google Drive file id -> local destination path
fileId_filePath = {
    '1LFQarTywxsox1WAtD5BwGJl6GKIkRnrp': f'{python_dir}/df0.pkl.gz',
    '1GAW2DdF9bddxhPahNlf8VvqyWIHM9IbG': f'{python_dir}/df1.pkl.gz',
    '1aQ5Rpe-GI-Vqb5BCg8cz8bXXEKzKmguX': f'{python_dir}/df3.pkl.gz',
}

for fileId, filePath in fileId_filePath.items():
    if os.path.exists(filePath):
        # Nothing to do: this file was downloaded on a previous run.
        print(filePath, 'already exists.')
        continue
    download_from_google_drive(fileId, filePath)

Datasets/Python/df0.pkl.gz already exists.
Datasets/Python/df1.pkl.gz already exists.
Datasets/Python/df3.pkl.gz already exists.


In [3]:
# Fetch the Java dataset files, skipping any that are already on disk

java_dir = "Datasets/Java"
os.makedirs(java_dir, exist_ok=True)

# Google Drive file id -> local destination path
fileId_filePath = {
    '142ZPmt-RuAWrsxvEmNEsB0-vUn0fpvX9': f'{java_dir}/df0.pkl.gz',
    '1LH2HF5HkzpkuJJOsDkBivsqtZpFKtEmL': f'{java_dir}/df1.pkl.gz',
    '1t5Pf0f8NSygdNBgTxtdsPPmvGbmFlbFe': f'{java_dir}/df3.pkl.gz',
}

for fileId, filePath in fileId_filePath.items():
    if os.path.exists(filePath):
        # Nothing to do: this file was downloaded on a previous run.
        print(filePath, 'already exists.')
        continue
    download_from_google_drive(fileId, filePath)

Datasets/Java/df0.pkl.gz already exists.
Datasets/Java/df1.pkl.gz already exists.
Datasets/Java/df3.pkl.gz already exists.


# Apply Filtering Steps

In [4]:
import pandas as pd
import gzip

In [5]:
from enum import Enum

class MethodRetrievalApproach(Enum):
    """Enumeration of three method-retrieval approaches.

    NOTE(review): the visible cells never reference this class directly; it is
    presumably defined so that pickled dataset columns (e.g.
    'containing_method_applied_approach') can be unpickled -- confirm before
    renaming or changing any member value.
    """
    # Member meanings below are inferred from the names only -- TODO confirm.
    WITH_AST = 1      # presumably: containing method located using an AST
    WITHOUT_AST = 2   # presumably: containing method located without an AST
    AST_FALLBACK = 3  # presumably: one approach tried first, the other as fallback

In [6]:
# Selects which dataset the cells below load; it is interpolated into the
# 'Datasets/{DATASET}/...' file paths.
DATASET = 'Python' # Java or Python

In [7]:
# Load df0: the dataset that the filtering steps below start from.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The `with` block closes the gzip handle once the DataFrame has been read
# (previously the handle returned by gzip.open was never closed).
with gzip.open(f'Datasets/{DATASET}/df0.pkl.gz', 'rb') as df0_file:
    df = pd.read_pickle(df0_file)
print(len(df))
print(df.columns)

1607408
Index(['user', 'project', 'created_in_file', 'last_appeared_in_file',
       'created_in_line', 'last_appeared_in_line', 'created_in_commit',
       'deleted_in_commit', 'created_at_date', 'deleted_at_date', 'content',
       'deleted_in_lines', 'created_in_lines', 'updated_in_commits',
       'last_content', 'SATD_comment'],
      dtype='object')


In [49]:
# Count repositories as distinct (user, project) pairs, the same way the
# filtered dataset's repositories are counted at the end of this notebook.
# Counting 'project' alone would merge same-named projects owned by different users.
repository_count = len(df[['user', 'project']].drop_duplicates())
print(f'Number of repositories in the {DATASET} SATD dataset:', repository_count)
print("Note: Repositories in which the SATD Tracker did not find any SATD in their code have no rows in this dataset.")

Number of repositories in the Python SATD dataset: 11754
Note: Repositories in which the SATD Tracker did not find any SATD in their code have no rows in this dataset.


In [50]:
# Filter 1: keep only SATDs that were deleted (potentially repaid); drop the rest
was_deleted = df['deleted_in_commit'].notna()
df = df[was_deleted]
print(df.shape[0])

1059299


In [51]:
# Filter 2: keep only SATDs whose comment is at least three words long
comment_word_counts = df['SATD_comment'].map(lambda comment: len(comment.split()))
df = df[comment_word_counts >= 3]
print(df.shape[0])

949188


In [52]:
# Load df1: this version includes additional columns required by the next filters.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The `with` block closes the gzip handle once the DataFrame has been read
# (previously the handle returned by gzip.open was never closed).
with gzip.open(f'Datasets/{DATASET}/df1.pkl.gz', 'rb') as df1_file:
    df1 = pd.read_pickle(df1_file)
print(len(df1))
print(df1.columns)

949188
Index(['user', 'project', 'created_in_file', 'last_appeared_in_file',
       'created_in_line', 'last_appeared_in_line', 'created_in_commit',
       'deleted_in_commit', 'created_at_date', 'deleted_at_date', 'content',
       'deleted_in_lines', 'created_in_lines', 'updated_in_commits',
       'last_content', 'SATD_comment', 'containing_method_applied_approach',
       'containing_method_before_repayment',
       'containing_method_after_repayment', 'method_is_updated',
       'SATD_count_before_repayment', 'SATD_count_after_repayment'],
      dtype='object')


In [53]:
# Filter 3: keep only SATDs located inside a method
# (i.e. the containing method before repayment is non-empty)
before_method_length = df1["containing_method_before_repayment"].str.len()
df2 = df1[before_method_length.gt(0)]
print(df2.shape[0])

723258


In [54]:
# Filter 4: keep only SATDs whose containing method still exists after repayment
# (i.e. the containing method after repayment is non-empty)
after_method_length = df2["containing_method_after_repayment"].str.len()
df2 = df2[after_method_length.gt(0)]
print(df2.shape[0])

325031


In [55]:
# Filter 5: keep only SATDs whose containing method was updated after repayment
method_updated_mask = df2["method_is_updated"]
df2 = df2.loc[method_updated_mask]
print(df2.shape[0])

288007


In [56]:
# Filter 6: keep only SATDs that were the sole SATD in their method before
# repayment and whose method contains no SATD after repayment
single_satd_before = df2["SATD_count_before_repayment"].eq(1)
no_satd_after = df2["SATD_count_after_repayment"].eq(0)
df2 = df2[single_satd_before & no_satd_after]
print(df2.shape[0])

171825


In [57]:
# Filter 7: remove duplicates, keeping the first row for each distinct
# (method before repayment, method after repayment) pair
method_pair_columns = ['containing_method_before_repayment', 'containing_method_after_repayment']
is_repeated_pair = df2.duplicated(subset=method_pair_columns)
df2 = df2[~is_repeated_pair]
print(df2.shape[0])

143341


In [58]:
# Filter 8: Remove SATDs whose containing method (before or after repayment) contains non-ASCII characters
def contains_non_ascii(s: str) -> bool:
    """Return True if ``s`` has at least one character outside the ASCII range.

    An empty string contains no such character, so it yields False.
    """
    return not s.isascii()
# Apply the check to the method text before repayment first, then to the text after it.
for method_column in ('containing_method_before_repayment', 'containing_method_after_repayment'):
    has_non_ascii = df2[method_column].apply(contains_non_ascii).astype(bool)
    df2 = df2[~has_non_ascii]
print(df2.shape[0])

140929


In [59]:
# Filter 9: Keep SATDs whose containing method has at most 1,024 tokens both before and after repayment
import re
def split_to_tokens(text):
    """Split ``text`` into a list of tokens.

    Each run of word characters becomes one token, and every other
    non-whitespace character (e.g. punctuation) becomes a token of its own.
    Whitespace is discarded.
    """
    return re.findall(r"\w+|[^\w\s]", text, flags=re.UNICODE)

# Largest token count (inclusive) allowed for a method, both before and after repayment.
MAX_METHOD_TOKENS = 1024

# Add the two token-count columns with `assign`, which builds a new DataFrame.
# Assigning columns directly on a frame produced by earlier boolean filtering
# can trigger pandas' SettingWithCopyWarning.
df2 = df2.assign(
    method_tokens_before_repayment=df2['containing_method_before_repayment'].apply(
        lambda method_text: len(split_to_tokens(method_text))
    ),
    method_tokens_after_repayment=df2['containing_method_after_repayment'].apply(
        lambda method_text: len(split_to_tokens(method_text))
    ),
)

# Keep rows where both versions of the method fit within the token limit.
within_token_limit = (
    (df2['method_tokens_before_repayment'] <= MAX_METHOD_TOKENS)
    & (df2['method_tokens_after_repayment'] <= MAX_METHOD_TOKENS)
)
df2 = df2[within_token_limit]
print(len(df2))

131945


In [60]:
# Load df3: this version includes the Llama3-70b label column required by the last filter.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The `with` block closes the gzip handle once the DataFrame has been read
# (previously the handle returned by gzip.open was never closed).
with gzip.open(f'Datasets/{DATASET}/df3.pkl.gz', 'rb') as df3_file:
    df3 = pd.read_pickle(df3_file)
print(len(df3))
print(df3.columns)

131945
Index(['rand_index', 'user', 'project', 'created_in_file',
       'last_appeared_in_file', 'created_in_line', 'last_appeared_in_line',
       'created_in_commit', 'deleted_in_commit', 'created_at_date',
       'deleted_at_date', 'content', 'deleted_in_lines', 'created_in_lines',
       'updated_in_commits', 'last_content', 'SATD_comment',
       'containing_method_before_repayment',
       'containing_method_applied_approach',
       'containing_method_after_repayment', 'method_is_updated',
       'SATD_count_before_repayment', 'SATD_count_after_repayment',
       'method_tokens_before_repayment', 'method_tokens_after_repayment',
       'prompt', 'is_repayment_llama3'],
      dtype='object')


In [61]:
# Show how many rows carry each value of the is_repayment_llama3 label
label_counts = df3['is_repayment_llama3'].value_counts()
print(label_counts)

is_repayment_llama3
yes        58722
no         47394
unclear    24429
Name: count, dtype: int64


In [62]:
# Filter 10: keep only items labelled 'yes', i.e. drop those whose method
# update is not related to SATD repayment
labelled_as_repayment = df3['is_repayment_llama3'].eq('yes')
df3 = df3[labelled_as_repayment]
print(df3.shape[0])

58722


In [63]:
# Number of repositories (distinct user/project pairs) with at least one SATD
# remaining in the filtered dataset
len(df3[['user', 'project']].drop_duplicates())

7219