# FineCatIss

# TODOs:

- Implement exporting finetune model so it can be re-used for future runs iff export_finetuned_model is true
- Implement adding PREDICTED_LABEL key to dictionary (json) if it is missing
- Make os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, REPO_DATA_CONFIG_FILE_NAME) default config file for repo data config to avoid this output: No config file given. Trying to load config from "/home/jovyan/config/repo_data_config.toml"
- Add different data filtering options for train, eval and test data - currently issues are always assigned to the test set unless "predicted_label" is not None (OPTIONAL, low prio)
- Option to ignore issues with predicted labels for fine-tuning as this could lead to worse performance
  - not that important, as issues with predictions have a predicted_label that is not Null and will therefore get filtered out... this would only matter if the same labels as usual are used for predicted labels and the data is then rebuilt from scratch, so that predicted_label is always Null again
- implement adding exported predicted labels to a json file to cover cases where data is gathered from scratch, so predictions aren't lost?
- Implement checks whether all required data exists; for data that can be calculated, calculate it if it is missing
- Implement predetermined breaking points (Sollbruchstellen) if required data is missing or the script can't continue for some reason
- Issues can be deleted... should there be a check if any issues got deleted, so they get deleted from the data json file as well? and how to handle edits?
    - when fetching issues one could add a dict with all issue numbers, whose value is set to True if the issue already exists (as opposed to merely continuing with the next one) or after it got fetched; all issues whose value is still False afterwards got deleted... we could add a field that saves this info and also an option in a config to delete all of these issues
- Should new issues be fetched every time? (new issues meaning issues with a higher number than the highest one in current data set)
- What if all issues are unlabeled? Then fine-tuning should be skipped... make sure it works like that!
- Issue data can now either be a list or dictionary... come up with a way to work with that or enforce choosing either!
- Implement that connection to github repo isn't required if issues_data (json file containing it) is given
- GitHub:
    - README
    - Requirements.txt

## Imports required for both

In [None]:
# pip install PyGithub
from github import Github, RateLimitExceededException
from github.Repository import Repository
# conda install python-dotenv
from dotenv import load_dotenv

In [None]:
import os
import logging
import logging.config
import json
import re
from datetime import datetime

In [None]:
# conda install pandas
import pandas as pd
# conda install numpy
import numpy as np
# conda install toml
import toml
# conda install PyYAML
import yaml
# conda install matplotlib
import matplotlib.pyplot as plt

## Imports required for dashboard

In [None]:
import matplotlib.dates as mdates
import time

## Imports required for Issue Label Classification

In [None]:
# pip install simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
# pip install transformers[torch]
from transformers import RobertaForSequenceClassification, RobertaTokenizer, get_linear_schedule_with_warmup

In [None]:
import sklearn
# from sklearn.preprocessing import LabelEncoder # TODO: implement LabelEncoder instead of using pandas.Series.cat.codes since that is less error prone - TODO: verify that the encodings stay the same
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim import AdamW
from functools import partial
import unicodedata as ud
import unittest
from unittest.mock import patch, MagicMock
from tqdm.auto import tqdm

### Some basic setup things

In [None]:
# FULL = "full"
# PARTIAL = "partial"
# ISSUES = "issues"
# PRS = "pull_requests"

In [None]:
# function_requirements = {
#     "first_prediction": {
#         ISSUES: {"number": FULL, "title": FULL, "user": FULL, "description": PARTIAL, "labels": PARTIAL},
#         PRS: {}
#     },
#     "prediction_without_duplicates": {
#         ISSUES: {"number": FULL, "title": FULL, "user": FULL, "description": PARTIAL, "labels": PARTIAL, "predicted_label": PARTIAL},
#         PRS: {}
#     },
#     "function_3": {"data_key_1": "full", "data_key_4": "partial"},
# }

In [None]:
# data_availability_dicts = {
#     ISSUES: {
#         "fully_available": data_type_1_fully_available,
#         "partially_available": data_type_1_partially_available,
#     },
#     PRS: {
#         "fully_available": data_type_2_fully_available,
#         "partially_available": data_type_2_partially_available,
#     },
# }

In [None]:
# def check_function_executability(function_requirements, data_availability_dicts):
#     executable_functions = {}

#     for func, data_requirements in function_requirements.items():
#         is_executable = True  # Assume executable unless a requirement fails

#         for data_type, requirements in data_requirements.items():
#             fully_available = data_availability_dicts[data_type]["fully_available"]
#             partially_available = data_availability_dicts[data_type]["partially_available"]

#             for key, required_availability in requirements.items():
#                 if required_availability == "full":
#                     if not fully_available.get(key, False):
#                         is_executable = False
#                         break
#                 elif required_availability == "partial":
#                     if not (fully_available.get(key, False) or partially_available.get(key, False)):
#                         is_executable = False
#                         break

#             if not is_executable:
#                 break  # Stop checking other data types if one fails

#         executable_functions[func] = "executable" if is_executable else "not_executable"

#     return executable_functions

In [None]:
# result = check_function_executability(function_requirements, data_availability_dicts)

# for func, status in result.items():
#     logger.info(f"{func}: {status}")

In [None]:
# import sys
# sys.exit(0)

In [None]:
# Logging level passed to logging.basicConfig and used as the fallback level in setup_logging.
LOGGING_LEVEL = logging.INFO

# The following two values can only be changed here, as otherwise the configs couldn't be loaded
CONFIG_FOLDER = "config"
CONFIG_FILE_NAME = "config.toml"

# Names of the three issue categories.
BUG = "bug"
ENHANCEMENT = "enhancement"
SUPPORT = "support"

# Label names used for predictions; initially identical to the category names above.
PREDICTION_BUG_LABEL = BUG
PREDICTION_ENHANCEMENT_LABEL = ENHANCEMENT
PREDICTION_SUPPORT_LABEL = SUPPORT

In [None]:
# Do not truncate column contents when pandas displays DataFrames.
pd.set_option("display.max_colwidth", None)
logging.basicConfig(level=LOGGING_LEVEL)
# Only let warnings and above through from the "transformers" logger.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
# Select the "file_system" sharing strategy for torch multiprocessing.
torch.multiprocessing.set_sharing_strategy("file_system")
# Check if CUDA is available
cuda_available = torch.cuda.is_available()
logging.info(f"CUDA Available: {cuda_available}")
# Get the name of the GPU (device index 0); logged at debug level only
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    logging.debug(f"GPU: {gpu_name}")

In [None]:
# Display the current working directory as the cell output.
os.getcwd()

In [None]:
# NOTEBOOKS_FOLDER = "notebooks"
# Use the name of the current working directory (its last path component) as the notebooks folder.
NOTEBOOKS_FOLDER = os.path.basename(os.getcwd())

In [None]:
# Display the detected notebooks folder name as the cell output.
NOTEBOOKS_FOLDER

In [None]:
def remove_last_folder(path):
    """Return ``path`` with its final component stripped off (i.e. the parent directory)."""
    parent, _last_component = os.path.split(path)
    return parent

# # Since this notebook is in the notebooks folder, we need to remove the notebooks folder from the path to get the project folder
# PROJECT_FOLDER = ""
# cwd = os.getcwd()
# if (NOTEBOOKS_FOLDER in cwd): # TODO: only look at last folder instead of whole path? (as theoretically the name of the notebook folder could be part of the prior path)
#     PROJECT_FOLDER = remove_last_folder(os.getcwd())
# else:
#     PROJECT_FOLDER = cwd

# The project folder is the parent of the current working directory.
# NOTE(review): this assumes the notebook is always started from a direct sub-folder of the project - confirm.
PROJECT_FOLDER = remove_last_folder(os.getcwd())

In [None]:
# Display the resolved project folder as the cell output.
PROJECT_FOLDER

## Data structure initialization

## Configuration

In [None]:
# Load environment variables from the .env file
load_dotenv()

# Get GITHUB_PAT from the environment (populated from the .env file); empty string if it is not set
GITHUB_PAT = os.getenv("GITHUB_PAT", "")

# GITHUB_TOKEN = rf"{remove_last_folder(PROJECT_FOLDER)}\private\pat.txt" # TODO: remove and make sure that better solution does work

# with open(GITHUB_TOKEN) as file:
#     GITHUB_PAT = file.read().strip()

### Basic config (config.toml)

In [None]:
# Initial values for the main configuration. load_main_config() declares all of
# these names as globals and overwrites them with values from the config file / DEFAULT_CONFIG.

# selected_repository
USER_NAME = "vaadin"
REPO_NAME = "flow"
FETCH_NEW_ISSUES = True

# logging
LOGGER_NAME = "StandardLogger"
LOG_FILE_NAME = "app.log"

# folder_structure
DATA_FOLDER = "data"
MODEL_FOLDER = "model"
LOGS_FOLDER = "logs"
RESULTS_FOLDER = "results"

# config_files
LOGGING_CONFIG_FILE_NAME = "logging_config.yaml"
REPO_DATA_CONFIG_FILE_NAME = "repo_data_config.toml"
LABEL_CONFIG_FILE_NAME = "label_config.toml"
ML_CONFIG_FILE_NAME = "ml_config.toml"

In [None]:
# Default values in case keys are missing
# Structure: section name -> {setting name -> default value}; load_main_config() falls back
# to these values for every section/key that the config file does not provide.
# NOTE(review): two defaults differ from the module-level initial values:
#   "model_folder" is "models" here but MODEL_FOLDER starts out as "model", and
#   "repo_data_config_file_name" is "repo_data.toml" here but REPO_DATA_CONFIG_FILE_NAME
#   starts out as "repo_data_config.toml" - confirm which values are intended.
DEFAULT_CONFIG = {
    "selected_repository": {
        "user_name": "vaadin",
        "repo_name": "flow",
        "fetch_new_issues": True
    },
    "logging": {
        "logger_name": "StandardLogger",
        "log_file_name": "app.log"
    },
    "folder_structure": {
        "data_folder": "data",
        "logs_folder": "logs",
        "model_folder": "models",
        "notebooks_folder": "notebooks",
        "results_folder": "results"
    },
    "config_files": {
        "logging_config_file_name": "logging_config.yaml",
        "repo_data_config_file_name": "repo_data.toml",
        "label_config_file_name": "label_config.toml",
        "ml_config_file_name": "ml_config.toml"
    }
}

In [None]:
def load_main_config(config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, CONFIG_FILE_NAME)):
    """
    Load the main configuration and store it in the module-level settings.

    The TOML file at ``config_file`` is read if it exists; otherwise a warning
    is logged and ``DEFAULT_CONFIG`` is used. Sections and keys that the file
    does not provide fall back to their ``DEFAULT_CONFIG`` values.

    Args:
        config_file: Path of the TOML configuration file.
    """
    global USER_NAME, REPO_NAME, FETCH_NEW_ISSUES
    global LOGGER_NAME, LOG_FILE_NAME
    global DATA_FOLDER, LOGS_FOLDER, MODEL_FOLDER, NOTEBOOKS_FOLDER, RESULTS_FOLDER
    global LOGGING_CONFIG_FILE_NAME, REPO_DATA_CONFIG_FILE_NAME, LABEL_CONFIG_FILE_NAME, ML_CONFIG_FILE_NAME

    if os.path.exists(config_file):
        with open(config_file, "r", encoding="utf-8-sig") as file:
            loaded = toml.load(file)
    else:
        logging.warning(f"Config file {config_file} not found. Using default values.")
        loaded = DEFAULT_CONFIG

    # Sections that are missing in the file are taken over completely from the defaults
    merged = {**DEFAULT_CONFIG, **loaded}

    def setting(section: str, key: str):
        """Return one configured value, or its default if the key is missing."""
        return merged.get(section, {}).get(key, DEFAULT_CONFIG[section][key])

    # Selected repository
    USER_NAME = setting("selected_repository", "user_name")
    REPO_NAME = setting("selected_repository", "repo_name")
    FETCH_NEW_ISSUES = setting("selected_repository", "fetch_new_issues")

    # Logging
    LOGGER_NAME = setting("logging", "logger_name")
    LOG_FILE_NAME = setting("logging", "log_file_name")

    # Folder structure
    DATA_FOLDER = setting("folder_structure", "data_folder")
    LOGS_FOLDER = setting("folder_structure", "logs_folder")
    MODEL_FOLDER = setting("folder_structure", "model_folder")
    NOTEBOOKS_FOLDER = setting("folder_structure", "notebooks_folder")
    RESULTS_FOLDER = setting("folder_structure", "results_folder")

    # Names of the other config files
    LOGGING_CONFIG_FILE_NAME = setting("config_files", "logging_config_file_name")
    REPO_DATA_CONFIG_FILE_NAME = setting("config_files", "repo_data_config_file_name")
    LABEL_CONFIG_FILE_NAME = setting("config_files", "label_config_file_name")
    ML_CONFIG_FILE_NAME = setting("config_files", "ml_config_file_name")

    logging.info("Configuration successfully loaded.")

In [None]:
# Overwrite the module-level settings with the values from config.toml (or the defaults).
load_main_config()

In [None]:
# Make sure the working folders exist. The configured names are used as given,
# so relative names are resolved against the current working directory.
for _folder in (DATA_FOLDER, LOGS_FOLDER, MODEL_FOLDER, RESULTS_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# "Touch" the log file: append mode creates it if it is missing, utime refreshes its timestamps.
with open(LOG_FILE_NAME, 'a', encoding="utf-8-sig"):
    os.utime(LOG_FILE_NAME, None)

### ML parameters (Do not change as this can negatively impact or break the model!)

In [None]:
# Length limits for issue title and body.
# NOTE(review): the unit (words, tokens, characters) is not visible in this section - confirm at the point of use.
MAX_TITLE_LENGTH = 30
MAX_BODY_LENGTH = 170
# Set of punctuation characters; how they are applied is outside this section.
PUNCTUATIONS = r'!"$%&\()*,/:;<=>[\\]^`{|}~+#@-`'
# Pattern for "#" followed by one or more digits (issue references such as "#123").
ISSUE_REGEX = r"#[0-9]+"
# Pattern for call-like text: an identifier (letters, digits, "_", ".") followed by a parenthesised argument list.
FUNCTION_REGEX = r"[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)"
# Pattern for any character outside the 7-bit ASCII range.
ASCII_REGEX = r"[^\x00-\x7f]"

### ML config

In [None]:
# Initial ML settings.
# NOTE(review): every name in this cell is assigned again from DEFAULT_ML_CONFIG in the
# following cell (partly with different values, e.g. FINETUNED_MODEL_FOLDER and
# UPDATE_ISSUE_DATA), so the values here are overwritten - confirm whether this cell is still needed.

# Model selection
USE_FINETUNED_MODEL = False
PRETRAINED_MODEL_FILE_NAME = "pytorch_model.bin"
FINETUNED_MODEL_FOLDER = "roberta"
FINETUNED_MODEL_FILE_NAME = ""

# Run modes
EVALUATION = True
TEST = True

# Validation split settings
VALIDATION_SPLIT_PERCENTAGE = 0.2
RANDOM_STATE = None

# Key names of the issue data
NUMBER = "number"

LABEL = "labels"
TIME = "created_at"
REPO = "repository_url"
TITLE = "title"
BODY = "description"
AUTHOR = "author_association"
URL = "url"
PREDICTED_LABEL = "predicted_label"

# Column names
LABEL_COL = "labels"
TEXT_COL = "text"

# Export / update switches
EXPORT_ML_DATA_CSVS = False
EXPORT_FINETUNED_MODEL = False # TODO: Implement!!!
EXPORT_PREDICTIONS = True
UPDATE_ISSUE_DATA = True
UPDATE_GITHUB_LABELS = False

In [None]:
# Default ML configuration
# Structure: "model" and "mode" hold flat settings; "data" holds the nested
# sections "keys" (key/column names of the issue data) and "export" (export/update switches).
# load_ml_config() falls back to these values for everything the config file does not provide.
DEFAULT_ML_CONFIG = {
    "model": {
        "use_finetuned_model": False,
        "pretrained_model_file_name": "pytorch_model.bin",
        "finetuned_model_folder": "",
        "finetuned_model_file_name": ""
    },
    "mode": {
        "evaluation": True,
        "test": True,
        "validation_split_percentage": 0.2,
        "random_state": None
    },
    "data": {
        "keys": {
            "number": "number",
            "label": "labels",
            "time": "created_at",
            "repo": "repository_url",
            "title": "title",
            "body": "description",
            "author": "author_association",
            "url": "url",
            "predicted_label": "predicted_label",
            "label_col": "labels",
            "text_col": "text"
        },
        # "preprocessing": {
        #     "max_title_length": 30,
        #     "max_body_length": 170,
        #     "punctuations": '!"$%&\\()*,/:;<=>[\\]^`{|}~+#@-`',
        #     "issue_regex": r"#[0-9]+",
        #     "function_regex": r"[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)",
        #     "ascii_regex": r"[^\x00-\x7f]"
        # },
        "export": {
            "export_ml_data_csvs": True,
            "export_finetuned_model": False,
            "export_predictions": True,
            "update_issue_data": False,
            "update_github_labels": False
        }
    }
}

# Global variables (initialized with default values)
USE_FINETUNED_MODEL = DEFAULT_ML_CONFIG["model"]["use_finetuned_model"]
PRETRAINED_MODEL_FILE_NAME = DEFAULT_ML_CONFIG["model"]["pretrained_model_file_name"]
FINETUNED_MODEL_FOLDER = DEFAULT_ML_CONFIG["model"]["finetuned_model_folder"]
FINETUNED_MODEL_FILE_NAME = DEFAULT_ML_CONFIG["model"]["finetuned_model_file_name"]

EVALUATION = DEFAULT_ML_CONFIG["mode"]["evaluation"]
TEST = DEFAULT_ML_CONFIG["mode"]["test"]
VALIDATION_SPLIT_PERCENTAGE = DEFAULT_ML_CONFIG["mode"]["validation_split_percentage"]
RANDOM_STATE = DEFAULT_ML_CONFIG["mode"]["random_state"]

NUMBER = DEFAULT_ML_CONFIG["data"]["keys"]["number"]
LABEL = DEFAULT_ML_CONFIG["data"]["keys"]["label"]
TIME = DEFAULT_ML_CONFIG["data"]["keys"]["time"]
REPO = DEFAULT_ML_CONFIG["data"]["keys"]["repo"]
TITLE = DEFAULT_ML_CONFIG["data"]["keys"]["title"]
BODY = DEFAULT_ML_CONFIG["data"]["keys"]["body"]
AUTHOR = DEFAULT_ML_CONFIG["data"]["keys"]["author"]
URL = DEFAULT_ML_CONFIG["data"]["keys"]["url"]
PREDICTED_LABEL = DEFAULT_ML_CONFIG["data"]["keys"]["predicted_label"]

LABEL_COL = DEFAULT_ML_CONFIG["data"]["keys"]["label_col"]
TEXT_COL = DEFAULT_ML_CONFIG["data"]["keys"]["text_col"]

# MAX_TITLE_LENGTH = DEFAULT_ML_CONFIG["data"]["preprocessing"]["max_title_length"]
# MAX_BODY_LENGTH = DEFAULT_ML_CONFIG["data"]["preprocessing"]["max_body_length"]
# PUNCTUATIONS = DEFAULT_ML_CONFIG["data"]["preprocessing"]["punctuations"]
# ISSUE_REGEX = DEFAULT_ML_CONFIG["data"]["preprocessing"]["issue_regex"]
# FUNCTION_REGEX = DEFAULT_ML_CONFIG["data"]["preprocessing"]["function_regex"]
# ASCII_REGEX = DEFAULT_ML_CONFIG["data"]["preprocessing"]["ascii_regex"]

EXPORT_ML_DATA_CSVS = DEFAULT_ML_CONFIG["data"]["export"]["export_ml_data_csvs"]
EXPORT_FINETUNED_MODEL = DEFAULT_ML_CONFIG["data"]["export"]["export_finetuned_model"]
# Spelled EXPORT_PREDICTIONS to match the name load_ml_config() declares as global and assigns.
EXPORT_PREDICTIONS = DEFAULT_ML_CONFIG["data"]["export"]["export_predictions"]
UPDATE_ISSUE_DATA = DEFAULT_ML_CONFIG["data"]["export"]["update_issue_data"]
UPDATE_GITHUB_LABELS = DEFAULT_ML_CONFIG["data"]["export"]["update_github_labels"]

def load_ml_config(config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, ML_CONFIG_FILE_NAME)):
    """
    Load the ML configuration and store it in the module-level settings.

    The TOML file at ``config_file`` is read if it exists; otherwise a warning
    is logged and ``DEFAULT_ML_CONFIG`` is used. Missing top-level sections,
    missing sub-sections of "data" and missing keys fall back to their
    ``DEFAULT_ML_CONFIG`` values.

    Args:
        config_file: Path of the TOML configuration file.
    """
    global USE_FINETUNED_MODEL, PRETRAINED_MODEL_FILE_NAME, FINETUNED_MODEL_FOLDER, FINETUNED_MODEL_FILE_NAME
    global EVALUATION, TEST, VALIDATION_SPLIT_PERCENTAGE, RANDOM_STATE
    global NUMBER, LABEL, TIME, REPO, TITLE, BODY, AUTHOR, URL, PREDICTED_LABEL, LABEL_COL, TEXT_COL
    # global MAX_TITLE_LENGTH, MAX_BODY_LENGTH, PUNCTUATIONS, ISSUE_REGEX, FUNCTION_REGEX, ASCII_REGEX
    global EXPORT_ML_DATA_CSVS, EXPORT_FINETUNED_MODEL, EXPORT_PREDICTIONS, UPDATE_ISSUE_DATA, UPDATE_GITHUB_LABELS

    if os.path.exists(config_file):
        with open(config_file, 'r', encoding="utf-8-sig") as file:
            loaded = toml.load(file)
        # Top-level sections missing in the file come from the defaults
        settings = {**DEFAULT_ML_CONFIG, **loaded}
    else:
        logging.warning(f"Config file {config_file} not found. Using default values.")
        settings = DEFAULT_ML_CONFIG

    def pick(section: dict, default_section: dict, key: str):
        """Return ``section[key]``, or the default value if the key is missing."""
        return section.get(key, default_section[key])

    # Model section
    model_defaults = DEFAULT_ML_CONFIG["model"]
    USE_FINETUNED_MODEL = pick(settings["model"], model_defaults, "use_finetuned_model")
    PRETRAINED_MODEL_FILE_NAME = pick(settings["model"], model_defaults, "pretrained_model_file_name")
    FINETUNED_MODEL_FOLDER = pick(settings["model"], model_defaults, "finetuned_model_folder")
    FINETUNED_MODEL_FILE_NAME = pick(settings["model"], model_defaults, "finetuned_model_file_name")

    # Mode section
    mode_defaults = DEFAULT_ML_CONFIG["mode"]
    EVALUATION = pick(settings["mode"], mode_defaults, "evaluation")
    TEST = pick(settings["mode"], mode_defaults, "test")
    VALIDATION_SPLIT_PERCENTAGE = pick(settings["mode"], mode_defaults, "validation_split_percentage")
    RANDOM_STATE = pick(settings["mode"], mode_defaults, "random_state")

    # Data section: sub-sections missing in the file come from the defaults
    data_settings = {**DEFAULT_ML_CONFIG["data"], **settings.get("data", {})}

    # Data keys
    key_settings = data_settings.get("keys", {})
    key_defaults = DEFAULT_ML_CONFIG["data"]["keys"]
    NUMBER = pick(key_settings, key_defaults, "number")
    LABEL = pick(key_settings, key_defaults, "label")
    TIME = pick(key_settings, key_defaults, "time")
    REPO = pick(key_settings, key_defaults, "repo")
    TITLE = pick(key_settings, key_defaults, "title")
    BODY = pick(key_settings, key_defaults, "body")
    AUTHOR = pick(key_settings, key_defaults, "author")
    URL = pick(key_settings, key_defaults, "url")
    PREDICTED_LABEL = pick(key_settings, key_defaults, "predicted_label")
    LABEL_COL = pick(key_settings, key_defaults, "label_col")
    TEXT_COL = pick(key_settings, key_defaults, "text_col")

    # # Data preprocessing
    # data_preprocessing = data_config.get("preprocessing", {})
    # MAX_TITLE_LENGTH = data_preprocessing.get("max_title_length", DEFAULT_ML_CONFIG["data"]["preprocessing"]["max_title_length"])
    # MAX_BODY_LENGTH = data_preprocessing.get("max_body_length", DEFAULT_ML_CONFIG["data"]["preprocessing"]["max_body_length"])
    # PUNCTUATIONS = data_preprocessing.get("punctuations", DEFAULT_ML_CONFIG["data"]["preprocessing"]["punctuations"])
    # ISSUE_REGEX = data_preprocessing.get("issue_regex", DEFAULT_ML_CONFIG["data"]["preprocessing"]["issue_regex"])
    # FUNCTION_REGEX = data_preprocessing.get("function_regex", DEFAULT_ML_CONFIG["data"]["preprocessing"]["function_regex"])
    # ASCII_REGEX = data_preprocessing.get("ascii_regex", DEFAULT_ML_CONFIG["data"]["preprocessing"]["ascii_regex"])

    # Data export
    export_settings = data_settings.get("export", {})
    export_defaults = DEFAULT_ML_CONFIG["data"]["export"]
    EXPORT_ML_DATA_CSVS = pick(export_settings, export_defaults, "export_ml_data_csvs")
    EXPORT_FINETUNED_MODEL = pick(export_settings, export_defaults, "export_finetuned_model")
    EXPORT_PREDICTIONS = pick(export_settings, export_defaults, "export_predictions")
    UPDATE_ISSUE_DATA = pick(export_settings, export_defaults, "update_issue_data")
    UPDATE_GITHUB_LABELS = pick(export_settings, export_defaults, "update_github_labels")

    logging.info("ML configuration successfully loaded.")

In [None]:
# Overwrite the module-level ML settings with the values from the ML config file (or the defaults).
load_ml_config()

In [None]:
# Column/key lists built from the configured data keys (after load_ml_config() has run);
# USEFUL_COLUMNS is NECESSARY_COLUMNS with the issue number key in front.
NECESSARY_COLUMNS = [LABEL, TIME, AUTHOR, REPO, TITLE, BODY, URL, PREDICTED_LABEL]
USEFUL_COLUMNS = [NUMBER] + NECESSARY_COLUMNS

# Replace the pattern strings by compiled regular expressions under the same names.
ISSUE_REGEX = re.compile(ISSUE_REGEX)
FUNCTION_REGEX = re.compile(FUNCTION_REGEX)
ASCII_REGEX = re.compile(ASCII_REGEX)

### Logging Setup

In [None]:
def setup_logging(
        default_path=os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LOGGING_CONFIG_FILE_NAME),
        default_level=LOGGING_LEVEL,
        env_key="LOG_CFG"):
    """
    Configure logging from a YAML file, falling back to a basic configuration.

    Args:
        default_path: Path of the YAML logging configuration file.
        default_level: Level for ``logging.basicConfig`` if no configuration file is found.
        env_key: Name of an environment variable; if it is set to a non-empty
            value, that value is used as the configuration path instead of ``default_path``.
    """
    # A path given through the environment variable wins over the default path
    config_path = os.getenv(env_key, None) or default_path

    if not os.path.exists(config_path):
        logging.basicConfig(level=default_level)
        logging.warning(f"Logging configuration file not found at {config_path}, using basic configuration.")
        return

    with open(config_path, "r", encoding="utf-8-sig") as file:
        config = yaml.safe_load(file.read())
    logging.config.dictConfig(config)

# Configure logging first and only then create the module-level `logger`, which the
# RepoDataHandler methods use for their log output.
# NOTE(review): `logger` is only defined when this code runs as "__main__" (as in a notebook);
# if the file is ever imported as a module, `logger` will be undefined - confirm this is acceptable.
if __name__ == "__main__":
    setup_logging()
    logger = logging.getLogger(LOGGER_NAME)
    # One test message per level to verify the logging configuration
    logger.debug("This is a test debug message")
    logger.info("This is a test info message")
    logger.warning("This is a test warning message")
    logger.error("This is a test error message")
    logger.critical("This is a test critical message")

### Try to load other configs

### Necessary checks for Config file
If both the GitHub data JSON file and the label lists are complete, there is no need for GITHUB_PAT, as a GitHub connection isn't needed (there should be a bool so one can still update those two, though).

## RepoDataHandler

In [None]:
class RepoDataHandler:

    def __init__(
        self,
        user_name: str = "",
        repository_name: str = "",
        token: str = GITHUB_PAT,
        github_api: Github = None,
        repo: Repository = None,
        repo_data_file_path: str = "",
        repo_data_file_name: str = "",
        # issues_data: dict = None, # dict version
        issues_data: list = None, # list version
        issues_last_fetch: datetime = None,
        # pull_requests_data: dict = None, # dict version
        pull_requests_data: list = None, # list version
        issue_pr_map: dict = None,
        issues_updated_since_last_pr_map: bool = False,
        prs_updated_since_last_pr_map: bool = False,
        bug_labels: list = None,
        enhancement_labels: list = None,
        support_labels: list = None,
        prediction_bug_label: str = PREDICTION_BUG_LABEL,
        prediction_enhancement_label: str = PREDICTION_ENHANCEMENT_LABEL,
        prediction_support_label: str = PREDICTION_SUPPORT_LABEL,
        predictions_df: pd.DataFrame = None
    ):
        """
        Set up the handler state and, if no repository is given, connect to GitHub.

        Args:
            user_name: Owner of the repository.
            repository_name: Name of the repository.
            token: GitHub access token used to create the API connection.
            github_api: Existing API connection (replaced if ``token`` is set).
            repo: Existing repository connection; if given, no new connection is made.
            repo_data_file_path: Full path of the repo data JSON file.
            repo_data_file_name: File name of the repo data JSON file inside the data folder.
            issues_data: Issue data; a new empty list if omitted.
            issues_last_fetch: Currently unused.
            pull_requests_data: Pull request data; a new empty list if omitted.
            issue_pr_map: Issue/PR mapping; a new empty dict if omitted.
            issues_updated_since_last_pr_map: Currently unused.
            prs_updated_since_last_pr_map: Currently unused.
            bug_labels: Label list for bugs; a new empty list if omitted.
            enhancement_labels: Label list for enhancements; a new empty list if omitted.
            support_labels: Label list for support; a new empty list if omitted.
            prediction_bug_label: Label name used for predicted bugs.
            prediction_enhancement_label: Label name used for predicted enhancements.
            prediction_support_label: Label name used for predicted support issues.
            predictions_df: Currently unused.

        Raises:
            Exception: From ``setup_github_api`` if neither ``repo``, ``token``
                nor ``github_api`` is provided.
        """
        # Local import: deepcopy is only needed here and is not imported at the top of the file
        from copy import deepcopy

        self.user_name: str = user_name
        self.repository_name: str = repository_name
        self.token: str = token
        self.github_api: Github = github_api
        self.repo: Repository = repo

        # Initialize the repo data file path and name
        default_file_name = f"{self.user_name}_{self.repository_name}_repo_data.json"
        self.repo_data_file_path: str = ""
        self.repo_data_file_name: str = ""
        if repo_data_file_path:
            self.repo_data_file_path = repo_data_file_path
        elif repo_data_file_name:
            # Only accept the file name if the file exists in the data folder
            file_path = os.path.join(PROJECT_FOLDER, DATA_FOLDER, repo_data_file_name)
            if os.path.exists(file_path):
                self.repo_data_file_path = file_path
        else:
            self.repo_data_file_path = os.path.join(PROJECT_FOLDER, DATA_FOLDER, default_file_name) # TODO: come up with better handling of this

        if repo_data_file_name:
            self.repo_data_file_name = repo_data_file_name
        elif self.repo_data_file_path:
            # Derive the name from the path only if that file exists; otherwise the name stays empty
            if os.path.exists(self.repo_data_file_path):
                self.repo_data_file_name = os.path.basename(self.repo_data_file_path)
        else:
            self.repo_data_file_name = default_file_name

        # Create fresh containers per instance. Mutable defaults in the signature would be
        # shared by all instances created without these arguments.
        self.issues_data: list = [] if issues_data is None else issues_data
        # self.issues_last_fetch: datetime = issues_last_fetch # TODO: implement or remove
        self.pull_requests_data: list = [] if pull_requests_data is None else pull_requests_data
        self.issue_pr_map: dict = {} if issue_pr_map is None else issue_pr_map
        # self.issues_updated_since_last_pr_map: bool = issues_updated_since_last_pr_map # TODO: implement or remove - also: Instead check length before and after fetching?
        # self.prs_updated_since_last_pr_map: bool = prs_updated_since_last_pr_map # TODO: implement or remove - also: Instead check length before and after fetching?
        self.bug_labels: list = [] if bug_labels is None else bug_labels
        self.enhancement_labels: list = [] if enhancement_labels is None else enhancement_labels
        self.support_labels: list = [] if support_labels is None else support_labels
        self.prediction_bug_label = prediction_bug_label
        self.prediction_enhancement_label = prediction_enhancement_label
        self.prediction_support_label = prediction_support_label
        self.results_df = None

        if not self.repo:
            self.setup_github_api()

        # In case a connection to the repository is provided, but not the user_name and repository_name
        if not (self.user_name and self.repository_name) and self.repo:
            self.user_name = self.repo.owner.login
            self.repository_name = self.repo.name

        # Availability flags per issue field, all initially False
        self.fully_available_issue_data = {
            "number": False,
            "title": False,
            "state": False,
            "created_at": False,
            "closed_at": False,
            "user": False,
            "comments": False,
            "description": False,
            "description_length": False,
            "labels": False,
            "predicted_label": False,
            "author_association": False
        }

        # Availability flags per pull request field (with nested flags for files and commits)
        self.fully_available_pr_data = {
            "number": False,
            "title": False,
            "state": False,
            "created_at": False,
            "merged_at": False,
            "closed_at": False,
            "user": False,
            "comments": False,
            "review_comments": False,
            "description": False,
            "description_length": False,
            "additions": False,
            "deletions": False,
            "changed_files": False,
            "files": False,
            "commits": False,
            "labels": False,
            "mergeable_state": False,
            "files_changed": {
                "filename": False,
                "additions": False,
                "deletions": False
            },
            "commit_data": {
                "sha": False,
                "author": False,
                "date": False,
                "message": False
            }
        }

        # Deep copies, so that the nested "files_changed"/"commit_data" dicts are not shared
        # between the "fully" and the "partially" available structures
        self.partially_available_issue_data = deepcopy(self.fully_available_issue_data)

        self.partially_available_pr_data = deepcopy(self.fully_available_pr_data)

    # def setup_github_api(self, token: str = None): # TODO: doesn't work right now due to self.token (if changed back, please remember to remove "self." before token)
    def setup_github_api(self):
        """
        Connect to the GitHub repository ``user_name/repository_name``.

        Does nothing but log if a repository connection already exists.
        If a token is set, a new API connection is created from it (replacing
        an existing one). Errors while fetching the repository are logged and
        not re-raised.

        Raises:
            Exception: If there is neither a token nor an API connection.
        """
        if not self.repo:
            if self.token:
                self.github_api = Github(self.token)
            if not self.github_api:
                raise Exception("No token for or connection to the Github API provided")
            try:
                self.repo = self.github_api.get_repo(f"{self.user_name}/{self.repository_name}")
                logger.info(f"Connected to {self.repo.full_name}")
            except Exception as e:
                logger.error(f"Error connecting to the repository {self.user_name}/{self.repository_name}: {e}")
        else:
            logger.info(f"Connected to {self.repo.full_name}")

    def load_repo_data_config(self, config_file: str = ""):
        """
        Determine the repo data file for this repository from the repo data config.

        The config is a TOML file with one table per ``"user_name/repository_name"``
        key, each holding a ``repo_data_file`` entry (a file name inside the data
        folder, or a full path). If the entry is found, ``repo_data_file_name``
        and ``repo_data_file_path`` are set from it.

        If the config file does not exist, the default data file
        ``<user_name>_<repository_name>_repo_data.json`` in the data folder is
        checked, and issues are fetched if that file does not exist either.
        If the config has no table for this repository, nothing is changed.

        Args:
            config_file: Path of the repo data config file; defaults to
                ``REPO_DATA_CONFIG_FILE_NAME`` in the config folder.

        Raises:
            KeyError: If the table for this repository has no ``repo_data_file`` entry.
        """
        # if self.issues_data:
        #     logger.info("Loading repo data config not necessary as issue data already given.")
        #     return

        if config_file:
            logger.debug(f"Trying to load repo data file configuration from {config_file}")
        else:
            config_file = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, REPO_DATA_CONFIG_FILE_NAME)
            logger.debug(f'No config file given. Trying to load config from "{config_file}"')

        repo_key = f"{self.user_name}/{self.repository_name}"
        # Same default file name as __init__ uses. repo_key must not be used for this,
        # because its "/" would turn the file name into a sub-folder path.
        default_data_file = os.path.join(
            PROJECT_FOLDER, DATA_FOLDER, f"{self.user_name}_{self.repository_name}_repo_data.json"
        )

        if not os.path.exists(config_file):
            logger.warning(f'Repo data config file "{config_file}" not found. Assuming default data file location "{default_data_file}".')
            if not os.path.exists(default_data_file):
                logger.warning("Repo data file doesn't exist either. Fetching issues...")
                self.fetch_issues()
            return

        with open(config_file, "r", encoding="utf-8-sig") as file:
            config_data = toml.load(file)

        # Find the configuration for the current user_name/repo_name
        if repo_key not in config_data:
            logger.warning(f'No configuration found for {repo_key} in the config file. Assuming default data file location "{default_data_file}".')
            return

        repo_data_file = config_data[repo_key]["repo_data_file"]
        logger.debug(f"Loaded repo data file configuration for {repo_key}: {repo_data_file}")

        if repo_data_file == os.path.basename(repo_data_file): # If it is just a file name, not full path
            self.repo_data_file_name = repo_data_file
            self.repo_data_file_path = os.path.join(PROJECT_FOLDER, DATA_FOLDER, self.repo_data_file_name)
        else:
            self.repo_data_file_name = os.path.basename(repo_data_file)
            self.repo_data_file_path = repo_data_file

    def load_data(self, repo_data_json_file: str = ""):
        """Load issues, pull requests and the issue/PR map from a JSON data file.

        When ``repo_data_json_file`` is empty the file is resolved from
        ``self.repo_data_file_path`` or, failing that, from
        ``self.repo_data_file_name`` inside the project data folder. If neither
        is set the method returns without loading anything. A missing file is
        only logged, not raised.
        """
        logger.debug(f"self.repo_data_file_name: {self.repo_data_file_name}")

        source_path = repo_data_json_file
        if not source_path:
            logger.debug("No file given, loading file from default file path")
            known_path = self.repo_data_file_path
            if known_path:
                logger.debug(f"Loading data from self.repo_data_file_path: {known_path}")
                source_path = known_path
            else:
                known_name = self.repo_data_file_name
                if not known_name:
                    logger.info("No info on data file given. Can't load any data.")
                    return
                logger.debug(f"Loading data from self.repo_data_file_name: {known_name}")
                source_path = os.path.join(PROJECT_FOLDER, DATA_FOLDER, known_name)

        logger.debug(f"Repo data json file: {source_path}")

        try:
            with open(source_path, encoding="utf-8-sig") as handle:
                payload = json.load(handle)
        except FileNotFoundError:
            logger.info("No existing data file found.")
        else:
            # Assigned one by one so that a missing key leaves earlier fields set.
            self.issues_data = payload["issues"]
            self.pull_requests_data = payload["pull_requests"]
            self.issue_pr_map = payload["issue_pr_map"]

    def _load_label_config(self, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Parse the label TOML file and return its whole content.

        :param config_file: Path of the TOML label configuration.
        :return: The parsed configuration (all repositories, not only the current one).
        :raises FileNotFoundError: If ``config_file`` does not exist.
        :raises ValueError: If the file has no section for ``user_name/repository_name``.
        """
        if os.path.exists(config_file):
            with open(config_file, "r", encoding="utf-8-sig") as handle:
                parsed = toml.load(handle)

            # The section for the current user/repo must be present
            repo_key = f"{self.user_name}/{self.repository_name}"
            if repo_key in parsed:
                return parsed
            raise ValueError(f"No configuration found for {repo_key} in the config file.")

        raise FileNotFoundError(f"Config file {config_file} not found.")
        
    def load_label_config(self, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Load both the label lists and the prediction labels of the current repository.

        Missing label lists default to an empty list; missing prediction labels
        default to the module-level ``PREDICTION_*_LABEL`` constants.
        """
        repo_key = f"{self.user_name}/{self.repository_name}"
        labels = self._load_label_config(config_file=config_file)[repo_key]
        logger.debug(f"Loaded label configuration for {repo_key}: {labels}")

        # Each TOML key is stored on the instance under the identical attribute name.
        settings = (
            ("bug_labels", []),
            ("enhancement_labels", []),
            ("support_labels", []),
            ("prediction_bug_label", PREDICTION_BUG_LABEL),
            ("prediction_enhancement_label", PREDICTION_ENHANCEMENT_LABEL),
            ("prediction_support_label", PREDICTION_SUPPORT_LABEL),
        )
        for key, fallback in settings:
            setattr(self, key, labels.get(key, fallback))

    def load_label_lists(self, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Load only the bug/enhancement/support label lists of the current repository.

        :param config_file: Path of the TOML label configuration to read.
        :raises FileNotFoundError: If ``config_file`` does not exist.
        :raises ValueError: If the file has no section for the current repository.
        """
        # Forward the caller's path; previously the argument was dropped and the
        # default configuration file was always read.
        config_data = self._load_label_config(config_file=config_file)

        repo_key = f"{self.user_name}/{self.repository_name}"
        labels = config_data[repo_key]
        logger.debug(f"Loaded label configuration for {repo_key}: {labels}")
        self.bug_labels = labels.get("bug_labels", [])
        self.enhancement_labels = labels.get("enhancement_labels", [])
        self.support_labels = labels.get("support_labels", [])
        
    def load_prediction_labels(self, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Load only the labels that are applied to issues for each predicted class.

        Keys missing from the repository section fall back to the module-level
        ``PREDICTION_*_LABEL`` constants.

        :param config_file: Path of the TOML label configuration to read.
        :raises FileNotFoundError: If ``config_file`` does not exist.
        :raises ValueError: If the file has no section for the current repository.
        """
        # Forward the caller's path; previously the argument was dropped and the
        # default configuration file was always read.
        config_data = self._load_label_config(config_file=config_file)

        repo_key = f"{self.user_name}/{self.repository_name}"
        labels = config_data[repo_key]
        logger.debug(f"Loaded label configuration for {repo_key}: {labels}")
        self.prediction_bug_label = labels.get("prediction_bug_label", PREDICTION_BUG_LABEL)
        self.prediction_enhancement_label = labels.get("prediction_enhancement_label", PREDICTION_ENHANCEMENT_LABEL)
        self.prediction_support_label = labels.get("prediction_support_label", PREDICTION_SUPPORT_LABEL)

    def append_label_config(self, new_data: dict, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Merge ``new_data`` into the label TOML file, creating the file if needed.

        The merge is a top-level ``dict.update``: a repository section present in
        ``new_data`` replaces the stored section of the same name as a whole.
        """
        # Start from the stored configuration when there is one, else from scratch
        merged = {}
        if os.path.exists(config_file):
            with open(config_file, "r", encoding="utf-8-sig") as source:
                merged = toml.load(source)

        merged.update(new_data)

        with open(config_file, "w", encoding="utf-8-sig") as target:
            toml.dump(merged, target)

        logger.info(f"Updated label configuration saved to {config_file}")

    def save_label_config(self, data: dict = None, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Overwrite the label TOML file with ``data``.

        :param data: Complete configuration to write; ``None`` writes an empty file.
        :param config_file: Path of the TOML file to (over)write.
        """
        if data is None:  # avoid a shared mutable default argument
            data = {}

        # Same encoding as the readers and append_label_config, so label names
        # with non-ASCII characters round-trip regardless of the platform default.
        with open(config_file, "w", encoding="utf-8-sig") as output_file:
            toml.dump(data, output_file)

        logger.info(f"Label configuration saved to {config_file}")
    
    def add_new_repo_to_label_config(self, config_file: str = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)):
        """Add a label section for the connected repository to the label config.

        Labels are sorted into bug/enhancement/support lists by a
        case-insensitive keyword search in their name and description. Without a
        connected repository a placeholder section ``user_name/repo_name`` with
        default labels is written instead.

        :param config_file: Path of the TOML label configuration to update.
        """
        if not self.repo:
            logger.warning("No repository connected. Adding default label configuration.")
            repo = "user_name/repo_name"
            bug_labels = [BUG, "defect"]
            enhancement_labels = [ENHANCEMENT, "feature"]
            support_labels = [SUPPORT, "help", "question"]
        else:
            repo = f"{self.user_name}/{self.repository_name}"
            # (name, description) pairs; a missing description becomes ""
            labels = [(label.name, label.description or "") for label in self.repo.get_labels()]

            def labels_matching(*keywords):
                # Lower-case both sides; the support filter used to compare the
                # raw SUPPORT constant against a lower-cased label name.
                needles = [keyword.lower() for keyword in keywords]
                return [
                    label_name for label_name, label_desc in labels
                    if any(
                        needle in label_name.lower() or needle in label_desc.lower()
                        for needle in needles
                    )
                ]

            bug_labels = labels_matching(BUG, "defect")
            enhancement_labels = labels_matching(ENHANCEMENT, "feature")
            support_labels = labels_matching(SUPPORT, "question", "help")

        # Use the key names the loaders read ("bug_labels", ...); a section
        # written under other keys would load as empty lists.
        new_repo_config = {
            repo: {
                "bug_labels": bug_labels,
                "enhancement_labels": enhancement_labels,
                "support_labels": support_labels
            }
        }

        # Add new configuration to the file or create it if it doesn't exist
        self.append_label_config(new_repo_config, config_file)

    # Save all data to a .json file
    def save_data(self, partial: bool = False, repo_data_file: str = ""):
        """Write issues, pull requests and the issue/PR map to a JSON file.

        :param partial: If True, ``repo_data_file`` is ignored and a timestamped
            ``*_partial_*.json`` file is written to the project data folder.
        :param repo_data_file: Target path. When empty, falls back to
            ``self.repo_data_file_path``, then to ``self.repo_data_file_name``
            inside the data folder, then to ``<user>_<repo>.json`` there.
        """
        default_dir = os.path.join(PROJECT_FOLDER, DATA_FOLDER)

        if partial:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            repo_data_file = os.path.join(default_dir, f"{self.user_name}_{self.repository_name}_partial_{timestamp}.json")
        elif not repo_data_file:
            if self.repo_data_file_path:
                logger.debug(f"Setting repo_data_file to self.repo_data_file_path: {self.repo_data_file_path} for saving")
                repo_data_file = self.repo_data_file_path
            elif self.repo_data_file_name:
                logger.debug(f"Setting repo_data_file to self.repo_data_file_name: {self.repo_data_file_name} for saving")
                # Resolve a bare file name against the data folder, as load_data
                # does; otherwise the file lands in the current working directory.
                repo_data_file = os.path.join(default_dir, self.repo_data_file_name)
            else: # if neither are given set to default value
                repo_data_file = os.path.join(default_dir, f"{self.user_name}_{self.repository_name}.json")
                logger.debug(f"Setting repo_data_file to default: {repo_data_file} for saving")

        # Create the directory of the file that is actually written (this
        # includes the partial-save target); exist_ok avoids a check-then-create race.
        dirname = os.path.dirname(repo_data_file)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        data = {
            "issues": self.issues_data,
            "pull_requests": self.pull_requests_data,
            "issue_pr_map": self.issue_pr_map
        }

        with open(repo_data_file, "w", encoding="utf-8-sig") as output_file:
            json.dump(data, output_file, indent=4)
        logger.info(f"Data saved to {repo_data_file}")

    # handle rate limit exceed exceptions
    def rate_limit_exceed_handling(self):
        """Save a partial snapshot, then block until the API rate limit resets.

        The wait is the time until ``self.github_api.rate_limiting_resettime``
        plus a 10 second buffer, never negative.
        """
        logger.info("Rate limit exceeded again. Saving partial data...")
        self.save_data(partial=True)
        logger.info("Partial data saved. Waiting until reset...")

        seconds_until_reset = self.github_api.rate_limiting_resettime - time.time() + 10  # 10 s safety buffer
        sleep_time = seconds_until_reset if seconds_until_reset > 0 else 0
        logger.info(f"Sleeping for {sleep_time:.0f} seconds")
        time.sleep(sleep_time)

        # The caller is expected to retry once this returns
        logger.info("\nRetrying...")

    # Map issues to PRs and save the data to issue_pr_map
    def map_issues_to_prs(self):
        """Record which pull requests reference which issues in ``self.issue_pr_map``.

        Every ``#<digits>`` reference in a PR's title or description links that
        PR to the issue. Keys are the issue numbers as strings (as matched by the
        regex), values are lists of PR numbers. Each PR is listed at most once
        per issue, so the method can be re-run on the same data.
        """
        # Accept both container shapes used for the stored data (list or dict of dicts)
        prs = self.pull_requests_data
        if isinstance(prs, dict):
            prs = prs.values()

        for pr in prs:
            referenced = []
            for text in (pr["title"], pr["description"]):
                if text:
                    referenced.extend(re.findall(r"#(\d+)", text))

            for issue_number in referenced:
                linked_prs = self.issue_pr_map.setdefault(issue_number, [])
                # The same issue may be mentioned in title and description, or
                # the mapping may already exist from an earlier run.
                if pr["number"] not in linked_prs:
                    linked_prs.append(pr["number"])

    def check_rate_limit(self):
        """Return the current API rate-limit status.

        The status is requested from the GitHub client (``self.github_api``);
        in PyGithub ``get_rate_limit`` is a method of the client, not of the
        repository object.
        """
        return self.github_api.get_rate_limit()

    # Fetch all issues
    def fetch_issues(self, max_retries: int = 3):
        """Fetch all issues that are newer than the newest stored one.

        Issues are requested in creation order and every issue whose number is
        not greater than the highest stored number is skipped. On a rate-limit
        error the method waits for the reset and retries; on any other error it
        saves a partial snapshot and retries at most ``max_retries`` times.

        :param max_retries: Remaining retries after unexpected (non rate-limit) errors.
        :raises TypeError: If ``self.issues_data`` is neither a dict nor a list.
        """
        latest_issue = 0
        if len(self.issues_data) < 1:
            pass # No issue data found
        elif isinstance(self.issues_data, dict): # Dict of dicts
            latest_issue = max(issue[NUMBER] for issue in self.issues_data.values())
        elif isinstance(self.issues_data, list): # List of dicts
            latest_issue = max(issue[NUMBER] for issue in self.issues_data)
        else:
            raise TypeError("self.issues_data must be either a dict or a list.")

        try:
            issues = self.repo.get_issues(state="all", sort="created", direction="asc")
            # totalCount avoids materialising every page just to log a number
            logger.info(f"Amount of issues to fetch: {issues.totalCount}")
        except RateLimitExceededException as e:
            logger.info(f"\n{e}\n")
            self.rate_limit_exceed_handling()
            # `issues` is not bound here, so start over instead of falling through
            self.fetch_issues(max_retries=max_retries)
            return

        try:
            logger.debug(len(self.issues_data))
            for i, issue in enumerate(issues):
                # `<=`: the newest stored issue must be skipped as well, otherwise
                # it is stored again on every run
                if issue.number <= latest_issue:
                    continue

                issue_record = {
                    NUMBER: issue.number, # redundant for dict version
                    TITLE: issue.title,
                    "state": issue.state, # calculateable, but still required as the logic to derive it from closed_at isn't implemented
                    "created_at": issue.created_at.isoformat(),
                    "closed_at": issue.closed_at.isoformat() if issue.closed_at else None,
                    "user": issue.user.login, # optional
                    "comments": issue.comments, # optional
                    BODY: issue.body,
                    "description_length": len(issue.body) if issue.body else 0, # calculateable
                    LABEL: [label.name for label in issue.labels],
                    PREDICTED_LABEL: None,
                    AUTHOR: issue.raw_data.get("author_association", "NONE")
                }
                if isinstance(self.issues_data, dict):
                    self.issues_data[issue.number] = issue_record
                else:
                    self.issues_data.append(issue_record)

                if i % 100 == 0:
                    logger.debug(i)
            self.save_data(False)
        except RateLimitExceededException as e:
            logger.debug(f"\n{e}\n")
            # rate_limit_exceed_handling saves the partial snapshot itself
            self.rate_limit_exceed_handling()
            self.fetch_issues(max_retries=max_retries)
        except Exception as e:
            self.save_data(True)
            logger.error(f"\n{e}\n")
            if max_retries <= 0:
                # A persistent error would otherwise recurse until RecursionError
                raise
            self.fetch_issues(max_retries=max_retries - 1)

    # Fetch all pull requests
    def fetch_pull_requests(self, max_retries: int = 3):
        """Fetch all pull requests that are newer than the newest stored one.

        Pull requests are requested in creation order, together with their
        changed files and commits. Every PR whose number is not greater than the
        highest stored number is skipped. On a rate-limit error the method waits
        for the reset and retries; on any other error it saves a partial
        snapshot and retries at most ``max_retries`` times.

        :param max_retries: Remaining retries after unexpected (non rate-limit) errors.
        :raises TypeError: If ``self.pull_requests_data`` is neither a dict nor a list.
        """
        latest_pr = 0
        if len(self.pull_requests_data) < 1:
            pass # No pull request data found
        elif isinstance(self.pull_requests_data, dict): # Dict of dicts
            latest_pr = max(pr[NUMBER] for pr in self.pull_requests_data.values())
        elif isinstance(self.pull_requests_data, list): # List of dicts
            latest_pr = max(pr[NUMBER] for pr in self.pull_requests_data)
        else:
            raise TypeError("self.pull_requests_data must be either a dict or a list.")

        try:
            prs = self.repo.get_pulls(state="all", sort="created", direction="asc")
        except RateLimitExceededException as e:
            logger.info(f"\n{e}\n")
            self.rate_limit_exceed_handling()
            # `prs` is not bound here, so start over instead of falling through
            self.fetch_pull_requests(max_retries=max_retries)
            return

        try:
            for pr in prs:
                # Compare PR numbers, not the enumeration index: the index of a
                # PR in the result list is unrelated to its number.
                if pr.number <= latest_pr:
                    continue

                files_changed = [{
                    "filename": pr_file.filename,
                    "additions": pr_file.additions,
                    "deletions": pr_file.deletions
                } for pr_file in pr.get_files()]

                commit_data = [{
                    "sha": commit.sha, # optional
                    "author": commit.commit.author.name, # optional
                    "date": commit.commit.author.date.isoformat(),
                    "message": commit.commit.message
                } for commit in pr.get_commits()]

                pr_record = {
                    "number": pr.number, # redundant for dict version
                    "title": pr.title,
                    "state": pr.state, # calculateable, but still required as the logic to derive it from closed_at isn't implemented
                    "created_at": pr.created_at.isoformat(),
                    "merged_at": pr.merged_at.isoformat() if pr.merged_at else None,
                    "closed_at": pr.closed_at.isoformat() if pr.closed_at else None,
                    "user": pr.user.login, # optional
                    "comments": pr.comments, # optional
                    "review_comments": pr.review_comments, # optional
                    "description": pr.body,
                    "description_length": len(pr.body) if pr.body else 0, # calculateable
                    "additions": pr.additions,
                    "deletions": pr.deletions,
                    "changed_files": pr.changed_files,
                    "files": files_changed,
                    "commits": commit_data,
                    "labels": [label.name for label in pr.labels],
                    "mergeable_state": pr.mergeable_state
                }
                if isinstance(self.pull_requests_data, dict):
                    self.pull_requests_data[pr.number] = pr_record
                else:
                    self.pull_requests_data.append(pr_record)
            self.save_data(False)
        except RateLimitExceededException as e:
            logger.debug(f"\n{e}\n")
            # rate_limit_exceed_handling saves the partial snapshot itself
            self.rate_limit_exceed_handling()
            self.fetch_pull_requests(max_retries=max_retries)
        except Exception as e:
            logger.error(f"\n{e}\n")
            self.save_data(True)
            if max_retries <= 0:
                # A persistent error would otherwise recurse until RecursionError
                raise
            self.fetch_pull_requests(max_retries=max_retries - 1)

    def issue_exists(self, issue_number: int) -> bool:
        """Return True if an issue with ``issue_number`` is present in the stored data.

        Works for both container shapes: a list of issue dicts (what
        ``fetch_issues`` appends to) and a dict keyed by issue number. Dict keys
        are also tried as strings because JSON turns integer keys into strings.
        """
        issues = self.issues_data
        if isinstance(issues, dict):
            return issue_number in issues or str(issue_number) in issues
        return any(issue[NUMBER] == issue_number for issue in issues)

    def pr_exists(self, pr_number: int) -> bool:
        """Return True if a pull request with ``pr_number`` is present in the stored data.

        Works for both container shapes: a list of PR dicts (what
        ``fetch_pull_requests`` appends to) and a dict keyed by PR number. Dict
        keys are also tried as strings because JSON turns integer keys into strings.
        """
        prs = self.pull_requests_data
        if isinstance(prs, dict):
            return pr_number in prs or str(pr_number) in prs
        return any(pr[NUMBER] == pr_number for pr in prs)

    def get_issue(self, issue_number: int) -> dict:
        """Return the stored record of the issue with number ``issue_number``.

        Looks the issue up by number in both container shapes (list of dicts or
        dict keyed by number, with integer or string keys) instead of treating
        the number as a list position.

        :raises KeyError: If no issue with that number is stored.
        """
        issues = self.issues_data
        if isinstance(issues, dict):
            for key in (issue_number, str(issue_number)):
                if key in issues:
                    return issues[key]
        else:
            for issue in issues:
                if issue[NUMBER] == issue_number:
                    return issue
        raise KeyError(issue_number)
    
    def get_pr(self, pr_number: int) -> dict:
        """Return the stored record of the pull request with number ``pr_number``.

        Looks the PR up by number in both container shapes (list of dicts or
        dict keyed by number, with integer or string keys) instead of treating
        the number as a list position.

        :raises KeyError: If no pull request with that number is stored.
        """
        prs = self.pull_requests_data
        if isinstance(prs, dict):
            for key in (pr_number, str(pr_number)):
                if key in prs:
                    return prs[key]
        else:
            for pr in prs:
                if pr[NUMBER] == pr_number:
                    return pr
        raise KeyError(pr_number)
    
    def get_issues(self) -> dict:
        """Return the stored issue container itself (not a copy)."""
        stored_issues = self.issues_data
        return stored_issues
    
    def get_prs(self) -> dict:
        """Return the stored pull request container itself (not a copy)."""
        stored_prs = self.pull_requests_data
        return stored_prs

    def load_results_dataframe(self, df=None, csv_path=None):
        """Store a results DataFrame on the instance, from memory or from a CSV file.

        ``df`` takes precedence over ``csv_path`` when both are given.

        :raises ValueError: If neither ``df`` nor ``csv_path`` is provided.
        """
        if df is None and not csv_path:
            raise ValueError("Either `csv_path` or `df` must be provided.")

        if df is not None:
            logger.info("Loading provided DataFrame as results.")
            self.results_df = df
            return

        logger.info(f"Loading results DataFrame from CSV: {csv_path}")
        self.results_df = pd.read_csv(csv_path)

    def update_issue(self, issue_number: int = None, predicted_label: str = ""):
        """Add the label configured for ``predicted_label`` to an issue on GitHub.

        The predicted class (BUG / ENHANCEMENT / SUPPORT) is translated to the
        configured prediction label; any other value becomes ``"Unknown"``.
        Existing labels of the issue are kept. Errors are logged, not raised.

        :param issue_number: Number of the issue to update.
        :param predicted_label: Predicted class of the issue.
        """
        try:
            issue = self.repo.get_issue(number=issue_number)
            current_label_names = [label.name for label in issue.get_labels()]

            # TODO: TEST THIS - START
            if predicted_label == BUG:
                predicted_label = self.prediction_bug_label
            elif predicted_label == ENHANCEMENT:
                predicted_label = self.prediction_enhancement_label
            elif predicted_label == SUPPORT:
                predicted_label = self.prediction_support_label
            else:
                predicted_label = "Unknown"
            # TODO: TEST THIS - END

            # Combine old and new labels without duplicates, keeping their order
            new_labels = list(dict.fromkeys(current_label_names + [predicted_label]))
        except Exception as e:
            logger.error(f"Error when trying to fetch and add predicted label {predicted_label} for issue {issue_number}: {e}")
            # Without the issue and its labels there is nothing to set; going on
            # would only fail on the unbound names and log a misleading error.
            return

        # Set the combined labels
        try:
            issue.set_labels(*new_labels)
        except Exception as e:
            logger.error(f"Error when trying to update labels of issue {issue_number} with label {predicted_label}: {e}")
            
    def update_issues(self, df=None):
        """Apply the predicted label of every row of the results DataFrame on GitHub.

        :param df: Optional DataFrame with predictions. When given it replaces
            ``self.results_df``, matching ``save_predictions_to_issue_data``.
        :raises PermissionError: If the authenticated user has no push permission.
        :raises RuntimeError: If the repository permissions cannot be read.
        :raises ValueError: If no DataFrame is available or a needed column is missing.
        """
        try:
            permissions = self.repo.permissions
            if not permissions.push:
                raise PermissionError(
                    "The authenticated user does not have write permissions for this repository. Can't update issues."
                )
        except AttributeError as err:
            raise RuntimeError("Unable to verify repository permissions. Check the connection to GitHub.") from err

        # An explicitly passed DataFrame wins over a previously stored one;
        # before, it was silently ignored whenever results_df was already set.
        if df is not None:
            self.results_df = df

        if self.results_df is None:
            raise ValueError("No dataframe containing predictions found. Can't update issues.")

        needed_columns = [NUMBER, PREDICTED_LABEL]

        for col in needed_columns:
            if col not in self.results_df.columns:
                raise ValueError(f"Results dataframe doesn't contain column {col}. Can't update issues.")

        results_df = self.results_df[needed_columns]

        for _, row in results_df.iterrows():
            self.update_issue(issue_number = row[NUMBER], predicted_label = row[PREDICTED_LABEL])

    def save_predictions_to_issue_data(self, df=None):
        """Copy predicted labels from the results DataFrame into the stored issues.

        For every stored issue whose number occurs in the DataFrame, the
        predicted label of the first matching row is written to the issue
        record. The data is then saved as a partial snapshot.

        :param df: Optional DataFrame with predictions; replaces ``self.results_df``.
        :raises ValueError: If no DataFrame is available or a needed column is missing.
        """
        if df is not None:
            self.results_df = df

        if self.results_df is None:
            raise ValueError("No dataframe containing predictions found. Can't update issues.")

        needed_columns = [NUMBER, PREDICTED_LABEL]

        for col in needed_columns:
            if col not in self.results_df.columns:
                raise ValueError(f"Results dataframe doesn't contain column {col}. Can't update issues.")

        # One number -> label lookup instead of filtering the frame once per
        # issue; keep="first" preserves the "first matching row wins" rule.
        predictions = (
            self.results_df[needed_columns]
            .drop_duplicates(subset=NUMBER, keep="first")
            .set_index(NUMBER)[PREDICTED_LABEL]
            .to_dict()
        )

        # Stored issues may be a list of dicts or a dict of dicts
        issues = self.issues_data.values() if isinstance(self.issues_data, dict) else self.issues_data
        for issue in issues:
            issue_number = issue[NUMBER]
            if issue_number in predictions:
                issue[PREDICTED_LABEL] = predictions[issue_number]

        self.save_data(partial=True) # TODO: remove "partial=True" once testing is done!

    def save_predictions_to_issue_data_and_update_issues(self, df=None): # NOTE(review): an earlier TODO reported a list-vs-dict bug here; both helpers now work on a list of issues - confirm and drop
        """Push the predicted labels to GitHub, then store them in the local issue data.

        ``update_issues`` runs first; if it raises (missing push permission,
        missing DataFrame or column), the predictions are not written to the
        local data.

        :param df: Optional DataFrame with predictions, passed on to both steps.
        """
        # TODO: check in which order the 2 lines below should get updated
        self.update_issues(df=df)
        self.save_predictions_to_issue_data(df=df)

    
    def check_full_availability(self, template_dict, data_dict):
        """Report per template key whether EVERY record provides a non-None value.

        :param template_dict: Keys to check. A dict value marks a nested
            structure whose own keys are checked recursively.
        :param data_dict: Iterable of record dicts; non-dict entries are ignored.
        :return: Dict mapping each template key to a bool. A nested key is True
            only if all of its nested keys are fully available in every record.
            With no records every key is (vacuously) True.
        """
        def nested_records(container):
            # The recursion expects a list of dicts: wrap a single nested dict,
            # pass lists through, and treat anything else as a record without keys.
            if isinstance(container, dict):
                return [container]
            if isinstance(container, list):
                return container
            return [{}]

        records = [item for item in data_dict if isinstance(item, dict)]
        result = {}
        for key, value in template_dict.items():
            if isinstance(value, dict):
                # Reduce the nested result with all(); the result dict itself is
                # truthy whenever it is non-empty and must not be used as a bool.
                result[key] = all(
                    all(self.check_full_availability(value, nested_records(item.get(key))).values())
                    for item in records
                )
            else:
                result[key] = all(
                    key in item and item[key] is not None
                    for item in records
                )
        return result

    def check_partial_availability(self, template_dict, data_dict):
        """Report per template key whether AT LEAST ONE record provides a non-None value.

        :param template_dict: Keys to check. A dict value marks a nested
            structure whose own keys are checked recursively.
        :param data_dict: Iterable of record dicts; non-dict entries are ignored.
        :return: Dict mapping each template key to a bool. A nested key is True
            if any of its nested keys is available in any record. With no
            records every key is False.
        """
        def nested_records(container):
            # The recursion expects a list of dicts: wrap a single nested dict,
            # pass lists through, and treat anything else as a record without keys.
            if isinstance(container, dict):
                return [container]
            if isinstance(container, list):
                return container
            return [{}]

        records = [item for item in data_dict if isinstance(item, dict)]
        result = {}
        for key, value in template_dict.items():
            if isinstance(value, dict):
                # Reduce the nested result with any(); the result dict itself is
                # truthy whenever it is non-empty and must not be used as a bool.
                result[key] = any(
                    any(self.check_partial_availability(value, nested_records(item.get(key))).values())
                    for item in records
                )
            else:
                result[key] = any(
                    key in item and item[key] is not None
                    for item in records
                )
        return result
    
    def check_availability(self):
        """Run the full and partial availability checks for issues and pull requests.

        Each availability attribute is replaced by the result of its check,
        which is called with the attribute's current value as the template. A
        check is skipped while the corresponding data container is empty.
        Finally ``availability_checked`` is set to True.
        """
        # (attribute to refresh, check to run, attribute holding the data)
        checks = (
            ("fully_available_issue_data", self.check_full_availability, "issues_data"),
            ("fully_available_pr_data", self.check_full_availability, "pull_requests_data"),
            ("partially_available_issue_data", self.check_partial_availability, "issues_data"),
            ("partially_available_pr_data", self.check_partial_availability, "pull_requests_data"),
        )
        for target, check, source in checks:
            records = getattr(self, source)
            if records:
                setattr(self, target, check(getattr(self, target), records))

        self.availability_checked = True

    def check_function_executability(self, function_requirements):
        """Decide for each function whether its data requirements are met.

        :param function_requirements: Mapping ``function -> {data_type -> {key ->
            "full" | "partial"}}`` where ``data_type`` is ``ISSUES`` or ``PRS``.
            ``"full"`` requires the key to be fully available; ``"partial"``
            accepts full or partial availability. Other values are not checked.
        :return: Dict mapping each function to ``"executable"`` or ``"not_executable"``.
        :raises KeyError: If a ``data_type`` is neither ``ISSUES`` nor ``PRS``.
        """
        # `self` was missing from the signature although the body relies on it,
        # so the method could not be called on an instance.
        if not self.availability_checked:
            self.check_availability()

        executable_functions = {}

        data_availability_dicts = {
            ISSUES: {
                "partially_available": self.partially_available_issue_data,
                "fully_available": self.fully_available_issue_data
            },
            PRS: {
                "partially_available": self.partially_available_pr_data,
                "fully_available": self.fully_available_pr_data
            }
        }

        for func, data_requirements in function_requirements.items():
            is_executable = True  # Assume executable unless a requirement fails

            for data_type, requirements in data_requirements.items():
                fully_available = data_availability_dicts[data_type]["fully_available"]
                partially_available = data_availability_dicts[data_type]["partially_available"]

                for key, required_availability in requirements.items():
                    if required_availability == "full":
                        if not fully_available.get(key, False):
                            is_executable = False
                            break
                    elif required_availability == "partial":
                        if not (fully_available.get(key, False) or partially_available.get(key, False)):
                            is_executable = False
                            break

                if not is_executable:
                    break  # Stop checking other data types if one fails

            executable_functions[func] = "executable" if is_executable else "not_executable"

        return executable_functions
    
    def get_fully_available_issue_data(self):
        """Return the full-availability record kept for issue data."""
        issue_availability = self.fully_available_issue_data
        return issue_availability
    
    def get_fully_available_pr_data(self):
        """Return the full-availability record kept for pull request data."""
        pr_availability = self.fully_available_pr_data
        return pr_availability
    
    def get_partially_available_issue_data(self):
        """Return the partial-availability record kept for issue data."""
        issue_availability = self.partially_available_issue_data
        return issue_availability
    
    def get_partially_available_pr_data(self):
        """Return the partial-availability record kept for pull request data."""
        pr_availability = self.partially_available_pr_data
        return pr_availability
    
    def get_label_dict(self):
        """Return the repository label lists keyed by class (BUG, ENHANCEMENT, SUPPORT)."""
        label_lists = {}
        label_lists[BUG] = self.bug_labels
        label_lists[ENHANCEMENT] = self.enhancement_labels
        label_lists[SUPPORT] = self.support_labels
        return label_lists

    def get_prediction_labels(self):
        """Return the label applied for each predicted class (BUG, ENHANCEMENT, SUPPORT)."""
        prediction_labels = {}
        prediction_labels[BUG] = self.prediction_bug_label
        prediction_labels[ENHANCEMENT] = self.prediction_enhancement_label
        prediction_labels[SUPPORT] = self.prediction_support_label
        return prediction_labels

### RepoHandler Test

In [None]:
# USER_NAME = "vaadin"
# REPO_NAME = "flow"
# repoHandler = RepoDataHandler(user_name=USER_NAME, repository_name=REPO_NAME, token=GITHUB_PAT)
# repoHandler.load_repo_data_config()
# repoHandler.load_data()
# if FETCH_NEW_ISSUES: repoHandler.fetch_issues()

In [None]:
# import sys
# sys.exit(0)

In [None]:
# class TestRepoDataHandler(unittest.TestCase):

#     def setUp(self):
#         # Initialize the RepoDataHandler with mocked data
#         self.handler = RepoDataHandler(
#             user_name="grafana",
#             repository_name="grafana"
#         )
#         self.mock_repo = MagicMock()
#         self.handler.repo = self.mock_repo
#         self.handler.github_api = MagicMock()

#         # Set the path for the actual JSON file
#         self.repo_data_json_file = os.path.join(PROJECT_FOLDER, DATA_FOLDER, "grafana_grafana_repo_data.json")

#     def test_load_data_from_file(self):
#         # Load the actual data from the specified file
#         with open(self.repo_data_json_file, "r", encoding="utf-8-sig") as input_file:
#             data = json.load(input_file)
#             self.handler.issues_data = data["issues"]
#             self.handler.pull_requests_data = data["pull_requests"]
#             self.handler.issue_pr_map = data["issue_pr_map"]

#         # Assert that data is loaded correctly (use appropriate checks)
#         self.assertIn("issues", data)
#         self.assertIn("pull_requests", data)
#         self.assertIn("issue_pr_map", data)
#         self.assertGreater(len(self.handler.issues_data), 0)
#         self.assertGreater(len(self.handler.pull_requests_data), 0)
#         self.assertGreater(len(self.handler.issue_pr_map), 0)

#     def test_check_availability(self):
#         # Assuming data is loaded, we now check full and partial availability
#         self.test_load_data_from_file()

#         # Run the availability check
#         self.handler.check_availability()

#         # Check that the availability structures have been updated
#         for value in self.handler.fully_available_issue_data.values():
#             self.assertIsInstance(value, bool)

#         for value in self.handler.fully_available_pr_data.values():
#             self.assertIsInstance(value, bool)

#         # Check that the availability structures have been updated
#         for value in self.handler.partially_available_issue_data.values():
#             self.assertIsInstance(value, bool)

#         for value in self.handler.partially_available_pr_data.values():
#             self.assertIsInstance(value, bool)

#     def test_load_toml_config(self):
#         # Path to TOML config file (mocked in the test)
#         # config_file = os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, LABEL_CONFIG_FILE_NAME)

#         # Mock TOML data that simulates the contents of the file
#         toml_data = """
#         ["vaadin/flow"]
#         bug_labels = ["bug", "BFP"]
#         enhancement_labels = ["enhancement", "feature request"]
#         support_labels = ["documentation", "question", "help wanted"]

#         ["grafana/grafana"]
#         bug_labels = ["type/bug"]
#         enhancement_labels = ["type/feature-request", "kind/enhancement", "discussion/consideration"]
#         support_labels = ["type/docs", "type/question", "bot/question"]
#         """

#         # Mock open to simulate reading the TOML file
#         with patch("builtins.open", unittest.mock.mock_open(read_data=toml_data)):
#             self.handler.load_label_config()

#         logger.debug(self.handler.bug_labels)
#         logger.debug(self.handler.enhancement_labels)
#         logger.debug(self.handler.support_labels)

#         # Assertions to verify that the label configuration is loaded correctly
#         self.assertEqual(self.handler.bug_labels, ["type/bug"])
#         self.assertEqual(self.handler.enhancement_labels, ["type/feature-request", "kind/enhancement", "discussion/consideration"])
#         self.assertEqual(self.handler.support_labels, ["type/docs", "type/question", "bot/question"])


# # Run the tests directly in the Jupyter notebook
# unittest.TextTestRunner().run(unittest.defaultTestLoader.loadTestsFromTestCase(TestRepoDataHandler))

## CatIss

In [None]:
class DataProcessor:
    """
    Prepares raw issue data for the label classifier.

    The processor builds a DataFrame from the raw issue records, maps the labels of every issue to one
    label category, separates issues without a known category (test set) from the categorized ones
    (train/eval sets), cleans the text columns and derives the model input columns TEXT_COL and LABEL_COL.
    """

    def __init__(self, 
                 data: dict = None, 
                 label_names: dict = None, 
                 user_name: str = None,
                 repository_name: str = None,
                 evaluation_mode: bool = EVALUATION, 
                 test_mode: bool = TEST, 
                 validation_split_percentage: float = VALIDATION_SPLIT_PERCENTAGE, 
                 random_state: int = RANDOM_STATE):
        """
        Initializes the DataProcessor for handling preprocessing tasks.

        :param data: Raw features and labels for each example (anything pd.DataFrame accepts). Defaults to an empty dict.
        :param label_names: Dictionary mapping label types to lists of labels (e.g., {"bug": [...], "enhancement": [...], "support": [...]}). Defaults to an empty dict.
        :param user_name: GitHub user name (for constructing URLs and CSV file names).
        :param repository_name: GitHub repository name (for constructing URLs and CSV file names).
        :param evaluation_mode: If True, data will be split into training and evaluation sets.
        :param test_mode: If True, rows without a known label category and without a prediction are kept as test set.
        :param validation_split_percentage: Share of the labelled data used for evaluation (passed as test_size to train_test_split).
        :param random_state: Seed for reproducibility of data splitting.
        """
        # Create fresh containers per instance; a `{}` default in the signature would be shared between instances.
        self.data = {} if data is None else data
        self.label_names = {} if label_names is None else label_names
        self.evaluation_mode = evaluation_mode
        self.test_mode = test_mode
        self.validation_split_percentage = validation_split_percentage
        self.random_state = random_state
        self.user_name = user_name
        self.repository_name = repository_name

        # Results of the pipeline; populated by process_data() or load_data().
        self.df = None
        self.train = None
        self.eval = None
        self.test = None
        self.dedup_train = None
        self.label_mapping_dict = {}

        self.train_csv = self._csv_path("train")
        self.eval_csv = self._csv_path("eval")
        self.test_csv = self._csv_path("test")

    def _csv_path(self, split_name):
        """
        Builds the CSV path for one data split ("train", "eval" or "test").
        """
        file_name = f"{self.user_name}_{self.repository_name}_{split_name}_clean_concat_{MAX_TITLE_LENGTH + MAX_BODY_LENGTH}.csv"
        return os.path.join(PROJECT_FOLDER, DATA_FOLDER, file_name)

    def _active_frames(self):
        """
        Returns the DataFrames the cleaning steps operate on: always the deduplicated training data,
        plus the evaluation data if evaluation_mode is set and the test data if test_mode is set.
        """
        frames = [self.dedup_train]
        if self.evaluation_mode:
            frames.append(self.eval)
        if self.test_mode:
            frames.append(self.test)
        return frames

    def _update_columns(self, columns, transform):
        """
        Replaces each of the given columns in every active DataFrame by transform(column).

        :param columns: Iterable of column names.
        :param transform: Callable that receives a Series and returns the replacement Series.
        """
        for frame in self._active_frames():
            for column in columns:
                frame[column] = transform(frame[column])

    def _log_label_counts(self):
        """
        Logs the LABEL value counts of every DataFrame that currently exists.
        """
        for frame in (self.dedup_train, self.eval, self.test):
            if frame is not None:
                logger.debug(frame[LABEL].value_counts())

    @staticmethod
    def _read_csv_or_raise(path, description):
        """
        Reads a CSV file and raises FileNotFoundError with a descriptive message if it does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"{description} data file not found: {path}")
        return pd.read_csv(path)

    def filter_and_select_label(self, label_list):
        """
        Filters and selects the first valid label from the provided list.

        :param label_list: List of labels for a specific issue or pull request.
        :return: The category (key of label_names, e.g., "bug", "enhancement") of the first label that belongs to any category, or None if no label matches.
        """
        # return None if predicted_label isn't None? - NOT HERE
        for label in label_list:
            for category, category_labels in self.label_names.items():
                if label in category_labels:
                    return category
        return None

    def create_df(self):
        """
        Creates a DataFrame from the raw data and applies necessary filtering and transformations.
        - Adds the GitHub repository URL (REPO) and the issue URL (URL, built from the "number" column).
        - Verifies that NECESSARY_COLUMNS are present and keeps only USEFUL_COLUMNS.
        - Replaces the labels with the category of the first valid label from the list of labels.
        - Adjusts the time format by adding the "Z" suffix.

        :raises ValueError: If one of NECESSARY_COLUMNS is missing.
        """
        self.df = pd.DataFrame(self.data)
        logger.debug(f"self.df.columns: {self.df.columns}")

        repo_url = f"https://api.github.com/repos/{self.user_name}/{self.repository_name}"
        self.df[REPO] = repo_url
        logger.debug(f"self.df.columns: {self.df.columns}")

        self.df[URL] = [f"{repo_url}/issues/{issue_number}" for issue_number in self.df["number"]]

        # Ensure necessary columns are present in the data
        if not all(col in self.df.columns for col in NECESSARY_COLUMNS):
            logger.debug(f"self.df.columns: {self.df.columns}")
            raise ValueError(f"Missing necessary columns. Required: {NECESSARY_COLUMNS} ({REPO} is calculated from user_name and repository_name and {URL} from issue number during runtime).")

        # Copy the column selection so the assignments below modify an independent frame, not a slice.
        self.df = self.df[USEFUL_COLUMNS].copy()
        self.df[LABEL] = self.df[LABEL].apply(self.filter_and_select_label)
        self.df[TIME] = self.df[TIME] + "Z"

    def filter_df(self):
        """
        Separates rows with a known label category from the remaining rows.
        - In test mode, rows without a known category and without an existing prediction become the test set.
        - Afterwards self.df only contains rows whose label is one of BUG, ENHANCEMENT or SUPPORT.
        """
        # Labels have already been mapped to their categories in create_df, so filter by category name.
        all_labels_to_filter = [BUG, ENHANCEMENT, SUPPORT]
        logger.debug(f"all labels to filter: {all_labels_to_filter}")

        logger.debug("self.df[LABEL].value_counts():")
        logger.debug(self.df[LABEL].value_counts())
        logger.debug(f"self.df[{PREDICTED_LABEL}].value_counts(dropna=False):")
        logger.debug(self.df[PREDICTED_LABEL].value_counts(dropna=False))

        has_known_label = self.df[LABEL].isin(all_labels_to_filter)

        if self.test_mode:
            unlabelled = self.df[~has_known_label]
            logger.debug("Value counts for test data frame BEFORE removing all the ones that already have a prediction:")
            logger.debug(unlabelled[PREDICTED_LABEL].value_counts(dropna=False))
            # Issues that already carry a prediction do not need to be predicted again.
            self.test = unlabelled[unlabelled[PREDICTED_LABEL].isnull()].copy()
            logger.debug("Value counts for test data frame AFTER removing all the ones that already have a prediction:")
            logger.debug(self.test[PREDICTED_LABEL].value_counts(dropna=False))
            logger.debug("self.test.value_counts():")
            logger.debug(self.test[LABEL].value_counts(dropna=False))

        self.df = self.df[has_known_label].copy()
        logger.debug("self.df.value_counts():")
        logger.debug(self.df[LABEL].value_counts())

    def split_data(self):
        """
        Splits the labelled data into training and evaluation sets based on the provided mode.
        - The test set has already been extracted by filter_df; it is set to None if test_mode is False.
        - Data is split into training and evaluation sets if evaluation_mode is True, otherwise all
          labelled data is used for training and the evaluation set is None.
        """
        if not self.test_mode:
            self.test = None

        if self.evaluation_mode:
            train_part, eval_part = train_test_split(
                self.df,
                test_size=self.validation_split_percentage,
                random_state=self.random_state,
            )
            self.train = train_part
            # The evaluation frame is modified in place by the following steps, so make it independent.
            self.eval = eval_part.copy()
        else:
            self.train = self.df
            self.eval = None

    def deduplicate_data(self):
        """
        Deduplicates the training data based on the URL column.
        - Only the training data is deduplicated; evaluation and test data are left as they are.
        - Afterwards TITLE, BODY, AUTHOR, TIME and REPO are converted to string type in all active DataFrames.
        """
        self.dedup_train = self.train.sort_values(URL).drop_duplicates(subset=[URL]).copy()
        logger.debug(f"Number of dropped duplicate issues: {self.train.shape[0] - self.dedup_train.shape[0]}")

        # Ensure columns are strings so the text processing steps can rely on str values
        self._update_columns([TITLE, BODY, AUTHOR, TIME, REPO], lambda column: column.astype(str))

    def normalize_text(self):
        """
        Normalizes text data in the TITLE and BODY fields of all active DataFrames.
        - Replaces FUNCTION_REGEX matches in the body with " function ".
        - Replaces ISSUE_REGEX matches in title and body with " issue ".
        - Converts title and body to lowercase.
        """
        text_columns = [TITLE, BODY]

        logger.debug("Replacing functions...")
        self._update_columns(
            [BODY],
            lambda column: column.apply(lambda text: FUNCTION_REGEX.sub(" function ", text)),
        )

        logger.debug("Replacing issue numbers...")
        self._update_columns(
            text_columns,
            lambda column: column.apply(lambda text: ISSUE_REGEX.sub(" issue ", text)),
        )

        logger.debug("Converting to lower case...")
        self._update_columns(text_columns, lambda column: column.str.lower())

    def remove_extra_info(self):
        """
        Removes extraneous information from the text data of all active DataFrames.
        - Replaces every character of PUNCTUATIONS in title and body with a space.
        - Removes everything matched by ASCII_REGEX from title and body and applies NFD normalization.
        - Strips the fixed "https://api.github.com/repos/" prefix from the REPO column.
        - Collapses runs of whitespace in title and body into single spaces.
        """
        text_columns = [TITLE, BODY]

        logger.debug("Removing punctuation...")
        # Build the translation table once instead of once per column
        punctuation_table = str.maketrans(PUNCTUATIONS, " " * len(PUNCTUATIONS))
        self._update_columns(text_columns, lambda column: column.str.translate(punctuation_table))

        logger.debug("Removing non-ASCII characters...")
        self._update_columns(
            text_columns,
            lambda column: column.apply(lambda text: ud.normalize("NFD", re.sub(ASCII_REGEX, "", text))),
        )

        logger.debug("Replacing fixed part of REPO URl column...")
        self._update_columns(
            [REPO],
            lambda column: column.apply(lambda url: url.replace("https://api.github.com/repos/", "")),
        )

        logger.debug("Replacing white spaces...")
        self._update_columns(text_columns, lambda column: column.apply(lambda text: " ".join(text.split())))

    def truncate_columns(self):
        """
        Truncates the TITLE and BODY fields of all active DataFrames to a maximum number of words.
        - Keeps the first MAX_TITLE_LENGTH / MAX_BODY_LENGTH whitespace-separated words, joined by single spaces.
        """
        word_limits = ((TITLE, MAX_TITLE_LENGTH), (BODY, MAX_BODY_LENGTH))
        for frame in self._active_frames():
            for column, max_words in word_limits:
                frame[column] = frame[column].apply(
                    lambda text, limit=max_words: " ".join(text.split(maxsplit=limit)[:limit])
                )

    def extract_label_column(self): # TODO: function to reverse labels back to original ones/to the ones wanted for predictions!
        """
        Converts the label column into categorical data and extracts label codes into LABEL_COL.
        - The categories (and therefore the integer codes) are defined by the training data.
        - Evaluation and test data are encoded with the same categories, so a code always refers to the
          same label in every DataFrame; labels that do not occur in the training data get the code -1.
        - The code -> label mapping is stored in self.label_mapping_dict.
        """
        logger.debug("extract_label_column - START - value_counts() for dataframes:")
        self._log_label_counts()

        train_labels = pd.Categorical(self.dedup_train[LABEL])
        self.dedup_train[LABEL] = train_labels
        # Save original labels so the predictions of the model can be reverted later on
        self.label_mapping_dict = dict(enumerate(train_labels.categories))
        logger.debug(f"self.label_mapping_dict = {self.label_mapping_dict}")

        # Reuse the training categories: independent categoricals would number the labels of a frame
        # that lacks one category differently and silently contradict self.label_mapping_dict.
        for frame in self._active_frames()[1:]:
            frame[LABEL] = pd.Categorical(frame[LABEL], categories=train_labels.categories)

        for frame in self._active_frames():
            frame[LABEL_COL] = frame[LABEL].cat.codes
            logger.debug(frame[LABEL_COL].value_counts())

        logger.debug("extract_label_column - END - value_counts() for dataframes:")
        self._log_label_counts()

    def prepare_columns_for_model(self):
        """
        Prepares the final text column for model training.
        - Concatenates time, author, repo, title and body into a single TEXT_COL for model input.
        - Resets the index of every active DataFrame.
        """
        for frame in self._active_frames():
            frame[TEXT_COL] = (
                "time " + frame[TIME]
                + " author " + frame[AUTHOR]
                + " repo " + frame[REPO]
                + " title " + frame[TITLE]
                + " body " + frame[BODY]
            )
            frame.reset_index(drop=True, inplace=True)

        # Report the deduplicated frame, as that is the data actually used for training
        logger.debug(f"Number of train issues: {self.dedup_train.shape}")
        if self.evaluation_mode:
            logger.debug(f"Number of eval issues: {self.eval.shape}")
        if self.test_mode:
            logger.debug(f"Number of test issues: {self.test.shape}")

    def save_data(self):
        """
        Saves the TEXT_COL and LABEL_COL columns of the processed data to CSV files.
        - Training data is always saved; evaluation and test data only in the respective mode.
        - File paths are constructed using PROJECT_FOLDER and DATA_FOLDER constants.
        """
        exports = [(self.dedup_train, self.train_csv)]
        if self.evaluation_mode:
            exports.append((self.eval, self.eval_csv))
        if self.test_mode:
            exports.append((self.test, self.test_csv))

        for frame, csv_path in exports:
            frame[[TEXT_COL, LABEL_COL]].to_csv(csv_path, index=False)

    def process_data(self):
        """
        Executes the full data processing pipeline:
        - Create DataFrame, split into train/eval/test sets, deduplicate data.
        - Normalize text, clean extraneous information, and prepare columns for model input.
        - Export the result to CSV files if EXPORT_ML_DATA_CSVS is set.
        """
        pipeline = (
            ("create df", self.create_df),
            ("filter df", self.filter_df),
            ("split data", self.split_data),
            ("deduplicate data", self.deduplicate_data),
            ("normalize text", self.normalize_text),
            ("remove extra info from texts", self.remove_extra_info),
            ("truncate columns", self.truncate_columns),
            ("extract label column", self.extract_label_column),
            ("prepare columns for model", self.prepare_columns_for_model),
        )
        for description, step in pipeline:
            logger.debug(description)
            step()

        if EXPORT_ML_DATA_CSVS:
            self.save_data()

    def load_data(self):
        """
        Loads the processed training, evaluation, and test data from CSV files.
        - Training data is always loaded; evaluation and test data only in the respective mode.
        - File paths are constructed using PROJECT_FOLDER and DATA_FOLDER constants.

        :raises FileNotFoundError: If a required CSV file does not exist.
        """
        self.dedup_train = self._read_csv_or_raise(self.train_csv, "Training")

        if self.evaluation_mode:
            self.eval = self._read_csv_or_raise(self.eval_csv, "Evaluation")

        if self.test_mode:
            self.test = self._read_csv_or_raise(self.test_csv, "Test")

    def get_train_data(self):
        """
        Returns the (deduplicated) training data DataFrame, or None if no data has been processed or loaded yet.
        """
        return self.dedup_train
    
    def get_eval_data(self):
        """
        Returns the evaluation data DataFrame, or None if there is none.
        """
        return self.eval
    
    def get_test_data(self):
        """
        Returns the test data DataFrame, or None if there is none.
        """
        return self.test

    def get_label_mapping_dict(self):
        """
        Returns the dictionary that maps the integer label codes to the label names of the training data.
        It is empty until extract_label_column has run.
        """
        return self.label_mapping_dict

In [None]:
# USER_NAME = "vaadin"
# REPO_NAME = "flow"

# USER_NAME = "grafana"
# REPO_NAME = "grafana"

In [None]:
# repoHandler = RepoDataHandler(user_name=USER_NAME, repository_name=REPO_NAME, token=GITHUB_PAT)
# repoHandler.load_repo_data_config()
# repoHandler.load_data()
# if FETCH_NEW_ISSUES:
#     repoHandler.fetch_issues()
#     repoHandler.save_data(partial=True)
# issue_data = repoHandler.get_issues()

# repoHandler.load_label_config()
# label_names = repoHandler.get_label_dict()
# logger.debug(label_names)

# dataProcessor = DataProcessor(data=issue_data, label_names=label_names, user_name=USER_NAME, repository_name=REPO_NAME)
# dataProcessor.process_data()

## ML

In [None]:
# # Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Load tokenizer
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# # Number of labels
# num_labels = 3  # Adjust as per your use case

# # Load model configuration with correct number of labels
# model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)

# # Load the state dictionary from your .bin file
# state_dict = torch.load(PRETRAINED_MODEL_FILE_NAME, map_location=device)

# # Load the state dict into the model
# model.load_state_dict(state_dict, strict=False)

# # Move model to the device (CPU or GPU)
# model.to(device)

In [None]:
class IssueLabelPredictor:
    def __init__(self,
                 model_name="roberta",
                 model_version="roberta-base",
                 learning_rate=3e-5,
                 epochs=4,
                 # batch_size=100,
                 batch_size=50,
                 max_seq_length=200,
                 num_labels=3, # len(label_names)
                 output_dir=os.path.join(RESULTS_FOLDER, FINETUNED_MODEL_FOLDER),
                 use_cuda=torch.cuda.is_available(),
                 use_fine_tuned_model=False,
                 pretrained_model_path=os.path.join(PROJECT_FOLDER, MODEL_FOLDER, PRETRAINED_MODEL_FILE_NAME),
                 finetuned_model_path=os.path.join(PROJECT_FOLDER, MODEL_FOLDER, FINETUNED_MODEL_FOLDER, FINETUNED_MODEL_FILE_NAME)
                 ):
        """
        Stores the training/inference settings and loads model and tokenizer.

        :param model_name: Name of the model architecture.
        :param model_version: Model identifier handed to from_pretrained for tokenizer and base model.
        :param learning_rate: Learning rate for training.
        :param epochs: Number of training epochs.
        :param batch_size: Batch size for training, evaluation and prediction.
        :param max_seq_length: Sequence length the tokenizer pads/truncates inputs to.
        :param num_labels: Number of target labels/classes.
        :param output_dir: Output directory handed to the Trainer (checkpoints and logs).
        :param use_cuda: Request GPU acceleration; only honoured if CUDA is actually available.
        :param use_fine_tuned_model: Whether to load a fine-tuned model or a pretrained one.
        :param pretrained_model_path: Path to the pretrained model (directory or weights file).
        :param finetuned_model_path: Path to the fine-tuned model file.
        """
        # Model identity and where its weights live
        self.model_name = model_name
        self.model_version = model_version
        self.num_labels = num_labels
        self.use_fine_tuned_model = use_fine_tuned_model
        self.pretrained_model_path = pretrained_model_path
        self.finetuned_model_path = finetuned_model_path
        self.output_dir = output_dir

        # Hyperparameters
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length

        # A GPU is only used when it was requested and is present on this machine
        self.use_cuda = use_cuda and torch.cuda.is_available()
        device_name = "cuda" if self.use_cuda else "cpu"
        self.device = torch.device(device_name)

        # Everything above must be set before the model can be loaded
        self._init_model_and_tokenizer()
    
    def _init_model_and_tokenizer(self):
        """
        Initializes the tokenizer and the classification model, moves the model to self.device
        and puts it into eval mode.

        The model is taken from the first source that is available:
        1. The fine-tuned model saved by train(), if use_fine_tuned_model is set.
        2. The pretrained model at pretrained_model_path (a save_pretrained directory or a state-dict file).
        3. A new model initialized from model_version.
        """
        # Initialize tokenizer
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_version)

        # train() saves with save_pretrained() into the *directory* of finetuned_model_path and
        # from_pretrained() expects such a directory, so resolve a file path to its parent folder.
        finetuned_dir = self.finetuned_model_path
        if finetuned_dir and not os.path.isdir(finetuned_dir):
            finetuned_dir = os.path.dirname(finetuned_dir)
        # save_pretrained() always writes a config.json; its presence marks a loadable model directory
        # regardless of the weight file format (pytorch_model.bin or model.safetensors).
        finetuned_model_available = bool(finetuned_dir) and os.path.isfile(os.path.join(finetuned_dir, "config.json"))

        if self.use_fine_tuned_model and not finetuned_model_available:
            logger.warning(f"No fine-tuned model found for {self.finetuned_model_path}, falling back to the pretrained model")

        # Load the fine-tuned model if specified
        if self.use_fine_tuned_model and finetuned_model_available:
            self.model = RobertaForSequenceClassification.from_pretrained(finetuned_dir)
            logger.info(f"Loaded fine-tuned model from {finetuned_dir}")

        elif self.pretrained_model_path and os.path.exists(self.pretrained_model_path):
            if os.path.isdir(self.pretrained_model_path):
                self.model = RobertaForSequenceClassification.from_pretrained(
                    self.pretrained_model_path,
                    num_labels=self.num_labels
                )
                self.tokenizer = RobertaTokenizer.from_pretrained(self.pretrained_model_path)
                logger.info(f"Loaded pretrained model from directory {self.pretrained_model_path}")
            else:
                # Load model from pytorch_model.bin file
                self.model = RobertaForSequenceClassification.from_pretrained(
                    self.model_version,
                    num_labels=self.num_labels
                )
                # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
                state_dict = torch.load(self.pretrained_model_path, map_location=self.device)

                # strict=False tolerates key mismatches; report them, because mismatched layers
                # silently keep their freshly initialized weights.
                load_result = self.model.load_state_dict(state_dict, strict=False)
                if load_result.missing_keys or load_result.unexpected_keys:
                    logger.warning(
                        f"State dict mismatch for {self.pretrained_model_path}: "
                        f"missing keys {load_result.missing_keys}, unexpected keys {load_result.unexpected_keys}"
                    )
                logger.info(f"Loaded pretrained model from file {self.pretrained_model_path}")
        else:
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.model_version,
                num_labels=self.num_labels
            )
            logger.info(f"Initialized new model from {self.model_version}")

        self.model.to(self.device)
        self.model.eval()

    def train(self, train_df, eval_df=None):
        """
        Fine-tunes the model on train_df with the Hugging Face Trainer and saves model and tokenizer
        into the directory of finetuned_model_path.

        :param train_df: DataFrame with the TEXT_COL and LABEL_COL columns used for training.
        :param eval_df: Optional DataFrame with the same columns. If given, evaluation and checkpointing
                        run once per epoch and the checkpoint with the best accuracy is loaded at the end.
        """
        from transformers import Trainer, TrainingArguments

        has_eval_data = eval_df is not None

        def compute_metrics(pred):
            # Accuracy and micro-averaged F1 of the argmax predictions
            predicted = pred.predictions.argmax(-1)
            expected = pred.label_ids
            return {
                "accuracy": accuracy_score(expected, predicted),
                "f1_micro": f1_score(expected, predicted, average='micro', zero_division=0)
            }

        # Prepare datasets
        train_dataset = self._prepare_dataset(train_df)
        eval_dataset = self._prepare_dataset(eval_df) if has_eval_data else None

        # Without evaluation data there is nothing to evaluate, hence no per-epoch checkpoints either
        strategy = "epoch" if has_eval_data else "no"

        training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            learning_rate=self.learning_rate,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            evaluation_strategy=strategy,
            save_strategy=strategy,
            load_best_model_at_end=has_eval_data,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            logging_dir=os.path.join(self.output_dir, "logs"),
            logging_steps=50,
            use_cpu=not self.use_cuda,
            report_to="none"  # Disable wandb and other logging integrations
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics if has_eval_data else None
        )
        trainer.train()

        # Persist model and tokenizer next to each other so they can be reloaded together
        save_dir = os.path.dirname(self.finetuned_model_path)
        self.model.save_pretrained(save_dir)
        self.tokenizer.save_pretrained(save_dir)
        

    def _prepare_dataset(self, df):
        """
        Tokenizes a DataFrame into the per-example feature dictionaries handed to the Trainer.

        :param df: DataFrame providing the TEXT_COL and LABEL_COL columns, or None.
        :return: List with one dictionary per row (tokenizer outputs plus a 'labels' tensor), or None if df is None.
        """
        if df is None:
            return None

        text_values = df[TEXT_COL].tolist()
        label_values = df[LABEL_COL].tolist()

        # Every sequence is padded/truncated to max_seq_length
        tokenized = self.tokenizer(
            text_values,
            max_length=self.max_seq_length,
            truncation=True,
            padding='max_length'
        )

        def build_example(row, label):
            # One dictionary per row: the tokenizer fields of that row plus its label
            example = {name: torch.tensor(values[row]) for name, values in tokenized.items()}
            example['labels'] = torch.tensor(label)
            return example

        return [build_example(row, label) for row, label in enumerate(label_values)]

    def predict(self, df):
        """
        Predicts labels for the provided DataFrame.

        The texts are tokenized in one call and kept where the tokenizer put
        them; only the batch currently being scored is moved to
        ``self.device``. Device memory use is therefore bounded by
        ``self.batch_size`` instead of growing with the size of ``df``.

        :param df: DataFrame containing the TEXT_COL column.
        :return: Copy of ``df`` with an additional PREDICTED_LABEL column that
            holds the arg-max class index of the logits for every row.
        """
        self.model.eval()
        texts = df[TEXT_COL].tolist()

        # Tokenize the texts (every row padded/truncated to max_seq_length)
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            max_length=self.max_seq_length,
            truncation=True,
            padding="max_length"
        )

        # Batch first, transfer later: the tensors are moved to the device
        # batch by batch inside the loop below.
        dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)

        all_predictions = []

        with torch.no_grad():
            for input_ids, attention_mask in dataloader:
                outputs = self.model(
                    input_ids=input_ids.to(self.device),
                    attention_mask=attention_mask.to(self.device),
                )
                predictions = torch.argmax(outputs.logits, dim=-1)
                all_predictions.extend(predictions.cpu().numpy())

        # Work on a copy so the caller's DataFrame is left untouched
        df = df.copy()
        df[PREDICTED_LABEL] = all_predictions
        # logger.debug(f"Predictions: {df.head()}")
        return df

    def evaluate(self, df):
        """
        Evaluates the model on the provided DataFrame.

        Per-class accuracy is reported under the actual label value of each
        class (not its row position in the confusion matrix), so the output
        stays correct when the labels present are not exactly 0..k-1. A class
        that was predicted but never occurs in the true labels is reported as
        ``nan``.

        :param df: DataFrame containing the TEXT_COL and LABEL_COL columns.
        :return: Classification report followed by per-class accuracy, as a string.
        """
        df_with_predictions = self.predict(df)
        y_true = df_with_predictions[LABEL_COL]
        y_pred = df_with_predictions[PREDICTED_LABEL]

        # Fix the row/column order of the confusion matrix explicitly so that
        # row i can be attributed to class_labels[i].
        class_labels = sorted(set(y_true) | set(y_pred))

        report = classification_report(y_true, y_pred, zero_division=0)
        # `or None` falls back to the library's own label inference when there
        # is no data, which is what the call did before labels were passed.
        cm = confusion_matrix(y_true, y_pred, labels=class_labels or None)
        row_totals = cm.sum(axis=1)

        # Prepare detailed report
        parts = [f"Classification Report:\n{report}\nPer-Class Accuracy:\n"]
        for idx, label in enumerate(class_labels):
            total = row_totals[idx]
            # A row without true samples has no defined accuracy
            acc = cm[idx, idx] / total if total else float("nan")
            parts.append(f"Class {label}: {acc:.2f}\n")

        return "".join(parts)

    def analyze_predictions(self, df):
        """
        Shows a bar chart of how often each label was predicted.

        :param df: DataFrame containing the PREDICTED_LABEL column.
        """
        label_counts = df[PREDICTED_LABEL].value_counts()

        # Draw the frequencies on an explicit axes object, then display it
        _, axis = plt.subplots(figsize=(10, 6))
        label_counts.plot(kind="bar", ax=axis)
        axis.set_title("Class Distribution of Predicted Labels")
        axis.set_xlabel("Class")
        axis.set_ylabel("Frequency")
        plt.show()

        # # Sampling data for manual evaluation
        # sample_size = 10  # Number of samples to pick per class
        # sampled_data = df.groupby("predicted_label").apply(
        #     lambda x: x.sample(min(len(x), sample_size))
        # )

        # logger.debug("\nSampled Data for Manual Evaluation:\n", sampled_data[["predicted_label", "text"]])

    def save_predictions(self, df, output_file_path=os.path.join(PROJECT_FOLDER, RESULTS_FOLDER, "predictions.csv")):
        """
        Saves the predictions to a CSV file.

        When ``output_file_path`` is a path, its parent directory is created
        if it does not exist yet, so a fresh checkout without a results folder
        does not fail on the first export.

        :param df: DataFrame containing predictions.
        :param output_file_path: File path to save the predictions.
        """
        # Only path-like targets have a directory to create; anything else
        # (e.g. an open buffer) is handed to to_csv unchanged.
        if isinstance(output_file_path, (str, os.PathLike)):
            target_dir = os.path.dirname(output_file_path)
            if target_dir:  # empty for a bare file name in the working directory
                os.makedirs(target_dir, exist_ok=True)

        df.to_csv(output_file_path, index=False)
        logger.debug(f"Predictions saved to {output_file_path}")

In [None]:
# USER_NAME = "vaadin"
# REPO_NAME = "flow"

# USER_NAME = "grafana"
# REPO_NAME = "grafana"

In [None]:
# --- Load repository data -------------------------------------------------
repoHandler = RepoDataHandler(
    user_name=USER_NAME,
    repository_name=REPO_NAME,
    token=GITHUB_PAT,
)
# repoHandler.load_repo_data_config(config_file=REPO_DATA_CONFIG_FILE_NAME) # os.path.join(PROJECT_FOLDER, CONFIG_FOLDER, REPO_DATA_CONFIG_FILE_NAME)
repoHandler.load_repo_data_config()
repoHandler.load_data()

# Optionally fetch issues from GitHub and persist them before continuing
if FETCH_NEW_ISSUES:
    repoHandler.fetch_issues()
    repoHandler.save_data(partial=True)

issue_data = repoHandler.get_issues()
logger.debug(f"Length of issue data: {len(issue_data)}")

# --- Load label configuration ---------------------------------------------
repoHandler.load_label_config()
label_names = repoHandler.get_label_dict()
logger.debug(label_names)

# --- Build the data splits ------------------------------------------------
dataProcessor = DataProcessor(
    data=issue_data,
    label_names=label_names,
    user_name=USER_NAME,
    repository_name=REPO_NAME,
)
dataProcessor.process_data()

# Reset all splits first so a re-run of this cell never sees stale frames
train_df = None
eval_df = None
test_df = None

# Each split may be None; its shape is only logged when it exists
train_df = dataProcessor.get_train_data()
if train_df is not None:
    logger.debug(f"Shape of training data frame: {train_df.shape}")

eval_df = dataProcessor.get_eval_data()
if eval_df is not None:
    logger.debug(f"Shape of evaluation data frame: {eval_df.shape}")

test_df = dataProcessor.get_test_data()
if test_df is not None:
    logger.debug(f"Shape of test data frame: {test_df.shape}")

label_mapping_dict = dataProcessor.get_label_mapping_dict()
logger.debug(f"Label mapping dict: {label_mapping_dict}")

# Second handle on the evaluation split, used by the baseline run
eval_df_copy = dataProcessor.get_eval_data()

In [None]:
# Toggle: True scores a predictor on which train() was never called,
# False fine-tunes a predictor on the training split.
BASELINE = False

if BASELINE:
    # NOTE(review): `predictor` is only bound in the fine-tuning branch below,
    # but later cells reference it - confirm how they should behave here.
    baseline_predictor = IssueLabelPredictor()

    # evaluate() runs the prediction itself, so no separate predict() call is
    # needed; the None check guards the only use of the evaluation split.
    if eval_df_copy is not None:
        report = baseline_predictor.evaluate(eval_df_copy)
        try:
            eval_df_copy[LABEL_COL] = eval_df_copy[LABEL_COL].map(label_mapping_dict) # TODO: TEST
        except Exception as e:
            logger.error(f"When trying to map labels for evaluation DataFrame Error {e} occured")
        logger.debug(report)
else:
    # Initialize the predictor
    predictor = IssueLabelPredictor()

    # Fine-tune the model
    predictor.train(train_df, eval_df=eval_df)

In [None]:
report = None

# Evaluate the model
# (a split can be None, so check that before taking its length)
if eval_df is not None and len(eval_df) > 0:
    report = predictor.evaluate(eval_df)
    try:
        # Map the label column through label_mapping_dict after the report is built
        eval_df[LABEL_COL] = eval_df[LABEL_COL].map(label_mapping_dict) # TODO: TEST
    except Exception as e:
        logger.error(f"When trying to map labels for evaluation DataFrame Error {e} occured")
    logger.debug(report)

# Make predictions on test data
if test_df is not None and len(test_df) > 0:
    test_df = predictor.predict(test_df)
    try:
        test_df[PREDICTED_LABEL] = test_df[PREDICTED_LABEL].map(label_mapping_dict)
    except Exception as e:
        logger.error(f"When trying to map labels for test DataFrame Error {e} occured")

    if EXPORT_PREDICTIONS:
        predictor.save_predictions(test_df)

    # TODO: delete START
    repoHandler.load_results_dataframe(df=test_df)
    try:
        repoHandler.save_predictions_to_issue_data()
    except Exception as e:
        logger.error(f"Updating issue data failed: {e}")
    # TODO: delete END

    # # TODO: uncomment START
    # if UPDATE_ISSUE_DATA or UPDATE_GITHUB_LABELS:
    #     test_df[PREDICTED_LABEL] = test_df[PREDICTED_LABEL].map(repoHandler.get_prediction_labels()) # TODO: TEST
    #     repoHandler.load_results_dataframe(df=test_df)
    #     if UPDATE_ISSUE_DATA and UPDATE_GITHUB_LABELS:
    #         try:
    #             repoHandler.save_predictions_to_issue_data_and_update_issues()
    #         except Exception as e:
    #             logger.error(f"Updating issues failed: {e}")
    #     if UPDATE_ISSUE_DATA:
    #         try:
    #             repoHandler.save_predictions_to_issue_data()
    #         except Exception as e:
    #             logger.error(f"Updating issue data failed: {e}")
    # # TODO: uncomment END

    # # If test data is unlabeled
    # predictor.analyze_predictions(test_df)

In [None]:
# Frequency of each value in the PREDICTED_LABEL column of the test split.
# Requires that predict() has already added that column to test_df.
test_df[PREDICTED_LABEL].value_counts()

In [None]:
# Export the NUMBER and PREDICTED_LABEL columns of the test split to CSV when
# exporting is enabled and there is something to export.
if test_df is not None and len(test_df) > 0 and EXPORT_PREDICTIONS:
    test_df_export = test_df[[NUMBER, PREDICTED_LABEL]]
    test_df_export.to_csv(os.path.join(PROJECT_FOLDER, RESULTS_FOLDER, "predictions.csv"), index=False)
    # test_df_export.to_csv(os.path.join(PROJECT_FOLDER, RESULTS_FOLDER, f"{USER_NAME}_{REPO_NAME}_predictions.csv"), index=False)
    # test_df_export.to_csv(os.path.join(PROJECT_FOLDER, RESULTS_FOLDER, f"{USER_NAME}_{REPO_NAME}_predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"), index=False)

In [None]:
# Write the predictions back into the stored issue data
# (test_df can be None, so check that before taking its length)
if UPDATE_ISSUE_DATA and test_df is not None and len(test_df) > 0:
    logger.debug("Value counts for results dataframe:")
    logger.debug(test_df[PREDICTED_LABEL].value_counts())
    repoHandler.save_predictions_to_issue_data()

# Pass the test split to the handler's update_issues when label updates are enabled
if UPDATE_GITHUB_LABELS:
    repoHandler.update_issues(df=test_df)

In [None]:
# TEST_USER_NAME = "test"
# TEST_REPO_NAME = "test"

In [None]:
# # Initialize your RepoDataHandler with your test repository
# handler = RepoDataHandler(
#     user_name=TEST_USER_NAME,
#     repository_name=TEST_REPO_NAME,
#     token=GITHUB_PAT
# )

# # Ensure GitHub API setup
# handler.setup_github_api()

# if UPDATE_ISSUE_DATA and len(test_df) > 0:
#     logger.debug("Value counts for results dataframe:")
#     logger.debug(test_df[PREDICTED_LABEL].value_counts())
#     repoHandler.save_predictions_to_issue_data()

# if UPDATE_GITHUB_LABELS: repoHandler.update_issues(df=test_df) # TODO: test by removing some labels for issues of my repo and then test the whole setup, including adding predicted labels to github repo
        
#     # csv_path = "../results/predictions.csv"  # Path to the CSV file with results
#     # handler.load_results_dataframe(csv_path=csv_path)  # Load predictions into handler


# def sync_issues_with_test_repo(handler):
#     counter = 0
#     for _, row in handler.results_df.iterrows():
#         if counter >= 10: return
#         try:
#             # Check if the issue already exists by title
#             existing_issues = list(handler.repo.get_issues(state="all"))
#             matched_issue = next((issue for issue in existing_issues if issue.title == row["title"]), None)

#             if matched_issue:
#                 print(f"Issue {matched_issue.number} already exists in the test repo. Updating labels.")
#                 handler.update_issue(issue_number=matched_issue.number, predicted_label=row[PREDICTED_LABEL])
#             else:
#                 # Create a new issue if not found
#                 new_issue = handler.repo.create_issue(
#                     title=row["title"],
#                     body=row["description"]
#                 )
#                 print(f"Created new issue {new_issue.number} from original issue {row['number']}")
#                 handler.update_issue(issue_number=new_issue.number, predicted_label=row[PREDICTED_LABEL])
#                 counter += 1
            
#             print(f"Updated issue {row['number']} in test repo.")
#         except Exception as e:
#             print(f"Error handling issue {row['number']}: {e}")

# # sync_issues_with_test_repo(handler)

# # TODO: update json file with new pushed labels, so next time only new unlabeled issues will have predictions done for (also introduce parameter that saves if the label was hand-made or a prediction of this model and also implement an option to only use hand-made labels for fine-tuning and ignore the issues with predicted labels completely)