# Additionnal Metrics
As per the specification of the personnal project, we'll try and gather additionnal metrics to improve our model. 
## 1. Lines added & deleted from a given version
We'll begin by creating a dictionnary of versions and corresponding commits from the file `Hive_Last_Commits.csv`previously created

In [9]:
import os
import glob
import re
import csv
import pandas as pd
from pathlib import Path
from collections import defaultdict
import subprocess
from datetime import datetime
from statistics import mean
import logging
from datetime import datetime
from statistics import mean
import pytz
import git
from typing import Iterable, Dict, Tuple
import math

In [3]:
project_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/")
hive_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive/")

In [3]:
last_commits = open(os.path.join(project_repo, "Hive_Last_Commits.csv"), "r")

versions = []
commits_by_version = {}

for i, line in enumerate(last_commits.readlines()):
    if i == 0:
        continue

    parts = line.strip().split(",")
    version = parts[0]
    commit = parts[1]

    versions.append(version)
    commits_by_version[version] = [commit]  
last_commits.close()

print(commits_by_version)

{'2.0.0': ['7f9f1fcb8697fb33f0edc2c391930a3728d247d7'], '2.0.1': ['e3cfeebcefe9a19c5055afdcbb00646908340694'], '2.1.0': ['9265bc24d75ac945bde9ce1a0999fddd8f2aae29'], '2.1.1': ['1af77bbf8356e86cabbed92cfa8cc2e1470a1d5c'], '2.2.0': ['da840b0f8fa99cab9f004810cd22abc207493cae'], '2.3.0': ['6f4c35c9e904d226451c465effdc5bfd31d395a0'], '2.3.1': ['7590572d9265e15286628013268b2ce785c6aa08'], '2.3.2': ['857a9fd8ad725a53bd95c1b2d6612f9b1155f44d'], '2.3.3': ['3f7dde31aed44b5440563d3f9d8a8887beccf0be'], '2.3.4': ['56acdd2120b9ce6790185c679223b8b5e884aaf2'], '2.3.5': ['76595628ae13b95162e77bba365fe4d2c60b3f29'], '2.3.6': ['2c2fdd524e8783f6e1f3ef15281cc2d5ed08728f'], '2.3.7': ['cb213d88304034393d68cc31a95be24f5aac62b6'], '2.3.8': ['f1e87137034e4ecbe39a859d4ef44319800016d7'], '2.3.9': ['92dd0159f440ca7863be3232f3a683a510a62b9d'], '2.3.10': ['5160d3af392248255f68e41e1e0557eae4d95273'], '3.0.0': ['ce61711a5fa54ab34fc74d86d521ecaeea6b072a'], '3.1.0': ['bcc7df95824831a8d2f1524e4048dfc23ab98c19'], '3.1.1':

In [None]:
UND_hive_updated_directory = project_repo / "UND_hive_updated_data"
output_directory = project_repo / "UND_hive_additional_metrics"
os.makedirs(output_directory, exist_ok=True)

csv_files = sorted([f for f in os.listdir(UND_hive_updated_directory) if f.endswith('.csv')])

for file in csv_files:
    df = pd.read_csv(UND_hive_updated_directory / file)
    df_version = file.split("_")[1]

    print(f"\n=== Processing version: {df_version} ===")

    for another_file in csv_files:
        another_version = another_file.split("_")[1]
    
        df[f"LinesAddedSince{another_version}"] = 0
        print(f"LinesAddedSince{another_version} added to version {df_version}")
        df[f"LinesRemovedSince{another_version}"] = 0
        print(f"LinesRemovedSince{another_version} added to version {df_version}")


        if another_version >= df_version: 
            continue
            
        print(f"  Comparing with earlier version: {another_version}")
        another_df = pd.read_csv(os.path.join(UND_hive_updated_directory, another_file))

        for index, row in df.iterrows():
            file_name = row["FileName"]
            line_count = row["CountLine"]

            matching_rows = another_df[another_df["FileName"] == file_name]

            if not matching_rows.empty:
                another_line_count = matching_rows.iloc[0]["CountLine"]
                print(f"    - {file_name} found in version {another_version}")

                if line_count > another_line_count:
                    added_lines = line_count - another_line_count
                    df.loc[index, f"LinesAddedSince{another_version}"] = added_lines
                    print(f"      Lines added: {added_lines}")
                elif line_count < another_line_count:
                    removed_lines = another_line_count - line_count
                    df.loc[index, f"LinesRemovedSince{another_version}"] = removed_lines
                    print(f"      Lines removed: {removed_lines}")

    output_path = output_directory / f"UND_{df_version}.csv"
    df.to_csv(output_path, index=False)
    print(f"=== Updated file saved as {output_path} ===\n")

## 2. Commits

### 2.1 Mapping Developper Experienmce
First, we'll map all developers who have worked on the software and calculate their experience up to each version. Developer experience will be defined as the number of commits they have made before the current version.

In [8]:
repository = git.Repo(hive_repo)

developer_experience = defaultdict(int)
co_author_pattern = re.compile(r'^Co-authored-by:\s+.*<([^>]+)>$', re.IGNORECASE)

developer_experience = defaultdict(int)

def normalize_email(email):
    """Strip whitespace and convert email to lowercase."""
    return email.strip().lower() if email else None

def extract_emails_from_commit(commit):
    """Extract all relevant emails from a commit."""
    emails = set()

    for role in ['author', 'committer']:
        person = getattr(commit, role, None)
        if person and hasattr(person, 'email'):
            email = normalize_email(person.email)
            if email:
                emails.add(email)

    for line in commit.message.splitlines():
        match = co_author_pattern.match(line)
        if match:
            co_author_email = normalize_email(match.group(1))
            if co_author_email:
                emails.add(co_author_email)

    return emails

for commit in repository.iter_commits():
    developers = extract_emails_from_commit(commit)
    for developer in developers:
        developer_experience[developer] += 1

developer_experience = dict(developer_experience)

for developer, commits in developer_experience.items():
    print(f"{developer}: {commits} commit(s)")

dengzh@apache.org: 80 commit(s)
rameshkumarthangarajan@gmail.com: 24 commit(s)
3898450+cxzl25@users.noreply.github.com: 2 commit(s)
dkuzmenko@cloudera.com: 115 commit(s)
yuwq1996@gmail.com: 32 commit(s)
zratkai@cloudera.com: 3 commit(s)
643348094@qq.com: 1 commit(s)
shivjha@visa.com: 1 commit(s)
55212670+mayankkunwar@users.noreply.github.com: 6 commit(s)
kiranvelumuri369@gmail.com: 3 commit(s)
louwazor@gmail.com: 1 commit(s)
dkuzmenko@apache.org: 23 commit(s)
kkasa@cloudera.com: 118 commit(s)
deniskuzz@gmail.com: 35 commit(s)
raghavaggarwal03.ra@gmail.com: 11 commit(s)
55709772+scarlin-cloudera@users.noreply.github.com: 23 commit(s)
daya.apache@gmail.com: 7 commit(s)
mdayakar@cloudera.com: 8 commit(s)
ayushsaxena@apache.org: 149 commit(s)
bodorlaszlo0202@gmail.com: 164 commit(s)
beter.max@gmail.com: 7 commit(s)
git@okumin.com: 25 commit(s)
lngsg@postech.ac.kr: 19 commit(s)
dmitriy.fingerman@gmail.com: 21 commit(s)
iamsbadhya@gmail.com: 18 commit(s)
dengzhhu653@gmail.com: 88 commit(s)
3

In [10]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def run_git_command(command, repo_path):
    """
    Executes a git command in the specified repository path.
    """
    logging.debug(f"Executing Git command: {command} in {repo_path}")
    try:
        result = subprocess.check_output(
            command, shell=True, text=True, stderr=subprocess.STDOUT, cwd=repo_path
        )
        logging.debug(f"Git command output: {result}")
        return result.strip().splitlines()
    except subprocess.CalledProcessError as e:
        logging.error(f"Error running command: {command}\n{e.output}")
        return []

def calculate_time_differences(timestamps):
    """
    Calculates the mean time difference in seconds between consecutive timestamps.
    """
    if not timestamps or len(timestamps) < 2:
        return None
    timestamps.sort()
    diffs = [(timestamps[i] - timestamps[i-1]).total_seconds() for i in range(1, len(timestamps))]
    return mean(diffs) if diffs else None

def get_file_names(csv_path):
    """
    Reads a CSV file and returns a list of filenames from the 'FileName' column.
    """
    try:
        file_df = pd.read_csv(csv_path)
        if 'FileName' not in file_df.columns:
            logging.warning(f"'FileName' column not found in {csv_path}.")
            return []
        return file_df['FileName'].dropna().tolist()  
    except Exception as e:
        logging.error(f"Error reading file {csv_path}: {e}")
        return []

def calculate_expertise( developers: Iterable[str], developer_experience: Dict[str, int]) -> Tuple[float, float]:
    """
    Calculates the average and minimum expertise of developers based on the developer_experience dictionary.
    """
    experience_levels = [developer_experience.get(dev, 0) for dev in developers]

    if experience_levels:
        avg_expertise = sum(experience_levels) / len(experience_levels)
        min_expertise = min(experience_levels)
        return avg_expertise, min_expertise
    else:
        return math.nan, math.nan

In [12]:
def main():
    if not os.path.isdir(project_repo):
        logging.error(f"Project repository path does not exist: {project_repo}")
        return
    if not os.path.isdir(hive_repo):
        logging.error(f"Hive repository path does not exist: {hive_repo}")
        return

    version_file = "Hive_Last_Commits.csv"
    version_file_path = os.path.join(project_repo, version_file)

    try:
        versions = pd.read_csv(version_file_path)
        if not {'Version', 'Commit Hash'}.issubset(versions.columns):
            logging.error(f"'Version' and/or 'Commit Hash' columns missing in {version_file_path}.")
            return
    except Exception as e:
        logging.error(f"Error reading version file {version_file_path}: {e}")
        return

    file_pattern = os.path.join(project_repo, "UND_hive_additional_metrics", "*")
    file_paths = glob.glob(file_pattern)

    if not file_paths:
        logging.warning(f"No files found matching pattern: {file_pattern}")
        return

    keywords = ["fix", "bug", "resolve", "patch", "issue", "correct"]

    git_format = "%h%x1f%ae%x1f%ce%x1f%at"

    for file_path in file_paths:
        files = get_file_names(file_path)
        if not files:
            logging.info(f"No files to process in {file_path}.")
            continue

        for file in files:
            target_file_path = os.path.join(project_repo, "UND_hive_additional_metrics", file)
            if not os.path.isfile(target_file_path):
                logging.warning(f"Target file does not exist: {target_file_path}")
                continue

            for i in range(len(versions)):
                version = versions.iloc[i].get('Version')
                commit_hash = versions.iloc[i].get('Commit Hash')
                if pd.isna(version) or pd.isna(commit_hash):
                    logging.warning(f"Missing 'Version' or 'Commit Hash' in row {i} of {version_file_path}.")
                    continue

                previous_commit = versions.iloc[i-1].get('Commit Hash') if i > 0 else None

                logging.info(f"\n====== Processing file: {file} in version: {version} ======")

                commit_range = f"{previous_commit}..{commit_hash}" if previous_commit else commit_hash

                git_command = (
                    f"git log --oneline --format='{git_format}' {commit_range} -- \"{target_file_path}\""
                )
                commits = run_git_command(git_command, hive_repo)

                timestamps = []
                developers = set()
                for commit in commits:
                    parts = commit.split('\x1f')
                    if len(parts) != 3:
                        logging.debug(f"Malformed commit entry: {commit}")
                        continue
                    _, dev, ts = parts
                    dev = dev.strip().lower() 
                    try:
                        timestamp = datetime.fromtimestamp(int(ts))
                        timestamps.append(timestamp)
                        developers.add(dev)
                    except ValueError:
                        logging.debug(f"Invalid timestamp in commit: {commit}")
                        continue

                all_commits_command = f"git log --oneline --format='{git_format}' {commit_hash} -- \"{target_file_path}\""
                all_commits = run_git_command(all_commits_command, hive_repo)

                all_timestamps = []
                all_developers = set()
                for commit in all_commits:
                    parts = commit.split('\x1f')
                    if len(parts) != 3:
                        logging.debug(f"Malformed commit entry: {commit}")
                        continue
                    _, dev, ts = parts
                    dev = dev.strip().lower()  
                    try:
                        timestamp = datetime.fromtimestamp(int(ts))
                        all_timestamps.append(timestamp)
                        all_developers.add(dev)
                    except ValueError:
                        logging.debug(f"Invalid timestamp in commit: {commit}")
                        continue

                grep_keywords = '|'.join(keywords)
                bug_fixes_command = (
                    f"git log --oneline --grep='{grep_keywords}' "
                    f"--format='{git_format}' {commit_range} -- \"{target_file_path}\""
                )
                bug_fixes = run_git_command(bug_fixes_command, hive_repo)

                avg_time_between_changes = calculate_time_differences(timestamps)
                avg_time_all_versions = calculate_time_differences(all_timestamps)

                avg_expertise, min_expertise = calculate_expertise(developers, developer_experience)

                logging.info(f"Commits in version {version}: {len(commits)}")
                logging.info(f"Commits up to version {version}: {len(all_commits)}")
                logging.info(f"Bug-fix commits in version {version}: {len(bug_fixes)}")
                logging.info(f"Developers in version {version}: {developers}")
                logging.info(f"Developers up to version {version}: {len(all_developers)}")
                logging.info(f"Average time between changes in version {version}: {avg_time_between_changes} seconds")
                logging.info(f"Average time between changes in all versions: {avg_time_all_versions} seconds")
                logging.info(f"Average developer expertise in version {version}: {avg_expertise}")
                logging.info(f"Minimum developer expertise in version {version}: {min_expertise}\n\n\n")

if __name__ == "__main__":
    main()


2024-12-03 17:17:57,013 - INFO - 
2024-12-03 17:17:57,264 - INFO - Commits in version 2.0.0: 2
2024-12-03 17:17:57,264 - INFO - Commits up to version 2.0.0: 2
2024-12-03 17:17:57,265 - INFO - Bug-fix commits in version 2.0.0: 0
2024-12-03 17:17:57,265 - INFO - Developers in version 2.0.0: set()
2024-12-03 17:17:57,265 - INFO - Developers up to version 2.0.0: 0
2024-12-03 17:17:57,265 - INFO - Average time between changes in version 2.0.0: None seconds
2024-12-03 17:17:57,266 - INFO - Average time between changes in all versions: None seconds
2024-12-03 17:17:57,266 - INFO - Average developer expertise in version 2.0.0: nan
2024-12-03 17:17:57,268 - INFO - Minimum developer expertise in version 2.0.0: nan



2024-12-03 17:17:57,269 - INFO - 
2024-12-03 17:17:57,355 - INFO - Commits in version 2.0.1: 0
2024-12-03 17:17:57,356 - INFO - Commits up to version 2.0.1: 2
2024-12-03 17:17:57,356 - INFO - Bug-fix commits in version 2.0.1: 0
2024-12-03 17:17:57,357 - INFO - Developers in version 

KeyboardInterrupt: 