# Additionnal Metrics
As per the specification of the personnal project, we'll try and gather additionnal metrics to improve our model. Since this notebook is very compute-intensive, it was executed directly in the terminal after converting this notebook to a python file.
## 1. Lines added & deleted from a given version
We'll begin by creating a dictionnary of versions and corresponding commits from the file `Hive_Last_Commits.csv`previously created

In [34]:
import os
import glob
import re
import csv
import pandas as pd
from pathlib import Path
from collections import defaultdict
import subprocess
from datetime import datetime
from statistics import mean
import logging
from datetime import datetime
from statistics import mean
import pytz
import git
from typing import Iterable, Dict, Tuple
import math

In [4]:
project_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive_Bug_Prediction_ML_Model/")
hive_repo = Path("/home/nicolas-richard/Desktop/.Apache_Hive/")

In [5]:
last_commits = open(os.path.join(project_repo, "Hive_Last_Commits.csv"), "r")

versions = []
commits_by_version = {}

for i, line in enumerate(last_commits.readlines()):
    if i == 0:
        continue

    parts = line.strip().split(",")
    version = parts[0]
    commit = parts[1]

    versions.append(version)
    commits_by_version[version] = [commit]  
last_commits.close()

print(commits_by_version)

{'2.0.0   ': ['7f9f1fcb8697fb33f0edc2c391930a3728d247d7'], '2.0.1   ': ['e3cfeebcefe9a19c5055afdcbb00646908340694'], '2.1.0   ': ['9265bc24d75ac945bde9ce1a0999fddd8f2aae29'], '2.1.1   ': ['1af77bbf8356e86cabbed92cfa8cc2e1470a1d5c'], '2.2.0   ': ['da840b0f8fa99cab9f004810cd22abc207493cae'], '2.3.0   ': ['6f4c35c9e904d226451c465effdc5bfd31d395a0'], '2.3.1   ': ['7590572d9265e15286628013268b2ce785c6aa08'], '2.3.2   ': ['857a9fd8ad725a53bd95c1b2d6612f9b1155f44d'], '2.3.3   ': ['3f7dde31aed44b5440563d3f9d8a8887beccf0be'], '2.3.4   ': ['56acdd2120b9ce6790185c679223b8b5e884aaf2'], '2.3.5   ': ['76595628ae13b95162e77bba365fe4d2c60b3f29'], '2.3.6   ': ['2c2fdd524e8783f6e1f3ef15281cc2d5ed08728f'], '2.3.7   ': ['cb213d88304034393d68cc31a95be24f5aac62b6'], '2.3.8   ': ['f1e87137034e4ecbe39a859d4ef44319800016d7'], '2.3.9   ': ['92dd0159f440ca7863be3232f3a683a510a62b9d'], '2.3.10  ': ['5160d3af392248255f68e41e1e0557eae4d95273'], '3.0.0   ': ['ce61711a5fa54ab34fc74d86d521ecaeea6b072a'], '3.1.0   ': [

In [None]:
UND_hive_updated_directory = project_repo / "UND_hive_updated_data"
output_directory = project_repo / "UND_hive_additional_metrics"
os.makedirs(output_directory, exist_ok=True)

csv_files = sorted([f for f in os.listdir(UND_hive_updated_directory) if f.endswith('.csv')])

for file in csv_files:
    df = pd.read_csv(UND_hive_updated_directory / file)
    df_version = file.split("_")[1]

    print(f"\n=== Processing version: {df_version} ===")

    for another_file in csv_files:
        another_version = another_file.split("_")[1]
    
        df[f"LinesAddedSince{another_version}"] = 0
        print(f"LinesAddedSince{another_version} added to version {df_version}")
        df[f"LinesRemovedSince{another_version}"] = 0
        print(f"LinesRemovedSince{another_version} added to version {df_version}")

        if another_version >= df_version: 
            continue
            
        print(f"  Comparing with earlier version: {another_version}")
        another_df = pd.read_csv(os.path.join(UND_hive_updated_directory, another_file))

        for index, row in df.iterrows():
            file_name = row["FileName"]
            line_count = row["CountLine"]

            matching_rows = another_df[another_df["FileName"] == file_name]

            if not matching_rows.empty:
                another_line_count = matching_rows.iloc[0]["CountLine"]
                print(f"    - {file_name} found in version {another_version}")

                if line_count > another_line_count:
                    added_lines = line_count - another_line_count
                    df.loc[index, f"LinesAddedSince{another_version}"] = added_lines
                    print(f"      Lines added: {added_lines}")
                elif line_count < another_line_count:
                    removed_lines = another_line_count - line_count
                    df.loc[index, f"LinesRemovedSince{another_version}"] = removed_lines
                    print(f"      Lines removed: {removed_lines}")

    output_path = output_directory / f"UND_{df_version}.csv"
    df.to_csv(output_path, index=False)
    print(f"=== Updated file saved as {output_path} ===\n")

## 2. Commits

### 2.1 Commits Affecting the File in a Given version
First, we'll fetch the `CommitsAffectingFileInCurrentVersion`, `CommitsFixingBugInFileInCurrentVersion`, `CommitsAffectingFileInPreviousVersions` variables and save them to our additionnal metrics files.

In [33]:
version_commits_file = "Hive_Last_Commits.csv"

def load_version_commits(version_commits_file):
    """Load version and commit mapping from a CSV file."""
    version_commits = []
    with open(version_commits_file, mode='r') as f:
        reader = csv.reader(f)
        next(reader)  
        for row in reader:
            version_commits.append((row[0].strip(), row[1].strip()))
    return version_commits

def compare_versions(version1, version2):
    """Compare two semantic versions (e.g., 2.1.0 and 2.0.1)."""
    v1 = list(map(int, version1.split('.')))
    v2 = list(map(int, version2.split('.')))
    return v1 <= v2

def collect_metrics(hive_repo, version_commits, target_file_name, df_version):
    """Collect metrics for a target file."""
    repo = git.Repo(hive_repo)
    metrics = defaultdict(dict)
    all_previous_commits = []

    for i, (version, commit_hash) in enumerate(version_commits):
        if not compare_versions(version, df_version):
            continue

        try:
            commits_affecting_file = list(repo.iter_commits(f"{commit_hash}..HEAD", paths=target_file_name))
        except Exception as e:
            print(f"Error fetching commits for {target_file_name}: {e}")
            continue
        
        bug_fix_keywords = ["fix", "bug", "issue", "HIVE-"]
        bug_fix_commits = [
            c for c in commits_affecting_file if any(keyword in c.message.lower() for keyword in bug_fix_keywords)
        ]

        if i > 0:
            try:
                previous_commit_hash = version_commits[i - 1][1]
                previous_commits = list(repo.iter_commits(f"{previous_commit_hash}..{commit_hash}", paths=target_file_name))
                all_previous_commits.extend(previous_commits)
                all_previous_commits = list(set(all_previous_commits))  
            except Exception as e:
                print(f"Error fetching previous commits for {target_file_name}: {e}")
                previous_commits = []

        metrics[version] = {
            "num_commits_in_version": len(commits_affecting_file),
            "num_bug_fix_commits": len(bug_fix_commits),
            "num_commits_in_previous_versions": len(all_previous_commits),
        }
    return metrics

def display_metrics(metrics):
    """Display metrics in a readable format."""
    for version, data in metrics.items():
        print(f"  - Commits affecting file in version: {data['num_commits_in_version']}")
        print(f"  - Bug fix commits in version: {data['num_bug_fix_commits']}")
        print(f"  - Commits in previous versions: {data['num_commits_in_previous_versions']}")
        print()

if __name__ == "__main__":
    version_commits = load_version_commits(version_commits_file)
    files = sorted([
        os.path.join(project_repo, "UND_hive_additional_metrics", f) 
        for f in os.listdir(os.path.join(project_repo, "UND_hive_additional_metrics")) 
    ])

    for file in files:
        df_version = file.split("_")[-1] 
        df = pd.read_csv(file)
        df["CommitsAffectingFileInCurrentVersion"] = 0
        print(f'"CommitsAffectingFileInCurrentVersion" added to version {df_version}')
        df["CommitsFixingBugInFileInCurrentVersion"] = 0
        print(f'"CommitsFixingBugInFileInCurrentVersion" added to version {df_version}')
        df["CommitsAffectingFileInPreviousVersions"] = 0
        print(f'"CommitsAffectingFileInPreviousVersions" added to version {df_version}')


        for index, row in df.iterrows():
            target_file_name = row["FileName"]
            try:
                print(f"\n\nVersion Metrics for {target_file_name} in version <= {df_version}")
                metrics = collect_metrics(hive_repo, version_commits, target_file_name.strip(), df_version)
                display_metrics(metrics)

                df.loc[index, "CommitsAffectingFileInCurrentVersion"] = metrics[df_version]["num_commits_in_version"]
                df.loc[index, "CommitsFixingBugInFileInCurrentVersion"] = metrics[df_version]["num_bug_fix_commits"]
                df.loc[index, "CommitsAffectingFileInPreviousVersions"] = metrics[df_version]["num_commits_in_previous_versions"]
                
            except Exception as e:
                print(f"Error processing {target_file_name}: {e}")

        df.to_csv(file, index=False)
        print(f"=== Updated file saved as {file} ===\n")
    print("\n\n\nCommit version processing successful\n\n\n")

"CommitsAffectingFileInCurrentVersion" added to version 2.0.0
"CommitsFixingBugInFileInCurrentVersion" added to version 2.0.0
"CommitsAffectingFileInPreviousVersions" added to version 2.0.0


Version Metrics for /home/nicolas-richard/Desktop/.Apache_Hive/accumulo-handler/src/java/org/apache/hadoop/hive/accumulo/AccumuloConnectionParameters.java in version <= 2.0.0
  - Commits affecting file in version: 1
  - Bug fix commits in version: 0
  - Commits in previous versions: 0



Version Metrics for /home/nicolas-richard/Desktop/.Apache_Hive/accumulo-handler/src/java/org/apache/hadoop/hive/accumulo/AccumuloHiveConstants.java in version <= 2.0.0
  - Commits affecting file in version: 1
  - Bug fix commits in version: 1
  - Commits in previous versions: 0



Version Metrics for /home/nicolas-richard/Desktop/.Apache_Hive/accumulo-handler/src/java/org/apache/hadoop/hive/accumulo/AccumuloHiveRow.java in version <= 2.0.0
  - Commits affecting file in version: 3
  - Bug fix commits in version: 0


KeyboardInterrupt: 