##### The following code analyzes the viable_patches_json file. The points of analysis are described below. The primary tool for this analysis is pydriller.

1. Total size of the cloned repos
2. Total number of vulnerability inducing commits (vuln commits) found & not found
3. Average number of months between vuln commit and patch commit (or fix)
4. Average number of commits between the vuln commit & patch commit (or fix)
5. Average number of vuln commits fixed by patch commit (or fix)
6. Percentage of vulns where the vuln commit and fix were made by the same person


##### Sources
- @inbook{PyDriller,
    title = "PyDriller: Python Framework for Mining Software Repositories",
    abstract = "Software repositories contain historical and valuable information about the overall development of software systems. Mining software repositories (MSR) is nowadays considered one of the most interesting growing fields within software engineering. MSR focuses on extracting and analyzing data available in software repositories to uncover interesting, useful, and actionable information about the system. Even though MSR plays an important role in software engineering research, few tools have been created and made public to support developers in extracting information from Git repository. In this paper, we present PyDriller, a Python Framework that eases the process of mining Git. We compare our tool against the state-of-the-art Python Framework GitPython, demonstrating that PyDriller can achieve the same results with, on average, 50% less LOC and significantly lower complexity.URL: https://github.com/ishepard/pydrillerMaterials: https://doi.org/10.5281/zenodo.1327363Pre-print: https://doi.org/10.5281/zenodo.1327411",
    author = "Spadini, Davide and Aniche, Maurício and Bacchelli, Alberto",
    year = "2018",
    doi = "10.1145/3236024.3264598",
    booktitle = "The 26th ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering (ESEC/FSE)",
    }

##### Author @Trust-Worthy



##### Reading in the results from the patch_vuln_match.json file and processing objects according to JSONL standard

In [9]:
import pandas as pd
import jsonlines    

json_path:str = "../../production_ready/patch_vuln_match.jsonl"

data: list[object] = []

with jsonlines.open(json_path) as reader:

    data = [entry for entry in reader]

# Convert the list of dictionaries into a pandas DataFrame
patch_vuln_df = pd.DataFrame(data)


# Define a function to extract the file paths and commits
def extract_vuln_files_commits(vuln_commits):
    if vuln_commits:
        files = list(vuln_commits.keys())
        commits = [commit for commits in vuln_commits.values() for commit in commits]
        return pd.Series([files, commits])
    else:
        return pd.Series([[], []])  # Empty lists if no vuln_commits

# # Apply the function to create new columns
# patch_vuln_df[['vuln_files', 'vuln_commits']] = patch_vuln_df['vuln_commits'].apply(extract_vuln_files_commits)



# print(patch_vuln_df.head())


##### This is where the fun begins.... (iykyk)

In [10]:
def extract_file_paths(vuln_commits):
    try:
        if isinstance(vuln_commits, dict):
            return list(vuln_commits.keys())
        return []
    except Exception as e:
        print(f"Error extracting file paths: {e}")
        return []
def extract_commit_hashes(vuln_commits):
    try:
        if isinstance(vuln_commits, dict):
            return list({commit for commits in vuln_commits.values() if isinstance(commits, list) for commit in commits})
        return []
    except Exception as e:
        print(f"Error extracting commit hashes: {e}")
        return []


In [11]:
# Apply functions to create new columns
patch_vuln_df["vuln_files"] = patch_vuln_df["vuln_commits"].apply(extract_file_paths)
patch_vuln_df["vuln_hashes"] = patch_vuln_df["vuln_commits"].apply(extract_commit_hashes)



print(patch_vuln_df.head())
print(patch_vuln_df.tail())

           cve_id                     repo  \
0   CVE-1999-0199             bminor/glibc   
1   CVE-1999-0731         KDE/kde1-kdebase   
2   CVE-2002-2443                krb5/krb5   
3  CVE-2005-10002  wp-plugins/secure-files   
4  CVE-2005-10003      mikexstudios/xcomic   

                               patch_commit  \
0  2864e767053317538feafa815046fff89e5a16be   
1  04906bd5de2f220bf100b605dad37b4a1d9a91a6   
2  cf1a0c411b2668c57c41e9c4efd15ba17b6b322c   
3  cab025e5fc2bcdad8032d833ebc38e6bd2a13c92   
4  6ed8e3cc336e29f09c7e791863d0559939da98bf   

                                        vuln_commits  \
0  {'elf/dl-load.c': ['dc5efe83c0252ad45337ab98ef...   
1                                                 {}   
2  {'src/kadmin/server/schpw.c': ['e88f857c3680ea...   
3  {'secure-files.php': ['b1afc063fd49cfb875e1c6f...   
4                                                 {}   

                                          vuln_files  \
0  [elf/dl-load.c, manual/search.texi, misc/sys

In [None]:
# Drop the original vuln_commits column if not needed
patch_vuln_df.drop(columns=["vuln_commits"], inplace=True)

print(patch_vuln_df.head())
print(patch_vuln_df.tail())

In [13]:
"""
Global variables
"""
NVD_ALL_REPOS = "/shared/rc/sfs/nvd-all-repos"

MATCH_FILES:str = "../../production_ready/patch_vuln_match.jsonl"

### Point 1
SIZE_OF_ALL_CLONED_REPOS: float = 0 ### size in MB

### Point 2
TOTAL_VULNS_COMMITS: int = 0 ### Another way to say this is total patch vuln pairs
TOTAL_PATCH_COMMITS_W_VULN_COMMIT: int = 0


### Point 6
### I can get the the number of patches without vulns / not found by doing total entires - total vulns
BY_SAME_PERSON: int = 0 ### Num of vulns made by the same person
PERCENTAGE_OF_VULN_N_PATCH_BY_SAME_PERSON: float = 0.0


### Point 3
TOTAL_NUM_MONTHS_BETWEEN: int = 0
AVERAGE_NUM_MONTHS_BETWEEN_VULN_N_PATCH: float = 0.0

### Point 4
TOTAL_NUM_COMMITS_BETWEEN: int = 0
AVERAGE_NUM_COMMITS_BETWEEN_VULN_N_PATCH: float = 0.0


In [14]:
import os
import logging
import glob
# Configure logging
logging.basicConfig(
filename="test_jupyter_1.log",
level=logging.WARNING,
format="%(asctime)s - %(levelname)s - %(message)s",
)

# Calculate repo size
def get_directory_size(path: str) -> float:
    size: float = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            size += os.path.getsize(fp)
    logging.info(f"got the size for {path} repo")
    return size

def safe_extract_vuln_files_commits(vuln_commits):
    """Wrapper function for error handling and logging."""
    try:
        return extract_vuln_files_commits(vuln_commits)
    except Exception as e:
        logging.error(f"Error processing vuln_commits: {vuln_commits} - {e}", exc_info=True)
        return pd.Series([[], []])  # Return empty lists in case of failure
def find_repo_path(owner_repo: str) -> str | None:
    """Finds the path of a repository inside NVD_ALL_REPOS.

    Args:
        owner_repo (str): The repository in 'owner/repo' format.

    Returns:
        str | None: The path to the repository if found, otherwise None.
    """
    
    matching_repos:list = glob.glob(os.path.join(NVD_ALL_REPOS, f"*{owner_repo}*"))
    return matching_repos[0]


In [17]:
from typing import Optional
from pydriller import Repository
from datetime import datetime
from dateutil.relativedelta import relativedelta 


def calculate_all_repo_sizes(patch_vuln_df: pd.DataFrame,total_size:float) -> float:
    """

    POINT 1
    
    This function calculates the total size of all the repos cloned for the Vulnerability History Project
    in our NVD all repos database.

    Args:
        patch_vuln_df (pd.DataFrame): _description_
        total_size (float): _description_

    Returns:
        float: _description_
    """
    # Variable used to track repos analyzed for accurate storage metrics
    unique_repo_paths: set[str] = set()

    for repo in patch_vuln_df["repo"]:
        try:
            repo_path = find_repo_path(repo)
            if repo_path is None:
                logging.error(f"Repository path not found for {repo}, skipping this entry.")
                continue
            logging.info(f"Repository path for {repo}: {repo_path}")
        except Exception as e:
            logging.error(f"Error finding repository path for {repo}: {e}")
            continue  # Skip this repo and move to the next
        try:
            # Code for point 1: Tracking repo size
            if repo_path not in unique_repo_paths:
                temp_repo_path = repo_path  
                unique_repo_paths.add(temp_repo_path)
                repo_size: float = get_directory_size(temp_repo_path) / (1024 * 1024)  # Convert to MB
                total_size += repo_size
                logging.info(f"Repo size for {temp_repo_path} added. Total size: {total_size} MB")
        except Exception as e:
            logging.error(f"Error calculating repo size for {repo_path}: {e}")
            continue  # Continue to next commit if size calculation fails

    return total_size

In [26]:
'''
POINT 2
'''
def calculate_total_num_vuln_hashes(patch_vuln_df: pd.DataFrame) -> int:
    return patch_vuln_df["vuln_hashes"].explode().count()

def calculate_patch_vuln_matches(patch_vuln_df: pd.DataFrame) -> int:
    
    # Query for empty vuln_files and vuln_hashes
    empty_patch_vuln_matches_count = patch_vuln_df[
        #patch_vuln_df["vuln_files"].apply(lambda x: len(x) == 0) #& 
        patch_vuln_df["vuln_hashes"].apply(lambda x: len(x) == 0)
    ].shape[0]
    return empty_patch_vuln_matches_count

TOTAL_VULNS_COMMITS = calculate_total_num_vuln_hashes(patch_vuln_df)
TOTAL_PATCH_COMMITS_W_VULN_COMMIT = calculate_patch_vuln_matches(patch_vuln_df)
TOTAL_DF_ENTRIES = patch_vuln_df.shape[0]
print("Total entries in patch vuln dataframe:" + str(TOTAL_DF_ENTRIES))
print("Total vulnerable commits in patch vuln dataframe:" + str(TOTAL_VULNS_COMMITS)) ### really total vuln patches
print("Total patch commits with at least one vulnerable commit:" + str(TOTAL_PATCH_COMMITS_W_VULN_COMMIT))

Total entries in patch vuln dataframe:13383
Total vulnerable commits in patch vuln dataframe:47979
Total patch commits with at least one vulnerable commit:3112


In [None]:
'''
Point 3
'''
def calculate_total_num_months_between_patches_and_vulns(patch_vuln_df: pd.DataFrame) -> int:
    for repo,patch_commit,vuln_hashes in zip(patch_vuln_df["repo"],patch_vuln_df["patch_commit"],patch_vuln_df["vuln_hashes"]):
        if vuln_hashes == []:
            continue
        
        repo_path = find_repo_path(repo)
        commits_to_analyze

In [16]:
try:
                # Handling patch commit (first commit in the list)
                if is_patch:
                    patch_author_date = commit.author_date
                    patch_author = commit.author.email  # Email is typically a string
                    patch_hash = commit.hash

                    TOTAL_PATCH_COMMITS_W_VULN_COMMIT += 1
                    logging.info(f"Patch commit found: {patch_hash}. Total patches with vuln commits: {TOTAL_PATCH_COMMITS_W_VULN_COMMIT}")
                    is_patch = False
                    continue  # Skip the patch commit in the next steps

                # Handling vulnerability commit (following commits after patch)
                vuln_author: Optional[str] = None
                vuln_author_date: Optional[datetime] = commit.author_date
                vuln_hash = commit.hash
                vuln_author = commit.author.email

                # Point 3: Calculate difference between patch and vuln dates in months
                if patch_author_date and vuln_author_date:
                    difference = relativedelta(patch_author_date, vuln_author_date)
                    months_difference = (difference.years or 0) * 12 + (difference.months or 0)
                    TOTAL_NUM_MONTHS_BETWEEN += months_difference
                    logging.info(f"Month difference between patch and vuln: {months_difference} months.")
                else:
                    logging.warning("Missing date values for patch or vuln commit. Skipping date difference calculation.")
                
                # Point 4: Count commits between patch and vuln commit
                try:
                    commit_count: int = get_commits_between(temp_repo_path, vuln_hash, patch_hash)
                    TOTAL_NUM_COMMITS_BETWEEN += commit_count
                    logging.info(f"Commits between vuln and patch: {commit_count}. Total commits between: {TOTAL_NUM_COMMITS_BETWEEN}")
                except Exception as e:
                    logging.error(f"Error counting commits between {vuln_hash} and {patch_hash}: {e}")
                    continue  # Skip if commit counting fails

                # Point 6: Compare patch and vuln author
                if patch_author == vuln_author:
                    BY_SAME_PERSON += 1
                    logging.info(f"Patch and vuln by same author: {patch_author}. Total: {BY_SAME_PERSON}")

                # Point 2: Get total number of vulnerabilities
                TOTAL_VULNS += len(vuln_commits)
                logging.info(f"Total vulnerabilities so far: {TOTAL_VULNS}")

            except Exception as e:
                logging.error(f"Error processing commit {commit.hash} in repo {owner_repo}: {e}")
                continue  # Skip this commit and continue to next one
    

IndentationError: unindent does not match any outer indentation level (<string>, line 46)