# Explore Temporal Dependencies

In [None]:


# Build one record per commit: rows sharing a 'child_sha' are collapsed into
# the list of distinct 'new_file' values touched by that commit.
commits = [
    {"commit_id": sha, "files": list(rows["new_file"].unique())}
    for sha, rows in df.groupby("child_sha")
]

def jaccard_similarity(files_t, files_t1):
    """Return the Jaccard index (|A & B| / |A | B|) of two file collections.

    Both arguments are converted to sets, so duplicates are ignored.
    When both collections are empty the similarity is defined as 1.
    """
    earlier, later = set(files_t), set(files_t1)
    union = earlier | later
    if not union:
        # The union is empty only when both inputs are empty: treat as identical.
        return 1
    return len(earlier & later) / len(union)

# Jaccard similarity of every commit with its immediate successor in `commits`
similarities = [
    jaccard_similarity(earlier["files"], later["files"])
    for earlier, later in zip(commits, commits[1:])
]

print("Average Jaccard Similarity:", np.mean(similarities))

In [None]:
import pandas as pd
# import numpy as np
# import itertools
# import random
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score


# Preprocess data
def preprocess_data(df):
    """Collapse per-file rows into one row per commit, ordered by the "when" column.

    Rows whose "old_file" differs from "new_file" (renames) are dropped first.
    The result has the columns "child_sha", "old_file" (list of the commit's
    files, in row order) and "when" (the first value seen for the commit).
    """
    # Keep only rows where the path did not change, i.e. ignore renames.
    same_path = df["old_file"] == df["new_file"]
    unrenamed = df.loc[same_path]

    # One row per commit: gather its files and take its first "when" value.
    aggregations = {"old_file": list, "when": "first"}
    per_commit = unrenamed.groupby("child_sha").agg(aggregations).reset_index()

    return per_commit.sort_values(by="when")

# Train-test split
def train_test_split(commits, test_size=0.3):
    """Split `commits` by position into a leading train part and a trailing test part.

    The first int(len(commits) * (1 - test_size)) rows form the training set and
    the remaining rows form the test set. No shuffling is done, so the row
    order of `commits` decides which rows land in which part.
    """
    train_fraction = 1 - test_size
    cutoff = int(len(commits) * train_fraction)
    return commits.iloc[:cutoff], commits.iloc[cutoff:]

# Baseline 1: Co-occurrence-based model
class CoOccurrenceModel:
    """Baseline recommender that ranks files by how often they were committed together."""

    def __init__(self):
        # co_occurrence[a][b] is incremented once for every pair of positions
        # holding a and b in a training commit's file list.
        self.co_occurrence = defaultdict(lambda: defaultdict(int))

    def train(self, train_commits):
        """Accumulate symmetric pair counts from train_commits["old_file"].

        Each element of that column is expected to be a list of file names
        belonging to one commit.
        """
        table = self.co_occurrence
        for files in train_commits["old_file"]:
            for pos, first in enumerate(files):
                for second in files[pos + 1:]:
                    table[first][second] += 1
                    table[second][first] += 1

    def predict(self, input_files, top_k=5):
        """Return up to `top_k` files most often seen alongside `input_files`.

        A candidate's score is the sum of its co-occurrence counts with every
        input file. Input files themselves are never returned, and files
        unseen during training contribute nothing.
        """
        scores = defaultdict(int)
        for known in input_files:
            # .get avoids creating an empty entry for files unseen in training.
            for neighbor, count in self.co_occurrence.get(known, {}).items():
                scores[neighbor] += count

        # The inputs must not be recommended back to the caller.
        for known in input_files:
            scores.pop(known, None)

        ranked = sorted(scores, key=scores.get, reverse=True)
        return ranked[:top_k]
    
def evaluate_model(model, test_commits, top_k=10, candidate_files=None):
    """Evaluate a model by predicting the rest of a commit from a single file.

    For every test commit with at least two files, each file in turn is used
    as the only input and the commit's remaining files are the ground truth.
    The model is asked for as many predictions as there are ground-truth
    files. Ground truth and predictions are encoded as 0/1 labels over the
    whole set of candidate files, concatenated across all queries, and scored.

    Args:
        model: Object exposing ``predict(input_files, top_k=...)`` that
            returns an iterable of file names.
        test_commits: Table whose "old_file" column holds, per commit, the
            list of files in that commit.
        top_k: Currently unused; kept so existing callers keep working. The
            number of predictions requested is always the number of
            held-out files.
        candidate_files: Optional iterable of all file names to label. When
            omitted, the module-level ``all_files`` is used, as before.

    Returns:
        Tuple ``(precision, recall)`` computed by ``precision_score`` and
        ``recall_score`` over the flattened labels.
    """
    # Fall back to the notebook-level global so existing calls are unaffected.
    universe = list(all_files if candidate_files is None else candidate_files)

    y_true = []
    y_pred = []
    for files in test_commits["old_file"]:
        if len(files) < 2:
            # A single-file commit leaves nothing to predict.
            continue
        for i, query_file in enumerate(files):
            # Sets make the per-candidate membership checks below O(1).
            true_files = set(files[:i] + files[i + 1:])
            pred_files = set(model.predict([query_file], top_k=len(files) - 1))
            # Extend the flat label lists directly instead of flattening later.
            y_true.extend(1 if f in true_files else 0 for f in universe)
            y_pred.extend(1 if f in pred_files else 0 for f in universe)

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    return precision, recall

In [4]:
df = pd.read_csv("../ffmpeg-master-none.csv")

# Drop rare files: keep only rows whose 'new_file' occurs in at least 3 rows
file_counts = df["new_file"].value_counts()
filtered_files = file_counts.loc[file_counts >= 3].index

df_filtered = df.loc[df["new_file"].isin(filtered_files)].copy()
print(f"Filtered dataset has {len(df_filtered)} rows.")
df = df_filtered

# Universe of file names used as the label space during evaluation
all_files = df["new_file"].unique()

# Split the per-commit table into train and test parts
commits = preprocess_data(df)
train_commits, test_commits = train_test_split(commits)

# Fit the co-occurrence baseline on the train part and score it on the test part
co_occurrence_model = CoOccurrenceModel()
co_occurrence_model.train(train_commits)
co_precision, co_recall = evaluate_model(co_occurrence_model, test_commits)
print(f"Co-occurrence Model - Precision: {co_precision:.4f}, Recall: {co_recall:.4f}")

Filtered dataset has 7179 rows.
1931310 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0