In [1]:
import pandas as pd

In [None]:
import pandas as pd

# Read the diff-analysis results CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="diff_analysis_results.csv")

def categorize_file(path):
    """Classify a file path into a coarse category label.

    Checks are applied in priority order, case-insensitively: a path
    containing "test" is "Test code"; otherwise a recognised source-code
    extension gives "Source code"; otherwise "readme" / "license" anywhere in
    the path give "README" / "LICENSE"; everything else is "Other".

    Args:
        path: File path as a string. Missing values (None, or the float NaN
            pandas uses for empty cells) and any other non-string value are
            accepted and classified as "Other".

    Returns:
        One of "Test code", "Source code", "README", "LICENSE", "Other".
    """
    # pandas represents missing cells as float NaN rather than None, so an
    # `is None` check alone lets NaN through to .lower() and crashes.
    if not isinstance(path, str):
        return "Other"
    path = path.lower()
    if "test" in path:
        return "Test code"
    source_extensions = (
        ".py", ".java", ".c", ".cpp", ".h", ".hpp",
        ".js", ".ts", ".go", ".rs", ".rb", ".php",
    )
    # str.endswith accepts a tuple of candidate suffixes.
    if path.endswith(source_extensions):
        return "Source code"
    if "readme" in path:
        return "README"
    if "license" in path:
        return "LICENSE"
    return "Other"

# Categorise each row by its path, falling back to old_file_path where
# new_file_path is missing.
df["file_category"] = df["new_file_path"].fillna(df["old_file_path"]).apply(categorize_file)

total_counts = df.groupby("file_category").size()

# Normalise the flag before comparing: an exact == "Yes" test silently treats
# values such as "yes" or "Yes " as non-mismatches. astype(str) keeps the
# .str accessor usable even if the column was not parsed as strings.
is_mismatch = df["Discrepancy"].astype(str).str.strip().str.lower() == "yes"
mismatch_counts = df[is_mismatch].groupby("file_category").size()

# Categories without any mismatch are absent from mismatch_counts; the NaN
# this produces is filled with 0 so both columns can be cast to int.
summary = pd.DataFrame({
    "Total Files": total_counts,
    "Mismatches": mismatch_counts
}).fillna(0).astype(int)

summary["Mismatch %"] = (summary["Mismatches"] / summary["Total Files"] * 100).round(2)

print("\nMismatches by File Category:\n")
print(summary)


Mismatches by File Category:

               Total Files  Mismatches  Mismatch %
file_category                                     
Assets                 116           0         0.0
Build/Script           217           0         0.0
Config                2846           0         0.0
Data                    12           0         0.0
Docs                 12441           0         0.0
LICENSE                 21           0         0.0
Other                  977           0         0.0
README                 945           0         0.0
Source code          11296           0         0.0
Test code             5549           0         0.0


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# --- Load dataset ---
# The path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="diff_analysis_results.csv")

# --- File categorization ---
def categorize_file(path):
    """Classify a file path into a coarse category label.

    Checks are applied in priority order, case-insensitively: a path
    containing "test" is "Test code" (whether or not it has a source
    extension); otherwise a recognised source-code extension gives
    "Source code"; otherwise "readme" / "license" anywhere in the path give
    "README" / "LICENSE"; everything else is "Other".

    The extension list matches the one used by the earlier definition of this
    function in the notebook, so headers (.h/.hpp), Rust and PHP files are
    counted as source code rather than "Other".

    Args:
        path: File path string, or a missing value (None / NaN).

    Returns:
        One of "Test code", "Source code", "README", "LICENSE", "Other".
    """
    if pd.isna(path):
        return "Other"
    path = path.lower()
    # "test" takes precedence over every other rule.
    if "test" in path:
        return "Test code"
    source_extensions = (
        ".py", ".java", ".c", ".cpp", ".h", ".hpp",
        ".js", ".ts", ".go", ".rs", ".rb", ".php",
    )
    # str.endswith accepts a tuple of candidate suffixes.
    if path.endswith(source_extensions):
        return "Source code"
    if "readme" in path:
        return "README"
    if "license" in path:
        return "LICENSE"
    return "Other"

# One resolved path per row: new_file_path, falling back to old_file_path
# where it is missing. Reused below so every summary looks at the same rows.
file_path = df["new_file_path"].fillna(df["old_file_path"]).rename("file_path")

df["file_category"] = file_path.apply(categorize_file)
df["is_mismatch"] = df["Discrepancy"].str.strip().str.lower() == "yes"

# --- Extract file extension ---
# Trailing ".<word characters>" of the path; rows without one become "Other".
df["extension"] = file_path.str.extract(r"(\.\w+)$")[0].fillna("Other")


def _mismatch_summary(frame, key):
    """Return per-`key` row count, mismatch count and mismatch percentage."""
    grouped = frame.groupby(key).agg(
        Total_Files=(key, "size"),
        Mismatches=("is_mismatch", "sum"),
    )
    grouped["Mismatch_%"] = 100 * grouped["Mismatches"] / grouped["Total_Files"]
    return grouped


def _show_bar(series, title, ylabel, color, figsize=(8, 5)):
    """Display `series` as a vertical bar chart with slanted x tick labels."""
    fig, ax = plt.subplots(figsize=figsize)
    series.plot(kind="bar", ax=ax, color=color, edgecolor="black")
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
    fig.tight_layout()
    plt.show()


# --- 1. Summary by file category ---
summary_category = _mismatch_summary(df, "file_category")
print("=== File Category Summary ===\n", summary_category, "\n")

# --- 2. Summary by file extension ---
summary_ext = _mismatch_summary(df, "extension").sort_values("Mismatch_%", ascending=False)
print("=== File Extension Summary ===\n", summary_ext.head(10), "\n")

# --- 3. Top files causing mismatches ---
# Counted on the resolved path so rows whose new_file_path is missing are not
# silently dropped; value_counts() already sorts in descending order.
top_mismatch_files = file_path[df["is_mismatch"]].value_counts().head(10)
print("=== Top 10 Files Causing Mismatches ===\n", top_mismatch_files, "\n")

# --- 4. Commit-level analysis ---
commit_summary = df.groupby("commit_SHA")["is_mismatch"].sum().sort_values(ascending=False)
# Mean of the per-commit mismatch COUNTS. Averaging the per-commit mean of the
# boolean flag would report a mismatch fraction, not a count.
avg_mismatches_per_commit = commit_summary.mean()
print(f"Average mismatches per commit: {avg_mismatches_per_commit:.2f}")
print("Top 10 commits with most mismatches:\n", commit_summary.head(10), "\n")

# --- 5. Repository-level analysis ---
repo_summary = _mismatch_summary(df, "repository_name")
print("=== Repository Summary ===\n", repo_summary.sort_values("Mismatch_%", ascending=False), "\n")

# --- 6. Plots ---
# The seaborn style sheets were renamed to "seaborn-v0_8-*" in Matplotlib 3.6
# and the old names later removed; use whichever name this install provides
# and keep the default style if neither exists.
for _style in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if _style in plt.style.available:
        plt.style.use(_style)
        break

# Total files per category
_show_bar(summary_category["Total_Files"], "Total Files by Category", "Number of Files", "skyblue")

# Mismatches per category
_show_bar(summary_category["Mismatches"], "Mismatches by File Category", "Number of Mismatches", "salmon")

# Mismatch percentage per category
_show_bar(summary_category["Mismatch_%"], "Mismatch Percentage by File Category", "Mismatch Percentage (%)", "orange")

# Top 10 file extensions by mismatch %
_show_bar(summary_ext.head(10)["Mismatch_%"], "Top 10 File Extensions by Mismatch %", "Mismatch Percentage (%)", "purple")

# Repository mismatch %
_show_bar(
    repo_summary.sort_values("Mismatch_%", ascending=False)["Mismatch_%"],
    "Mismatch Percentage by Repository",
    "Mismatch Percentage (%)",
    "green",
    figsize=(10, 5),
)

# Distribution of mismatches per commit
fig, ax = plt.subplots(figsize=(8, 5))
commit_summary.hist(bins=50, ax=ax, color="teal", edgecolor="black")
ax.set_title("Distribution of Mismatches per Commit")
ax.set_xlabel("Number of Mismatches in Commit")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

# Top 10 commits by mismatches
fig, ax = plt.subplots(figsize=(8, 5))
commit_summary.head(10).plot(kind="barh", ax=ax, color="red", edgecolor="black")
ax.set_title("Top 10 Commits by Number of Mismatches")
ax.set_xlabel("Number of Mismatches")
ax.set_ylabel("Commit SHA")
ax.invert_yaxis()  # largest count at the top
fig.tight_layout()
plt.show()

In [None]:
import os
import matplotlib.pyplot as plt

# Folder to save plots. Set the LAB4_REPORT_DIR environment variable to write
# somewhere else (e.g. when running on another machine); the fallback is the
# location this notebook originally wrote to.
save_folder = os.environ.get(
    "LAB4_REPORT_DIR",
    "/Users/tejasmacipad/Desktop/Third_year/STT/lab4report",
)
os.makedirs(save_folder, exist_ok=True)  # create folder if it doesn't exist


def _save_figure(draw, filename, title, xlabel=None, ylabel=None,
                 figsize=(8, 5), rotate_xticks=True, invert_y=False):
    """Draw one chart on a fresh figure and write it to `save_folder` as PNG.

    Args:
        draw: Callable taking a matplotlib Axes and plotting onto it.
        filename: Output file name, joined onto `save_folder`.
        title: Axes title.
        xlabel: X-axis label; None leaves whatever the plot call set.
        ylabel: Y-axis label; None leaves whatever the plot call set.
        figsize: Figure size in inches.
        rotate_xticks: Slant the x tick labels by 30 degrees, right-aligned.
        invert_y: Flip the y axis so the first entry is drawn at the top.
    """
    fig, ax = plt.subplots(figsize=figsize)
    draw(ax)
    ax.set_title(title)
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if rotate_xticks:
        plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
    if invert_y:
        ax.invert_yaxis()
    fig.tight_layout()
    fig.savefig(os.path.join(save_folder, filename), dpi=300)
    # Close explicitly so figures do not accumulate in memory.
    plt.close(fig)


# --- 1. Total files per category ---
_save_figure(
    lambda ax: summary_category["Total_Files"].plot(kind="bar", ax=ax, color="skyblue", edgecolor="black"),
    "total_files_by_category.png",
    title="Total Files by Category",
    ylabel="Number of Files",
)

# --- 2. Mismatches per category ---
_save_figure(
    lambda ax: summary_category["Mismatches"].plot(kind="bar", ax=ax, color="salmon", edgecolor="black"),
    "mismatches_by_category.png",
    title="Mismatches by File Category",
    ylabel="Number of Mismatches",
)

# --- 3. Mismatch percentage per category ---
_save_figure(
    lambda ax: summary_category["Mismatch_%"].plot(kind="bar", ax=ax, color="orange", edgecolor="black"),
    "mismatch_percentage_by_category.png",
    title="Mismatch Percentage by File Category",
    ylabel="Mismatch Percentage (%)",
)

# --- 4. Top 10 file extensions by mismatch % ---
_save_figure(
    lambda ax: summary_ext.head(10)["Mismatch_%"].plot(kind="bar", ax=ax, color="purple", edgecolor="black"),
    "top_extensions_mismatch.png",
    title="Top 10 File Extensions by Mismatch %",
    ylabel="Mismatch Percentage (%)",
)

# --- 5. Repository mismatch % ---
_save_figure(
    lambda ax: repo_summary.sort_values("Mismatch_%", ascending=False)["Mismatch_%"].plot(
        kind="bar", ax=ax, color="green", edgecolor="black"
    ),
    "repo_mismatch_percentage.png",
    title="Mismatch Percentage by Repository",
    ylabel="Mismatch Percentage (%)",
    figsize=(10, 5),
)

# --- 6. Distribution of mismatches per commit ---
_save_figure(
    lambda ax: commit_summary.hist(bins=50, ax=ax, color="teal", edgecolor="black"),
    "distribution_mismatches_commit.png",
    title="Distribution of Mismatches per Commit",
    xlabel="Number of Mismatches in Commit",
    ylabel="Frequency",
    rotate_xticks=False,
)

# --- 7. Top 10 commits by mismatches ---
_save_figure(
    lambda ax: commit_summary.head(10).plot(kind="barh", ax=ax, color="red", edgecolor="black"),
    "top_commits_mismatches.png",
    title="Top 10 Commits by Number of Mismatches",
    xlabel="Number of Mismatches",
    ylabel="Commit SHA",
    rotate_xticks=False,
    invert_y=True,
)