### Install and import necessary libraries

In [1]:
!pip install pydriller
import pandas as pd
from pydriller import Repository

Collecting pydriller
  Downloading PyDriller-2.9-py3-none-any.whl.metadata (1.3 kB)
Collecting lizard (from pydriller)
  Downloading lizard-1.17.31-py2.py3-none-any.whl.metadata (16 kB)
Collecting pathspec (from lizard->pydriller)
  Downloading pathspec-0.12.1-py3-none-any.whl.metadata (21 kB)
Downloading PyDriller-2.9-py3-none-any.whl (36 kB)
Downloading lizard-1.17.31-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pathspec-0.12.1-py3-none-any.whl (31 kB)
Installing collected packages: pathspec, lizard, pydriller
Successfully installed lizard-1.17.31 pathspec-0.12.1 pydriller-2.9


In [29]:
!pip install radon

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting radon
  Downloading radon-6.0.1-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting mando<0.8,>=0.6 (from radon)
  Downloading mando-0.7.1-py2.py3-none-any.whl.metadata (7.4 kB)
Downloading radon-6.0.1-py2.py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mando-0.7.1-py2.py3-none-any.whl (28 kB)
Installing collected packages: mando, radon
Successfully installed mando-0.7.1 radon-6.0.1


In [None]:
import pandas as pd

analysis_df = pd.read_csv('Lab_2_analysis.csv')

# 1. Total number of commits and files
total_commits = analysis_df["Hash"].nunique()
total_files = analysis_df["Filename"].nunique()

# 2. Average number of modified files per commit
files_per_commit = analysis_df.groupby("Hash")["Filename"].nunique()
avg_files_per_commit = files_per_commit.mean()

fix_type_distribution = analysis_df["LLM_inference"].value_counts().reset_index()
fix_type_distribution.columns = ['Fix type','Count']
top_files = analysis_df["Filename"].value_counts().reset_index()
top_files.columns = ['Filename','Count']
analysis_df["Extension"] = analysis_df["Filename"].str.split(".").str[-1]
top_extensions = analysis_df["Extension"].value_counts().reset_index()
top_extensions.columns = ['Extension','Count']

# Display results
print("Total commits:", total_commits)
print("Total files:", total_files)
print("Average modified files per commit:", round(avg_files_per_commit,2))
print("\nFix type distribution:\n", fix_type_distribution)
print("\nMost frequently modified files:\n", top_files.head(10))
print("\nMost frequently modified extensions:\n", top_extensions.head(10))

In [None]:
import pandas as pd
from radon.complexity import cc_visit
from radon.metrics import mi_visit
from radon.raw import analyze


def analyze_code(source_code: str):
    """Extract MI, CC, and LOC from given source code string."""
    try:
        # Maintainability Index
        mi_score = mi_visit(source_code, True)

        # Average Cyclomatic Complexity
        cc_blocks = cc_visit(source_code)
        cc_score =  sum(block.complexity for block in cc_blocks)/len(cc_blocks)

        # Lines of Code
        raw_metrics = analyze(source_code)
        loc = raw_metrics.loc

        return mi_score, cc_score, loc
    except Exception:
        return None, None, None


def process_dataframe(df: pd.DataFrame):
    """Run radon analysis on Source Code (before/current) for each row."""
    results = []

    for idx, row in df.iterrows():
        mi_before, cc_before, loc_before = analyze_code(str(row["Source Code (before)"]))
        mi_after, cc_after, loc_after = analyze_code(str(row["Source Code (current)"]))

        results.append({
            "MI_Change": (mi_after - mi_before) if mi_before is not None and mi_after is not None else None,
            "CC_Change": (cc_after - cc_before) if cc_before is not None and cc_after is not None else None,
            "LOC_Change": (loc_after - loc_before) if loc_before is not None and loc_after is not None else None
        })

    # Merging with original dataframe
    return pd.concat([df, pd.DataFrame(results)], axis=1)


df_processed = process_dataframe(analysis_df)

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = AutoModel.from_pretrained("microsoft/codebert-base",device_map="auto")

def semantic_similarity(code1, code2):
    inputs = tokenizer([code1, code2], return_tensors="pt", padding=True, truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Mean pooling over tokens
    embeddings = outputs.last_hidden_state.mean(dim=1)

    # Cosine similarity
    sim = cosine_similarity([embeddings[0].numpy()], [embeddings[1].numpy()])[0][0]
    return sim

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Using device: cuda


2025-09-05 06:22:41.207153: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757053361.557522      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757053361.654838      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [21]:
from sacrebleu.metrics import BLEU

# Initialize BLEU scorer
bleu = BLEU()

def token_similarity_bleu(before_code, after_code):
    sys = [after_code]
    refs = [[before_code]]
    
    score = bleu.corpus_score(sys, refs)
    return score.score / 100.0      # sacrebleu returns score out of 100, normalize to [0,1]

In [42]:
from tqdm import tqdm

# Wrap apply with tqdm
tqdm.pandas()
results_df = df_processed.dropna(subset=["Source Code (before)", "Source Code (current)"])
results_df["Semantic_Similarity"] = results_df.progress_apply(
    lambda row: semantic_similarity(str(row["Source Code (before)"]), str(row["Source Code (current)"])), axis=1
)

results_df["Token_Similarity"] = results_df.progress_apply(
    lambda row: token_similarity_bleu(str(row["Source Code (before)"]), str(row["Source Code (current)"])), axis=1
)

def classify_semantic(sim):
    return "Minor" if sim >= 0.80 else "Major"

def classify_token(sim):
    return "Minor" if sim >= 0.75 else "Major"

results_df["Semantic_Class"] = results_df["Semantic_Similarity"].apply(classify_semantic)
results_df["Token_Class"] = results_df["Token_Similarity"].apply(classify_token)
results_df["Classes_Agree"] = results_df.apply(
    lambda row: "YES" if row["Semantic_Class"] == row["Token_Class"] else "NO", axis=1
)
results_df.to_csv("Lab3_results.csv", index=False)

100%|██████████| 1136/1136 [01:40<00:00, 11.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["Semantic_Similarity"] = results_df.progress_apply(
100%|██████████| 1136/1136 [00:31<00:00, 35.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df["Token_Similarity"] = results_df.progress_apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a