In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mamiksik/CommitPredictorT5")
model = AutoModelForSeq2SeqLM.from_pretrained("mamiksik/CommitPredictorT5")

In [1]:
# Keywords used to identify bug-fix commits
keywords = ["fixed", "bug", "fixes", "fix", "crash", "solves", "resolves", "issue", "regression", "fall back", "assertion", "coverity", "reproducible", "stack-wanted", "steps-wanted", "testcase", "failur", "fail" "npe", "except", "broken", "differential testing", "error", "hang", "test fix", "steps to reproduce", "crash", "assertion", "failure", "leak", "stack trace", "heap overflow", "freez", "problem", "overflow", "avoid", "workaround", "break", "stop"]
len(keywords)

38

In [None]:
from pydriller import Repository
import pandas as pd
bug_fix_df = []
diff_analysis_df = []

Repo_path = "/kaggle/working/unsloth"

# Traversing commits using PyDriller
for commit in Repository(Repo_path).traverse_commits():
    for keyword in keywords:
        if keyword in commit.msg:
            bug_fix_df.append({
                'Hash': commit.hash,
                'Author': commit.author.name,
                'Message': commit.msg,
                'Hashes of parents': commit.parents,
                'Is a merge commit?': len(commit.parents) > 1,
                'List of modified files': [mod.filename for mod in commit.modified_files],
            })

            if commit.modified_files:  # normal case
                for mod in commit.modified_files:
                    diff_analysis_df.append({
                        'Hash': commit.hash,
                        'Author': commit.author.name,
                        'Message': commit.msg,
                        'Filename': mod.filename,
                        'Change Type': mod.change_type.name,
                        'Source Code (before)': mod.source_code_before,
                        'Source Code (current)': mod.source_code,
                        'Diff': mod.diff
                    })
            break
# Create DataFrames
bug_fix_df = pd.DataFrame(bug_fix_df)
diff_analysis_df = pd.DataFrame(diff_analysis_df)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = AutoModel.from_pretrained("microsoft/codebert-base",device_map="auto")
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [None]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = AutoModel.from_pretrained("microsoft/codebert-base",device_map="auto")
def get_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean pooling over token embeddings
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
def evaluate_candidates_with_threshold(df, threshold):
    hit_counts = {"Message": 0, "LLM_inference": 0, "Rectified Message": 0}
    total = len(df)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating commits"):
        diff_embedding = get_embedding(row["Overall_diff"], tokenizer, model)
        
        for col in ["Message","LLM_inference","Rectified Message"]:
            msg_embedding = get_embedding(str(row[col]), tokenizer, model)
            score = cosine_similarity([diff_embedding], [msg_embedding])[0][0]

            # Similarity score
            df.at[idx, f"{col}_score"] = score
            # Boolean precision flag
            df.at[idx, f"{col}_precise"] = score >= threshold

            if score >= threshold:
                hit_counts[col] += 1

    hit_rates = {k: v/total for k,v in hit_counts.items()}

    return df, hit_rates

df, hit_rates = evaluate_candidates_with_threshold(final_df, threshold=0.85)
print("Hit rates:", hit_rates)
print(df.head())

In [1]:
import pandas as pd
pd.read_csv('assgn2_analysis.csv')

Unnamed: 0.1,Unnamed: 0,Hash,Author,Message,Filename,Change Type,Source Code (before),Source Code (current),Diff,LLM_inference
0,0,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",README.md,MODIFY,"<div class=""align-center"">\n <img src=""./imag...","<div class=""align-center"">\n <img src=""./imag...","@@ -1,23 +1,25 @@\n <div class=""align-center"">...",add link to nvidia gpu
1,1,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",Discord.png,MODIFY,PNG\r\n\n,PNG\r\n\n,Binary files a/images/Discord.png and b/images...,distro binary files
2,2,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",LAION 2GPU.png,ADD,,PNG\r\n\n,Binary files /dev/null and b/images/LAION 2GPU...,distro image
3,3,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",LAION 2GPU.svg,DELETE,"<?xml version=""1.0"" encoding=""utf-8"" standalon...",,"@@ -1,1518 +0,0 @@\n-<?xml version=""1.0"" encod...",add missing missing nodes in skeleton skeleton
4,4,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",SlimOrca 1GPU.svg,DELETE,"<?xml version=""1.0"" encoding=""utf-8"" standalon...",,"@@ -1,1424 +0,0 @@\n-<?xml version=""1.0"" encod...",add missing nodes in skeleton skeleton
...,...,...,...,...,...,...,...,...,...,...
1222,1222,26601f9d42b4c416efa59a062665c858b94c8673,Daniel Han,Bug fixes (#3195)\n\n* Fix mamba\n\n* Update l...,pyproject.toml,MODIFY,"[build-system]\nrequires = [""setuptools"", ""set...","[build-system]\nrequires = [""setuptools"", ""set...","@@ -37,7 +37,7 @@ triton = [\n ]\n \n huggingf...",add more huggingfaces and colab versions
1223,1223,26601f9d42b4c416efa59a062665c858b94c8673,Daniel Han,Bug fixes (#3195)\n\n* Fix mamba\n\n* Update l...,__init__.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -17,6 +17,10 @@ from packaging.version impo...",add missing fix for unsloth
1224,1224,26601f9d42b4c416efa59a062665c858b94c8673,Daniel Han,Bug fixes (#3195)\n\n* Fix mamba\n\n* Update l...,import_fixes.py,ADD,,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -0,0 +1,119 @@\n+# Copyright 2023-present D...",add fixup for google.protobuf.message_factory
1225,1225,26601f9d42b4c416efa59a062665c858b94c8673,Daniel Han,Bug fixes (#3195)\n\n* Fix mamba\n\n* Update l...,_utils.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -12,7 +12,7 @@\n # See the License for the ...",add missing version


In [6]:
df

Unnamed: 0.1,Unnamed: 0,Hash,Author,Message,Filename,Change Type,Source Code (before),Source Code (current),Diff
0,0,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",README.md,MODIFY,"<div class=""align-center"">\n <img src=""./imag...","<div class=""align-center"">\n <img src=""./imag...","@@ -1,23 +1,25 @@\n <div class=""align-center"">..."
1,1,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",Discord.png,MODIFY,PNG\r\n\n,PNG\r\n\n,Binary files a/images/Discord.png and b/images...
2,2,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",LAION 2GPU.png,ADD,,PNG\r\n\n,Binary files /dev/null and b/images/LAION 2GPU...
3,3,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",LAION 2GPU.svg,DELETE,"<?xml version=""1.0"" encoding=""utf-8"" standalon...",,"@@ -1,1518 +0,0 @@\n-<?xml version=""1.0"" encod..."
4,4,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",SlimOrca 1GPU.svg,DELETE,"<?xml version=""1.0"" encoding=""utf-8"" standalon...",,"@@ -1,1424 +0,0 @@\n-<?xml version=""1.0"" encod..."
...,...,...,...,...,...,...,...,...,...
4537,4537,f1508c9259f91e33f5c7fdf95d971a309196471c,Daniel Han,GPT OSS fixes,loader.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -591,20 +591,12 @@ class FastModel(FastBase..."
4538,4538,f1508c9259f91e33f5c7fdf95d971a309196471c,Daniel Han,GPT OSS fixes,vision.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -365,8 +365,10 @@ class FastBaseModel:\n ..."
4539,4539,f1508c9259f91e33f5c7fdf95d971a309196471c,Daniel Han,GPT OSS fixes,_utils.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -76,6 +76,7 @@ platform_system = platform_s..."
4540,4540,f1508c9259f91e33f5c7fdf95d971a309196471c,Daniel Han,GPT OSS fixes,loader.py,MODIFY,# Copyright 2023-present Daniel Han-Chen & the...,# Copyright 2023-present Daniel Han-Chen & the...,"@@ -591,20 +591,12 @@ class FastModel(FastBase..."


In [4]:
import pandas as pd
df = pd.read_csv('files_fixed.csv')
df['Author'].unique()

array(['Daniel Han', 'Daniel Han-Chen', 'Michael Han', 'Z', 'XiaoYang',
       'moontidef', 'Nazim Ali', 'vo1d-ai', 'Edd', 'Datta Nimmaturi',
       'Uday Girish Maradana', 'Zewen Shen', 'Kareem', 'tastelikefeet',
       'Gennadii Manzhos', 'Nino Risteski', 'Jyotin Goel', 'Igor Kilbas',
       'Charles London', 'Mohamed Mekkouri', 'Mukkesh Ganesh',
       'Isaac Breen', 'lurf21', 'naliazheli', 'jeromeku', 'Etherll',
       'Richi', 'Erland366', 'Dattu Sharma', 'Mathew Mathew',
       'Roland Tannous', 'feng lui', 'David Dobolyi', 'Emmanuel Ferdman',
       'DoubleMathew', 'RunFMe', 'Salpingopharyngeus', 'pluesclues',
       'Lei Zhenyuan', 'Dhia Eddine Rhaiem', 'billishyahao',
       'Muzammil Khan', 'Sekinal'], dtype=object)

### Number of files, Number of commits

In [10]:
print("Number of commits:",df['Hash'].nunique())
print("Number of files:", df['Filename'].nunique())

Number of commits: 309
Number of files: 113


In [9]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32103, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32103, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
import pandas as pd
pd.read_csv('')

In [10]:
def predict(input_text):
    # Tokenize and generate 
    inputs = tokenizer(input_text, return_tensors="pt")
    outputs = model.generate(**inputs)

    # Decode generated tokens
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

In [33]:
import torch
def predict_batch(input_texts, batch_size=1000):
    predictions = []
    model.eval()

    with torch.no_grad():
        for i in range(0, len(input_texts), batch_size):
            batch = input_texts[i:i+batch_size]
            inputs = tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True
            )
            # Move to same device as model
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            # Generate predictions
            outputs = model.generate(**inputs)

            # Decode each prediction
            batch_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            predictions.extend(batch_preds)

    return predictions

In [None]:
import pandas as pd
from pydriller import Repository
# Keywords used to identify bug-fix commits
keywords = ["fixed", "bug", "fixes", "fix", "crash", "solves", "resolves", "issue", "regression", "fall back", 
            "assertion", "coverity", "reproducible", "stack-wanted", "steps-wanted", "testcase", "failur", 
            "fail" "npe", "except", "broken", "differential testing", "error", "hang", "test fix", 
            "steps to reproduce", "crash", "assertion", "failure", "leak", "stack trace", "heap overflow", 
            "freez", "problem", "overflow", "avoid", "workaround", "break", "stop"]

rows = []
rows2 = []
for commit in Repository("https://github.com/unslothai/unsloth").traverse_commits():
    for keyword in keywords:
        if keyword in commit.msg:
            rows.append({
                'Hash': commit.hash,
                'Author' : commit.author.name,
                'Message': commit.msg,
                'Hashes of parents': commit.parents,
                'Is a merge commit?': len(commit.parents) > 1,
                'List of modified files': [mod.filename for mod in commit.modified_files],
            })
            for mod in commit.modified_files:
                rows2.append({
                    'Hash': commit.hash,
                    'Author' : commit.author.name,
                    'Message': commit.msg,
                    'Filename' : mod.filename,
                    'Change Type': mod.change_type.name,
                    'Source Code (before)' : mod.source_code_before,
                    'Source Code (current)' : mod.source_code,
                    'Diff' : mod.diff
                })

# Create DataFrame
df = pd.DataFrame(rows)
df2 = pd.DataFrame(rows2)

In [21]:
df2.to_csv("files_fixed.csv")

In [29]:
diffs = df2['Diff'].to_list()
diffs

['@@ -1,23 +1,25 @@\n <div class="align-center">\n   <img src="./images/unsloth new logo.png" width="400" />\n   <a href="https://discord.gg/u54VK8m8tk"><img src="./images/Discord.png" width="180"></a>\n+  <a href="https://colab.research.google.com/drive/1oW55fBmwzCOrBVX66RcpptL3a99qWBxb?usp=sharing"><img src="./images/try live demo green.png" width="130"></a>\n </div>\n \n-\n-## 80% faster 50% less memory local QLoRA finetuning\n+## 2-5x faster 50% less memory local LLM finetuning\n * Manual autograd engine - hand derived backprop steps.\n-* QLoRA / LoRA 80% faster, 50% less memory.\n-* All kernels written in OpenAI\'s Triton language.\n+* 2x to 5x faster than QLoRA. 50% less memory usage.\n+* All kernels written in [OpenAI\'s Triton](https://openai.com/research/triton) language.\n * 0% loss in accuracy - no approximation methods - all exact.\n-* No change of hardware necessary. Supports NVIDIA GPUs since 2018+. CUDA 7.5+. Tesla T4, RTX 20, 30, 40 series, A100, H100s\n-* Flash Attenti

In [None]:
['add more info about nvidia gpu',
 'distro binary files',
 'test the image',
 'add missing missing nodes in skeleton skeleton',
 'add missing missing nodes in skeleton skeleton',
 'test binary file',
 'add missing colab',
 'add missing deprecation warning']

In [None]:
predict_batch(diffs)

In [55]:
df.to_csv("commit_info.csv")

In [7]:
model

NameError: name 'model' is not defined

In [2]:
import pandas as pd
df = pd.read_csv("commit_info.csv")

In [5]:
df

Unnamed: 0.1,Unnamed: 0,Hash,Author,Message,Hashes of parents,Is a merge commit?,List of modified files
0,0,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",['3aa16bb452ab82d7a2b2987ec3bfb47c6812582c'],False,[<pydriller.domain.commit.ModifiedFile object ...
1,1,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",['3aa16bb452ab82d7a2b2987ec3bfb47c6812582c'],False,[<pydriller.domain.commit.ModifiedFile object ...
2,2,4b97a810b509c93f44be4c037c7aa18fb8922884,Daniel Han,"Pre-release 2023 December version (Mistral, Pr...",['3aa16bb452ab82d7a2b2987ec3bfb47c6812582c'],False,[<pydriller.domain.commit.ModifiedFile object ...
3,3,2d5d88487463e76f75002be3b2704267ec96e68a,Daniel Han-Chen,tokenizer pad fix,['28f3b971d21e469fb985f384db10a03982c4ce12'],False,[<pydriller.domain.commit.ModifiedFile object ...
4,4,f380cc1170447800c112dc8568bdff3dd34c79a3,Daniel Han-Chen,Fix Mistral\n\nBlockDiagonalCausalMask fix cou...,['399f8ed56f40df0919208d1ffdee64a31a1b22c8'],False,[<pydriller.domain.commit.ModifiedFile object ...
...,...,...,...,...,...,...,...
1014,1014,17688b54ea608946853cfb8d10e63e3f8ae7a839,Daniel Han,Nightly (#3102)\n\n* Update synthetic.py\n\n* ...,['5b14b8fbd2e8aaa9eb560f23c42de10db45b45b3'],False,[<pydriller.domain.commit.ModifiedFile object ...
1015,1015,17688b54ea608946853cfb8d10e63e3f8ae7a839,Daniel Han,Nightly (#3102)\n\n* Update synthetic.py\n\n* ...,['5b14b8fbd2e8aaa9eb560f23c42de10db45b45b3'],False,[<pydriller.domain.commit.ModifiedFile object ...
1016,1016,17688b54ea608946853cfb8d10e63e3f8ae7a839,Daniel Han,Nightly (#3102)\n\n* Update synthetic.py\n\n* ...,['5b14b8fbd2e8aaa9eb560f23c42de10db45b45b3'],False,[<pydriller.domain.commit.ModifiedFile object ...
1017,1017,f1508c9259f91e33f5c7fdf95d971a309196471c,Daniel Han,GPT OSS fixes,['76be074f266e0710c3b8b293859a53dd09b683dc'],False,[<pydriller.domain.commit.ModifiedFile object ...


In [17]:
rows = []
for i in range(len(df)):
    commit_hash = df.iloc[i]['Hash']
    commit_msg = df.iloc[i]['Message']
    author_name = df.iloc[i]['Author']
    for mod in df['List of modified files'].iloc[i]:
        rows.append({
            'Hash': commit_hash,
            'Message': commit_msg,
            "Author" : author_name,
            'Filename' : mod.filename,
            'Change Type': mod.change_type.name,
            'Source Code (before)' : mod.source_code_before,
            'Source Code (current)' : mod.source_code,
            'Diff' : mod.diff
        })

OSError: [Errno 22] Invalid argument

In [60]:
!git config --global --add safe.directory "C:\Users\Vedant\AppData\Local\Temp\tmpnq9rzntx\unsloth"

In [49]:
diff_df = pd.DataFrame(rows)
diff_df

Unnamed: 0,Hash,Message,Author,Filename,Change Type,Source Code (before),Source Code (current),Diff
0,644362e5c630f3c7710ec678705980bc3277d01d,Adding Assignment Questions,Ayush Shrivastava,README.md,ADD,,# Assignment 2 \n\n**Total marks: 10 (This ass...,"@@ -0,0 +1,95 @@\n+# Assignment 2 \n+\n+**Tota..."
1,644362e5c630f3c7710ec678705980bc3277d01d,Adding Assignment Questions,Ayush Shrivastava,1colour.jpg,ADD,,�JFIF��`�`���C�\...,Binary files /dev/null and b/sample_images/1co...
2,644362e5c630f3c7710ec678705980bc3277d01d,Adding Assignment Questions,Ayush Shrivastava,2-3_colours.jpg,ADD,,�JFIF��`�`���C�\...,Binary files /dev/null and b/sample_images/2-3...
3,644362e5c630f3c7710ec678705980bc3277d01d,Adding Assignment Questions,Ayush Shrivastava,multiple_colours.jpg,ADD,,�JFIF��`�`���C�\...,Binary files /dev/null and b/sample_images/mul...
4,457dc952c512f1960152a571c7a5c2837084a258,Simplifying the asignmnet,Ayush Shrivastava,README.md,MODIFY,# Assignment 2 \n\n**Total marks: 10 (This ass...,# Assignment 2 \n\n**Total marks: 10 (This ass...,"@@ -16,63 +16,61 @@ eps = np.random.randn(num_..."
...,...,...,...,...,...,...,...,...
109,24069667cf494dffff47c0fe730a60d3264c2207,Finalized task 1 Resolved all mistakes,haarit19058,Task1.ipynb,MODIFY,"{\n ""cells"": [\n {\n ""cell_type"": ""code"",\n...","{""cells"":[{""cell_type"":""code"",""execution_count...","@@ -1,1053 +1 @@\n-{\n- ""cells"": [\n- {\n- ..."
110,2a22e90cf1cd4b2a668c6aa36e841f5a5ad3161f,Added md for momentum,AnuragSingh0000,Task1.ipynb,MODIFY,"{""cells"":[{""cell_type"":""code"",""execution_count...","{""cells"":[{""cell_type"":""code"",""execution_count...","@@ -1 +1 @@\n-{""cells"":[{""cell_type"":""code"",""e..."
111,1d0f7855a94bce5fa5543d0d1924f5dc2e99da34,Finalizing Task 2_Q2 & Task 4_Q1,Vedant Acharya,Task2_Q2.ipynb,MODIFY,"{""metadata"":{""kernelspec"":{""language"":""python""...","{""metadata"":{""kernelspec"":{""language"":""python""...","@@ -1 +1 @@\n-{""metadata"":{""kernelspec"":{""lang..."
112,1d0f7855a94bce5fa5543d0d1924f5dc2e99da34,Finalizing Task 2_Q2 & Task 4_Q1,Vedant Acharya,Task4_Q1.ipynb,MODIFY,"{\n ""cells"": [\n {\n ""cell_type"": ""co...","{\n ""cells"": [\n {\n ""cell_type"": ""co...","@@ -210,6 +210,13 @@\n "" return rec..."


In [39]:
df['List of modified files'].iloc[0]

[<pydriller.domain.commit.ModifiedFile at 0x2366a2288d0>,
 <pydriller.domain.commit.ModifiedFile at 0x2366a58b6d0>,
 <pydriller.domain.commit.ModifiedFile at 0x2366e2d0690>,
 <pydriller.domain.commit.ModifiedFile at 0x2366e2d0a50>]

In [36]:
df

Unnamed: 0,Hash,Author,Message,Hashes of parents,Is a merge commit?,List of modified files
0,644362e5c630f3c7710ec678705980bc3277d01d,Ayush Shrivastava,Adding Assignment Questions,[],False,"[README.md, sample_images\1colour.jpg, sample_..."
1,457dc952c512f1960152a571c7a5c2837084a258,Ayush Shrivastava,Simplifying the asignmnet,[644362e5c630f3c7710ec678705980bc3277d01d],False,[README.md]
2,38bead3ce526071586bafc87c2e282eb4f095ecf,haarit19058,gd sgd added,[457dc952c512f1960152a571c7a5c2837084a258],False,[Task1.ipynb]
3,f141d4d8184df386d4dc3d76f12406ef83973496,Aditya Borate,Structured the directory,[38bead3ce526071586bafc87c2e282eb4f095ecf],False,"[Task 1\Task1.ipynb, Task 2\assets\baboon.png]"
4,fd9b75816847896c031b3793327b777aff2642af,Aditya Borate,Completed Task2_Q1,[f141d4d8184df386d4dc3d76f12406ef83973496],False,"[Task 2\Task2_Q1.ipynb, Task 2\assets\baboon.p..."
5,a9c48c0361ec0f26c86146a6403e33c1764d08de,Aditya Borate,Corrected image loading,[fd9b75816847896c031b3793327b777aff2642af],False,[Task 2\Task2_Q1.ipynb]
6,447be57e4b68903e934ed24b36e1a06910e67c55,AnuragSingh0000,Task 1 part 3 and 4 done,[38bead3ce526071586bafc87c2e282eb4f095ecf],False,[Task1.ipynb]
7,4138e9533c2e52745677ad1bc8807e26abe03160,AnuragSingh0000,Merge branch 'main' of https://github.com/adi7...,"[447be57e4b68903e934ed24b36e1a06910e67c55, a9c...",True,[]
8,ac1f9e86c95d6109eca69470520abba7d4c858be,AnuragSingh0000,Task 5 completed,[4138e9533c2e52745677ad1bc8807e26abe03160],False,"[Task_1\Task1.ipynb, Task_2\Task2_Q1.ipynb, Ta..."
9,120d8b374ac6b9bff4875c34a4a02abef88b087e,haarit19058,Added Task 4,[ac1f9e86c95d6109eca69470520abba7d4c858be],False,[Task_4\Task_4.ipynb]


In [32]:
data

{'Hash': '2ac9c31c47a4a260c7db08321b4129cbe47e6701',
 'Message': 'Final task2',
 'Hashes of parents': ['1d0f7855a94bce5fa5543d0d1924f5dc2e99da34'],
 'Is a merge commit?': False,
 'List of modified files': [<pydriller.domain.commit.ModifiedFile at 0x2366e9ef5d0>]}

In [28]:
for i in data['List of modified files']:
    print(i.new_path or i.old_path)

.gitignore
README.md


In [29]:
data

{'Hash': '43badf217d1ccfaf486e2cbb1b3567226b5e95bf',
 'Message': 'Initial commit',
 'Hashes of parents': [],
 'Is a merge commit?': False,
 'List of modified files': [<pydriller.domain.commit.ModifiedFile at 0x2366a5b1010>,
  <pydriller.domain.commit.ModifiedFile at 0x2366e73f4d0>]}