In [2]:
import glob, json
from datetime import datetime
import pandas as pd

In [3]:
# Load one sample data file

In [4]:
# Collect the PR-sharing JSON files from the snapshot directory.
# glob's result order depends on the file system, so sort the matches to keep
# the listing stable across runs and platforms.
pr_files = sorted(glob.glob("snapshot_20230831/*_pr_sharings.json"))
print("Found PR files:", pr_files)



Found PR files: ['snapshot_20230831/20230831_060603_pr_sharings.json']


# Data Exploration
Load the JSON file and inspect it as a dictionary.

In [5]:
import json

# Decode the snapshot explicitly as UTF-8: the PR bodies contain non-ASCII
# characters, and the platform's default encoding is not guaranteed to be UTF-8.
with open("snapshot_20230831/20230831_060603_pr_sharings.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print(type(data))          # should print dict
print(data.keys())         # list the top-level keys


<class 'dict'>
dict_keys(['Sources'])


In [6]:
# Look at the first top-level entry to see how the payload is organised.
first_key = list(data.keys())[0]
print(f"First key: {first_key}")
print(f"Type of value: {type(data[first_key])}")

# For a list payload, report its size and the fields of the first record.
if isinstance(data[first_key], list):
    print(f"Number of items: {len(data[first_key])}")
    if data[first_key]:
        print(f"Keys in first PR: {data[first_key][0].keys()}")

First key: Sources
Type of value: <class 'list'>
Number of items: 193
Keys in first PR: dict_keys(['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Number', 'Title', 'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt', 'State', 'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount', 'ChatgptSharing', 'CommitSha'])


In [7]:
import pprint

# Pretty-print the first record when the payload is a non-empty list;
# otherwise report the type that was found instead.
if not isinstance(data[first_key], list) or len(data[first_key]) == 0:
    print("Unexpected structure:", type(data[first_key]))
else:
    pprint.pprint(data[first_key][0])


{'Additions': 1,
 'Author': 'dae-bot',
 'Body': '⏳ dae-wing (origin/main) is currently out-of-sync to dae '
         '(origin/main); changes are proposed by @daebot in actions - '
         'https://github.com/daeuniverse/dae-wing/actions/runs/5981460109\n'
         '\n'
         '### #300 - docs(en): update how-it-works\n'
         '\n'
         'PR: <https://github.com/daeuniverse/dae/pull/300>\n'
         '\n'
         'Context:\n'
         '\n'
         'As the title suggests. It is worth mentioning that the task was '
         'completed by ChatGPT 3.5\r\n'
         '\r\n'
         'Ref: '
         'https://chat.openai.com/share/9fb7d601-2a86-4079-8cdc-3ecdf90cab48\n'
         '\n'
         '---\n'
         '\n'
         '### #300 - docs(en): update how-it-works\n'
         '\n'
         'PR: <https://github.com/daeuniverse/dae/pull/300>\n'
         '\n'
         'Context:\n'
         '\n'
         'As the title suggests. It is worth mentioning that the task was '
         'complet

In [8]:
# Turn the list of PR records stored under "Sources" into a DataFrame.
prs = data["Sources"]
df = pd.DataFrame(prs)

# Quick look at the size, the available columns, and the first rows.
print(f"DataFrame shape: {df.shape}")
print(f"DataFrame columns: {df.columns}")
print(df.head(5))


DataFrame shape: (193, 19)
DataFrame columns: Index(['Type', 'URL', 'Author', 'RepoName', 'RepoLanguage', 'Number', 'Title',
       'Body', 'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt', 'State',
       'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount',
       'ChatgptSharing', 'CommitSha'],
      dtype='object')
           Type                                                URL  \
0  pull request   https://github.com/daeuniverse/dae-wing/pull/115   
1  pull request  https://github.com/FlorianWoelki/obsidian-symb...   
2  pull request            https://github.com/labdao/plex/pull/469   
3  pull request            https://github.com/labdao/plex/pull/468   
4  pull request  https://github.com/mlc-ai/web-stable-diffusion...   

              Author                                   RepoName  \
0            dae-bot                       daeuniverse/dae-wing   
1  sharshuv-quotient  FlorianWoelki/obsidian-symbols-prettifier   
2          AdamGoyer                            

In [9]:
# Show the ten most frequent values of RepoLanguage.
language_counts = df["RepoLanguage"].value_counts()
print(language_counts.head(10))

RepoLanguage
TypeScript          116
Python               17
Java                 12
Go                   10
JavaScript            9
Shell                 4
C                     4
HTML                  4
C#                    3
Jupyter Notebook      2
Name: count, dtype: int64


In [10]:
# Keep only the PRs whose RepoLanguage is Python.
is_python = df["RepoLanguage"] == "Python"
df_python = df[is_python]

# Bug-fix heuristic: case-insensitive substring match on the title.
# Being a substring match, it also accepts words such as "prefix" or "debug".
looks_like_fix = df_python["Title"].str.contains("fix|bug|error", case=False, na=False)
df_bugfix = df_python[looks_like_fix]

print("Python PRs:", df_python.shape)
print("Bug-fix PRs:", df_bugfix.shape)
df_bugfix.head(5)


Python PRs: (17, 19)
Bug-fix PRs: (2, 19)


Unnamed: 0,Type,URL,Author,RepoName,RepoLanguage,Number,Title,Body,CreatedAt,ClosedAt,MergedAt,UpdatedAt,State,Additions,Deletions,ChangedFiles,CommitsTotalCount,ChatgptSharing,CommitSha
14,pull request,https://github.com/metaphorsystems/metaphor-py...,cmishra,metaphorsystems/metaphor-python,Python,5,fixed incorrect description of type arg in met...,The description of the type keyword currently ...,2023-08-26T21:17:35Z,2023-08-26T21:19:56Z,2023-08-26T21:19:56Z,2023-08-26T21:19:57Z,MERGED,1,1,1,1,[{'URL': 'https://chat.openai.com/share/3b3a60...,[fa0dbcb4a361aafcbe2947dc75565f1b94656542]
43,pull request,https://github.com/chitalian/gptask/pull/2,calum-bird,chitalian/gptask,Python,2,Fix: recursive/glob support,"Changes:\r\n`-r` is now a flag, not an argumen...",2023-07-24T18:09:25Z,2023-07-24T19:52:21Z,2023-07-24T19:52:21Z,2023-07-24T19:52:21Z,MERGED,78,35,5,3,[{'URL': 'https://chat.openai.com/share/902cd3...,"[74b84f4cebb32e6c84b07e85e579c9e86664c8ff, 5d6..."


In [12]:
from datetime import datetime

def parse_time(ts):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    Parameters
    ----------
    ts : str or None
        Timestamp such as ``"2023-08-26T21:17:35Z"``. A trailing ``Z`` is
        interpreted as UTC. Missing values (``None``, ``NaN``, ``NaT``) are
        accepted.

    Returns
    -------
    datetime or None
        The parsed timestamp, or ``None`` when ``ts`` is missing.
    """
    # pandas may represent an absent value as NaN/NaT rather than None; treat
    # all of them as missing instead of failing on ``.replace`` below.
    if ts is None or pd.isna(ts):
        return None
    # Rewrite the "Z" suffix as an explicit UTC offset before parsing.
    return datetime.fromisoformat(ts.replace("Z", "+00:00"))

# Build a new frame that carries parsed versions of the timestamp columns.
df_bugfix = df_bugfix.assign(
    CreatedAt_dt=df_bugfix["CreatedAt"].apply(parse_time),
    MergedAt_dt=df_bugfix["MergedAt"].apply(parse_time),
)

# Hours elapsed between the CreatedAt and MergedAt timestamps.
elapsed = df_bugfix["MergedAt_dt"] - df_bugfix["CreatedAt_dt"]
df_bugfix["TimeToIntegration_hours"] = elapsed.dt.total_seconds() / 3600

summary_columns = ["RepoName", "Title", "TimeToIntegration_hours", "Additions", "Deletions"]
df_bugfix[summary_columns].head(5)


Unnamed: 0,RepoName,Title,TimeToIntegration_hours,Additions,Deletions
14,metaphorsystems/metaphor-python,fixed incorrect description of type arg in met...,0.039167,1,1
43,chitalian/gptask,Fix: recursive/glob support,1.715556,78,35
