In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
import requests

In [2]:
# GitHub personal access token, read from the GITHUB_TOKEN environment
# variable so that the secret is never stored in the notebook itself.
# NOTE: a token that was previously committed in this cell must be treated as
# leaked and revoked in the GitHub account settings.
access_token = os.environ.get("GITHUB_TOKEN", "")
if not access_token:
    print("GITHUB_TOKEN is not set; requests will be sent without authentication.")

# GitHub API endpoint
base_url = "https://api.github.com"

# Repository details
owner = "autowarefoundation"
repo_name = "autoware"

# Define the year you want to filter discussions for (compared as a string
# against the first four characters of each issue's "created_at" timestamp)
year = "2022"

# Initialize data containers
discussions_data = []

# Prepare headers with authorization; the header is omitted entirely when no
# token is configured instead of sending an empty bearer token.
headers = {"Authorization": f"Bearer {access_token}"} if access_token else {}

# Define the API endpoint for issues in the repository
issues_endpoint = f"/repos/{owner}/{repo_name}/issues"

In [3]:
# Function to fetch comments for an issue
def fetch_comments(issue_number):
    """Return all comments of one issue in the configured repository.

    The comments endpoint is paginated, so pages of up to 100 comments are
    requested one after another until a page comes back short or empty.

    Args:
        issue_number: Number of the issue whose comments are requested; it is
            interpolated into the request URL.

    Returns:
        list: The comment objects decoded from the JSON responses, in the
        order the API returned them. If a request does not answer with
        status 200, a warning is printed and the comments collected so far
        are returned (an empty list when the very first page fails).
    """
    per_page = 100
    comments_url = f"{base_url}{issues_endpoint}/{issue_number}/comments"
    collected = []
    page = 1
    while True:
        response = requests.get(
            comments_url,
            headers=headers,
            params={"per_page": per_page, "page": page},
            timeout=30,  # do not let one stalled connection hang the whole crawl
        )
        if response.status_code != 200:
            # Best effort: report the problem instead of silently returning nothing.
            print(
                f"Failed to fetch comments for issue {issue_number}. "
                f"Status code: {response.status_code}"
            )
            break
        batch = response.json()
        if not batch:
            break
        collected.extend(batch)
        if len(batch) < per_page:
            # A short page is the last one; skip the extra empty-page request.
            break
        page += 1
    return collected

In [4]:
# Sample request: pull the comment list of issue number 3
issue_comments = fetch_comments(3)

# Serialize with a 4-space indent first, then print, so the nested JSON is readable
pretty_comments = json.dumps(issue_comments, indent=4)
print(pretty_comments)

[
    {
        "url": "https://api.github.com/repos/autowarefoundation/autoware/issues/comments/134894759",
        "html_url": "https://github.com/autowarefoundation/autoware/issues/3#issuecomment-134894759",
        "issue_url": "https://api.github.com/repos/autowarefoundation/autoware/issues/3",
        "id": 134894759,
        "node_id": "MDEyOklzc3VlQ29tbWVudDEzNDg5NDc1OQ==",
        "user": {
            "login": "kuriking",
            "id": 1448722,
            "node_id": "MDQ6VXNlcjE0NDg3MjI=",
            "avatar_url": "https://avatars.githubusercontent.com/u/1448722?v=4",
            "gravatar_id": "",
            "url": "https://api.github.com/users/kuriking",
            "html_url": "https://github.com/kuriking",
            "followers_url": "https://api.github.com/users/kuriking/followers",
            "following_url": "https://api.github.com/users/kuriking/following{/other_user}",
            "gists_url": "https://api.github.com/users/kuriking/gists{/gist_id}",
        

In [5]:
# Fetch all issues (discussions) created in `year`, together with their comments.
# NOTE(review): per the GitHub REST documentation this endpoint also lists pull
# requests; they are kept here - confirm whether they should be filtered out.

# Rebind the container so that re-running this cell does not append duplicates.
discussions_data = []
page = 1
while True:
    params = {
        "state": "all",
        # Newest-first by creation date is the API default; it is spelled out
        # because the early exit below relies on that ordering.
        "sort": "created",
        "direction": "desc",
        "per_page": 100,
        "page": page
    }
    response = requests.get(base_url + issues_endpoint, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch issues. Status code: {response.status_code}")
        break

    issues = response.json()
    if not issues:
        break  # Ran past the last page

    reached_older_issues = False
    for issue in issues:
        created_year = issue["created_at"][:4]  # Extract the year part
        if created_year < year:
            # Everything after this point is older than the target year, so
            # there is nothing left to collect.
            reached_older_issues = True
            break
        if created_year != year:
            continue  # Newer than the target year; keep scanning

        discussion_data = {
            "title": issue["title"],
            # "user" can be null in the API response; store None in that
            # case instead of raising a TypeError.
            "user_id": (issue["user"] or {}).get("login"),
            "created_at": issue["created_at"],
            "comments": []
        }
        # Fetch comments for this issue
        for comment in fetch_comments(issue["number"]):
            discussion_data["comments"].append({
                "user_id": (comment["user"] or {}).get("login"),
                "created_at": comment["created_at"],
                "comment_text": comment["body"]
            })
        discussions_data.append(discussion_data)

    if reached_older_issues:
        break
    page += 1

In [6]:
# Export the collected discussions to a per-year CSV file (one row per discussion)
csv_file_path = f"discussions_{year}.csv"
fieldnames = ["title", "user_id", "created_at", "comments"]
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # The nested "comments" list is not flattened; the csv module writes its
    # string form into a single cell.
    writer.writerows(discussions_data)

In [7]:
# Export the same discussions to a per-year JSON file, indented by 4 spaces
json_file_path = f"discussions_{year}.json"
with open(json_file_path, "w", encoding="utf-8") as json_file:
    serialized_discussions = json.dumps(discussions_data, indent=4)
    json_file.write(serialized_discussions)

In [8]:
# Report which CSV and JSON files the export cells wrote
saved_message = f"Data for year {year} has been saved to {csv_file_path} and {json_file_path}"
print(saved_message)

Data for year 2022 has been saved to discussions_2022.csv and discussions_2022.json


In [9]:
# Load the per-year discussions CSV back into a DataFrame for inspection
discussion_df = pd.read_csv(filepath_or_buffer=f"discussions_{year}.csv")

In [10]:
# Preview the first five discussions
discussion_df.head(n=5)

Unnamed: 0,title,user_id,created_at,comments
0,No package mactching ros-galactic-desktop avai...,anilbommareddy,2022-12-27T04:14:09Z,"[{'user_id': 'kenji-miyake', 'created_at': '20..."
1,chore(pre-commit workflows): add comments to r...,kenji-miyake,2022-12-22T06:15:56Z,[]
2,DevOps Dojo: ROS Node Documentation (Phase 1),kaspermeck-arm,2022-12-21T22:27:29Z,"[{'user_id': 'kaspermeck-arm', 'created_at': '..."
3,DevOps Dojo: ROS Node Configuration (Phase 1),kaspermeck-arm,2022-12-21T22:15:16Z,"[{'user_id': 'kaspermeck-arm', 'created_at': '..."
4,ci(pre-commit): autoupdate,pre-commit-ci[bot],2022-12-12T17:29:27Z,[]


In [11]:
# Load the per-year JSON export back into memory
with open(f"discussions_{year}.json", "r", encoding="utf-8") as json_file:
    raw_json_text = json_file.read()
    data = json.loads(raw_json_text)

In [12]:
# Pretty-print at most the first two entries; ensure_ascii=False keeps
# non-ASCII characters as they are instead of \u escapes
first_entries = data[:2]
print(json.dumps(first_entries, ensure_ascii=False, indent=4))

[
    {
        "title": "No package mactching ros-galactic-desktop available  for AWSIM-stable branch in Ubuntu 22.04",
        "user_id": "anilbommareddy",
        "created_at": "2022-12-27T04:14:09Z",
        "comments": [
            {
                "user_id": "kenji-miyake",
                "created_at": "2022-12-27T04:19:06Z",
                "comment_text": "@shmpwk Could you tell me the right branch we should use for Humble?"
            },
            {
                "user_id": "shmpwk",
                "created_at": "2022-12-27T05:17:42Z",
                "comment_text": "@anilbommareddy \r\nCurrently, the official support for AWSIM is only ROS2 Galactic, Ubuntu20.04.\r\n\r\nFor Ubuntu 22.04, try work in progress version which includes our individual repos but will be officially supported soon:\r\nAutoware: https://github.com/shmpwk/autoware/tree/humble-awsim-stable\r\nAWSIM: https://github.com/tier4/AWSIM/tree/humble\r\n\r\nTry the following command\r\n```\r\ngit clone h

In [13]:
# Reshape the discussions into one flat row per comment. A discussion without
# comments still gets a single placeholder row so that it is not lost.
new_data = []

# Number the threads 1, 2, 3, ... in the order they appear. Deriving the ID
# from len(new_data) would tie it to the number of rows emitted so far, which
# leaves gaps and changes whenever a thread's comment count changes.
for thread_id, discussion in enumerate(discussions_data, start=1):
    thread_creator = discussion["user_id"]  # Username of the thread creator
    thread_title = discussion["title"]

    # Placeholder row for threads without comments
    if not discussion["comments"]:
        new_data.append({
            "title": thread_title,
            "Created_at": discussion["created_at"],
            "thread_id": thread_id,
            "subtitle_id": "",  # Empty for threads without comments
            "Type": "",  # Empty for threads without comments
            "Comment_text": "",  # Empty for threads without comments
            "creator": thread_creator
        })
        continue

    # subtitle_id counts the comments within one thread, starting at 1
    for index, comment in enumerate(discussion["comments"], start=1):
        comment_creator = comment["user_id"]
        # A comment by the thread creator is labelled "questioned", a comment
        # by anybody else "answered".
        comment_type = "questioned" if comment_creator == thread_creator else "answered"
        new_data.append({
            "title": thread_title,
            "Created_at": comment["created_at"],
            "thread_id": thread_id,
            "subtitle_id": index,
            "Type": comment_type,
            "Comment_text": comment["comment_text"],
            "creator": comment_creator
        })

In [14]:
# Create a new CSV file with the desired columns (one row per comment)
csv_file_path = f"discussions_{year}_new.csv"
with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
    fieldnames = ["title", "Created_at", "thread_id", "subtitle_id", "Type", "Comment_text", "creator"]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    # Write all rows in one call rather than looping with a variable named
    # `data`, which would overwrite the `data` list loaded from the JSON file
    # earlier in the notebook and break a re-run of the cells that use it.
    writer.writerows(new_data)

In [15]:
# Report where the reshaped per-comment CSV was written
completion_message = f"Data analysis and manipulation completed. New data saved to {csv_file_path}"
print(completion_message)

Data analysis and manipulation completed. New data saved to discussions_2022_new.csv


In [16]:
# Load the reshaped per-comment CSV into a DataFrame for inspection
discussion_new_df = pd.read_csv(filepath_or_buffer=f"discussions_{year}_new.csv")

In [17]:
# Preview the first five rows of the reshaped table
discussion_new_df.head(n=5)

Unnamed: 0,title,Created_at,thread_id,subtitle_id,Type,Comment_text,creator
0,No package mactching ros-galactic-desktop avai...,2022-12-27T04:19:06Z,1,1.0,answered,@shmpwk Could you tell me the right branch we ...,kenji-miyake
1,No package mactching ros-galactic-desktop avai...,2022-12-27T05:17:42Z,1,2.0,answered,"@anilbommareddy \r\nCurrently, the official su...",shmpwk
2,No package mactching ros-galactic-desktop avai...,2023-01-04T04:51:38Z,1,3.0,answered,hii @shmpwk \r\nShall we install autoware for ...,patelabhay-12
3,No package mactching ros-galactic-desktop avai...,2023-01-04T04:57:38Z,1,4.0,answered,@patelabhay-12 \r\nStill the above step is not...,shmpwk
4,No package mactching ros-galactic-desktop avai...,2023-01-04T05:03:21Z,1,5.0,answered,\r\n![Screenshot from 2023-01-04 10-30-12](htt...,patelabhay-12
