In [3]:
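# Third-party packages used by this notebook (uncomment to install).
# `os` and `re` are part of the Python standard library and are not installed with pip.
#!pip install python-dotenv
#!pip install requests
#!pip install gql
#!pip install requests-toolbelt
#!pip install pandas
#!pip install fasttext
#!pip install numpy
#!pip install scikit-learn  # the PyPI name `sklearn` is deprecated
#!pip install matplotlib
#!pip install seaborn
#!pip install nltk
#!pip install fpdf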
#!pip install fpdf

In [4]:
import os
from dotenv import load_dotenv
import pandas as pd

In [5]:
# Load the GitHub API token from the local .env file.
load_dotenv(".env")
token = os.getenv('GH_TOKEN')
# Reject an empty value (e.g. a bare `GH_TOKEN=` line) as well as a missing one,
# so the problem is reported here rather than as an authentication failure later.
if not token:
    raise ValueError("GitHub token is not set. Check your .env file.")

In [6]:
# Ethics taxonomy: maps each category name to a list of associated key phrases.

# TODO: load the full taxonomy from the csv file
# TODO: create a dictionary with the taxonomy in the pdf

# Example taxonomy of ethics in software, hard-coded until the TODOs above are done.
taxonomy = {
    "Privacy": [
        "data protection",
        "user consent",
        "encryption",
        "anonymity",
        "data retention policies",
        "GDPR compliance",
        "privacy by design"
    ],
    "Security": [
        "cybersecurity",
        "vulnerability management",
        "authentication",
        "secure coding",
        "penetration testing",
        "incident response",
        "zero trust architecture"
    ],
    "Transparency": [
        "open source",
        "code auditing",
        "algorithm explainability",
        "accountability",
        "documentation standards",
        "clear user agreements",
        "audit trails"
    ],
    "Bias and Fairness": [
        "algorithmic bias",
        "inclusive design",
        "diversity in datasets",
        "equitable outcomes",
        "unbiased decision-making",
        "ethical AI",
        "fair representation"
    ],
    "Sustainability": [
        "energy-efficient algorithms",
        "hardware lifecycle management",
        "carbon footprint reduction",
        "eco-friendly coding practices",
        "green IT",
        "sustainable software design"
    ],
    "Accessibility": [
        "universal design",
        "assistive technology compatibility",
        "web accessibility guidelines (WCAG)",
        "inclusive UX design",
        "multi-language support",
        "alternative text for media",
        "keyboard navigation"
    ],
    "Responsibility": [
        "ethical responsibility",
        "user harm prevention",
        "impact assessments",
        "corporate social responsibility",
        "responsible disclosure",
        "misuse prevention",
        "code of conduct"
    ]
}


In [7]:
import requests
import json
import pandas as pd

# GraphQL query that fetches one page of a repository's issues (up to 100 per
# request, resumed from $cursor) with each issue's title, body and creation time.
# Only the first 100 comments of each issue are requested; the comments
# connection is not paginated by this query.
QUERY = """
query FetchIssues($repoOwner: String!, $repoName: String!, $cursor: String) {
  repository(owner: $repoOwner, name: $repoName) {
    issues(first: 100, after: $cursor) {
      edges {
        node {
          title
          body
          createdAt
          comments(first: 100) {
            edges {
              node {
                body
                createdAt
              }
            }
          }
        }
      }
      pageInfo {
        endCursor
        hasNextPage
      }
    }
  }
}
"""

# Function to fetch data
def fetch_issues(repo_owner, repo_name):
    """Fetch every issue of a GitHub repository through the GraphQL API.

    Pages through the repository's issues using the module-level ``QUERY``,
    following the ``endCursor`` of each page until ``hasNextPage`` is false.
    Requests are authenticated with the module-level ``token``.

    Args:
        repo_owner: Owner passed to the query as ``$repoOwner``.
        repo_name: Repository name passed to the query as ``$repoName``.

    Returns:
        A list of dicts with the keys ``title``, ``body``, ``createdAt`` and
        ``comments``. ``comments`` is a list of ``{"body", "createdAt"}``
        dicts built from the comment edges returned for that issue.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status code.
        requests.Timeout: If the server does not respond within the timeout.
        Exception: If the response contains GraphQL ``errors`` or carries no
            ``repository`` object.
    """
    cursor = None
    all_issues = []
    while True:
        variables = {"repoOwner": repo_owner, "repoName": repo_name, "cursor": cursor}
        response = requests.post(
            "https://api.github.com/graphql",
            json={"query": QUERY, "variables": variables},
            headers={"Authorization": f"Bearer {token}"},
            # Without a timeout a stalled connection would block the loop forever.
            timeout=60,
        )
        # Report HTTP-level failures (bad credentials, rate limiting, gateway
        # errors) with their status instead of a later KeyError on "data".
        response.raise_for_status()
        data = response.json()
        if "errors" in data:
            raise Exception(f"GraphQL Error: {data['errors']}")

        # "data" or "repository" may be null; fail with a readable message
        # rather than a TypeError when subscripting None.
        repository = (data.get("data") or {}).get("repository")
        if repository is None:
            raise Exception(
                f"GraphQL Error: no repository data returned for {repo_owner}/{repo_name}"
            )

        issues_page = repository["issues"]
        for edge in issues_page["edges"]:
            node = edge["node"]
            all_issues.append({
                "title": node["title"],
                "body": node["body"],
                "createdAt": node["createdAt"],
                "comments": [
                    {"body": comment["node"]["body"], "createdAt": comment["node"]["createdAt"]}
                    for comment in node["comments"]["edges"]
                ],
            })

        # Pagination check
        page_info = issues_page["pageInfo"]
        if not page_info["hasNextPage"]:
            break
        cursor = page_info["endCursor"]

    return all_issues

def fetch_repos(repo_csv):
    """Read a CSV of repositories and return them as ``owner/repo`` strings.

    Args:
        repo_csv: Path (or any input accepted by ``pandas.read_csv``) of a CSV
            file that has ``owner`` and ``repo`` columns.

    Returns:
        A list with one ``"<owner>/<repo>"`` string per CSV row, in file order.

    Raises:
        KeyError: If the ``owner`` or ``repo`` column is missing.
    """
    print('Fetching repositories from csv file...')
    # Load the csv file
    df = pd.read_csv(repo_csv)
    # Walk the two columns in lockstep instead of materialising a Series per
    # row with iterrows(); selecting the columns up front also reports a
    # missing column even when the file has no data rows.
    return [owner + '/' + repo for owner, repo in zip(df['owner'], df['repo'])]

def fetch_issues_from_repos(repos):
    """Download the issues of several repositories.

    Args:
        repos: Iterable of ``"owner/repo"`` strings.

    Returns:
        A dict mapping each ``"owner/repo"`` string to the list returned by
        ``fetch_issues`` for it.

    Raises:
        ValueError: If an entry does not split into exactly two parts on "/".
    """
    collected = {}
    for full_name in repos:
        print(f"Fetching issues from {full_name}...")
        owner, repo_name = full_name.split("/")
        repo_issues = fetch_issues(owner, repo_name)
        print(f"Found {len(repo_issues)} issues.")
        # Key the result by the full "owner/repo" string.
        collected[full_name] = repo_issues
    return collected


In [8]:
# Read the repository list from the CSV, then download the issues of each one.
repos = fetch_repos('coins_base.csv')
issues = fetch_issues_from_repos(repos)

# Persist the unprocessed issues, keyed by "owner/repo", as a JSON file.
with open('raw_issues.json', mode='w') as raw_file:
    json.dump(issues, raw_file)

Fetching repositories from csv file...
Fetching issues from oasisprotocol/oasis-core...
Found 1850 issues.
Fetching issues from klaytn/klaytn...
Found 339 issues.
Fetching issues from harmony-one/harmony...
Found 1207 issues.
Fetching issues from parallel-finance/parallel...
Found 834 issues.
Fetching issues from freeverseio/laos...
Found 307 issues.
Fetching issues from nucypher/nucypher...
Found 1734 issues.
Fetching issues from ethereum/go-ethereum...
Found 8155 issues.
Fetching issues from witnet/witnet-rust...
Found 1156 issues.
Fetching issues from sora-xor/sora2-network...
Found 611 issues.
Fetching issues from Agoric/agoric-sdk...
Found 4606 issues.
Fetching issues from ainblockchain/ain-blockchain...
Found 410 issues.
Fetching issues from bigbangcore/BigBang...
Found 235 issues.
Fetching issues from massalabs/massa...
Found 3001 issues.
Fetching issues from ton-blockchain/TEPs...
Found 252 issues.
Fetching issues from red/red...
Found 4025 issues.
Fetching issues from cryptobl