In [50]:
import pandas as pd
from dotenv import load_dotenv
import os
import requests
import json

In [45]:
# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()

# API token sent as the Bearer credential in the request headers below;
# this is None when TOKEN is not defined in the environment.
token = os.environ.get('TOKEN')

# The dataset is a semicolon-separated CSV located next to the notebook.
df = pd.read_csv("./dataset.csv", sep=';')

def get_paginated_data(url, headers, timeout=30):
    """Fetch every page of a Link-header paginated endpoint and merge the items.

    Starting at ``url``, each page is requested with ``headers``, its JSON body
    is normalised through ``parse_data`` and appended to the result. The next
    page is taken from the ``rel="next"`` entry of the response's ``Link``
    header; the loop ends when no such entry is present.

    Args:
        url: URL of the first page to request.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with the items of all pages fetched. If a page answers with a
        status other than 200, the error is printed and the items collected
        up to that point are returned.
    """
    data = []
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Erro: {response.status_code}, {response.text}")
            break

        data.extend(parse_data(response.json()))

        # requests already parses the Link header into a dict keyed by rel.
        # Using it avoids hand-splitting on "," and ";", which left a leading
        # " <" on the URL whenever rel="next" was not the first entry.
        url = response.links.get('next', {}).get('url')
    return data

def parse_data(data):
    """Extract the list of items from a decoded JSON response body.

    Args:
        data: Decoded JSON payload. Either a list of items, or a dict that
            wraps the items under a single key next to the metadata keys
            ``total_count``, ``incomplete_results`` and
            ``repository_selection``.

    Returns:
        ``data`` itself when it is already a list; the value stored under the
        first non-metadata key when it is a dict; an empty list when ``data``
        is empty/None or contains nothing but metadata keys.
    """
    if isinstance(data, list):
        return data

    if not data:
        return []

    # Skip the metadata keys instead of popping them, so the caller's dict is
    # left untouched and a metadata-only payload no longer raises IndexError.
    metadata_keys = {'incomplete_results', 'repository_selection', 'total_count'}
    for key, value in data.items():
        if key not in metadata_keys:
            return value
    return []

In [14]:
# Show the column labels of the loaded dataset as an array.
df.columns.values

array(['URL', 'Unnamed: 0', 'Identifier', '_id', 'FileInfo', 'NumAuthors',
       'MonNauth', 'NumActiveMon', 'EarliestCommitDate',
       'EarliestCommitDateConverted', 'NumBlobs', 'LatestCommitDate',
       'LastCommitDateConverted', 'ProjectID', 'MonNcmt', 'NumCore',
       'NumCommits', 'CommunitySize', 'NumFiles', 'Core', 'NumForks',
       'n_microservices', 'Tot. Duration (Y)', 'Application Type',
       'Application Purpose', 'Developed by', 'Archived',
       'WIP/Incomplete', 'Is a Microservices?', 'num_services', 'servers',
       'languages', 'num_langs', 'images', 'num_dbs', 'dbs',
       'num_servers ', 'num_buses', 'buses', 'gates', 'monitors',
       'num_discos', 'shared_dbs', 'num_dockers', 'dockers_raw',
       'structure_raw', 'ms_depend_graph', 'avg_size_service',
       'commit_hash'], dtype=object)

In [15]:
# Median of each size metric; the filter cell uses these as cut-off thresholds.
mediana_num_authors, mediana_n_microservices, mediana_community_size = (
    df[coluna].median()
    for coluna in ('NumAuthors', 'n_microservices', 'CommunitySize')
)

# One value per line, in the same order as the assignments above.
print(
    mediana_num_authors,
    mediana_n_microservices,
    mediana_community_size,
    sep="\n",
)

18.5
5.0
11.0


In [16]:
# Keep only non-archived projects that are strictly above the median on all
# three size metrics.
mascara_filtro = (
    (df['NumAuthors'] > mediana_num_authors) &
    (df['n_microservices'] > mediana_n_microservices) &
    (df['CommunitySize'] > mediana_community_size) &
    (df['Archived'] == 'No')
)

# Boolean indexing selects the matching rows directly. DataFrame.where would
# instead blank every non-matching row to NaN (upcasting int/bool columns) and
# depend on dropna to remove them again.
# dropna(how='any') is kept on purpose: rows with a missing value in any
# column are still discarded, so the selected rows are the same as before.
df_filtrado = df.loc[mascara_filtro].dropna(how='any')

In [17]:
# Preview the first rows of the filtered dataset.
df_filtrado.head()

Unnamed: 0.1,URL,Unnamed: 0,Identifier,_id,FileInfo,NumAuthors,MonNauth,NumActiveMon,EarliestCommitDate,EarliestCommitDateConverted,...,gates,monitors,num_discos,shared_dbs,num_dockers,dockers_raw,structure_raw,ms_depend_graph,avg_size_service,commit_hash
0,https://github.com/claranet/spryker-demoshop,4.0,claranet/spryker-demoshop,ObjectId(629018d88b2efbc6e3b0c2ca),"{""Ruby"":20,""Perl"":1,""TypeScript"":1021,""Python""...",429.0,"{""2016-01"":20,""2020-07"":80,""2018-12"":102,""2016...",99.0,1377164000.0,22.8.2013,...,[],[],0.0,False,1.0,"{files:[{'path': '/package.json', 'dbs': ['fou...","{'path': '/docker/docker-compose.yml', 'num_se...","{'nodes': 12, 'edges': 0, 'avg_deps_per_servic...",90807.0,42a417db0247a90d52b061b8cf7ddb42b03b3eaa
6,https://github.com/EGroupware/egroupware,20.0,EGroupware/egroupware,ObjectId(62904fdd8b2efbc6e300d678),"{""Perl"":25,""TypeScript"":101,""Python"":14,""Rust""...",117.0,"{""2010-09"":6,""2016-01"":4,""2020-07"":6,""2005-10""...",253.0,966569100.0,18.8.2000,...,['nginx'],[],0.0,True,2.0,"{files:[{'path': '/package.json', 'dbs': [], '...","{'path': '/doc/docker/docker-compose.yml', 'nu...","{'nodes': 6, 'edges': 2, 'avg_deps_per_service...",22495.5,9b6a14f2d8ec1bb8959db252d7b01655a0567518
7,https://github.com/mozilla-bteam/bmo,21.0,mozilla-bteam/bmo,ObjectId(62905f438b2efbc6e3d7dfe1),"{""Ruby"":1,""Perl"":2192,""Rust"":180,""Python"":20,""...",449.0,"{""2016-01"":18,""2005-10"":12,""2001-09"":12,""2018-...",280.0,904137300.0,26.8.1998,...,[],[],0.0,True,1.0,{files:[{'path': '/docs/en/rst/requirements.tx...,"{'path': '/docker-compose.yml', 'num_services'...","{'nodes': 6, 'edges': 1, 'avg_deps_per_service...",54004.0,617b0ebb46c3cefa3d3844f3e6dd7414f4dd7632
10,https://github.com/taskcluster/taskcluster,28.0,taskcluster/taskcluster,ObjectId(6290411a8b2efbc6e3325313),"{""Ruby"":46,""Perl"":10,""Rust"":101,""Python"":319,""...",256.0,"{""2016-01"":12,""2020-07"":15,""2018-12"":23,""2016-...",96.0,1388476000.0,31.12.2013,...,['nginx'],['monitoring'],0.0,True,7.0,{files:[{'path': '/taskcluster/requirements.tx...,"{'path': '/docker-compose.yml', 'num_services'...","{'nodes': 50, 'edges': 25, 'avg_deps_per_servi...",4773.0,4c354f41baa9f4cfd9472679f0403958155c9985
11,https://github.com/samvera/hyrax,29.0,samvera/hyrax,ObjectId(6290354a8b2efbc6e3903ae5),"{""Ruby"":12962,""Perl"":188,""Python"":9,""PHP"":9,""o...",288.0,"{""2010-09"":5,""2016-01"":22,""2020-07"":13,""2018-1...",141.0,1269196000.0,21.3.2010,...,[],[],0.0,True,1.0,"{files:[{'path': '/package.json', 'dbs': [], '...","{'path': '/docker-compose.yml', 'num_services'...","{'nodes': 6, 'edges': 7, 'avg_deps_per_service...",16399.0,ed2b4fbb48dd61da78e93c741b86e2f04422b1cd


In [42]:
# Fetch a single (unpaginated) page of commits for the first filtered repository.
urlApi = "https://api.github.com/repos"

# .iloc[0] picks the first remaining row by position. df_filtrado keeps the
# original index labels after filtering, so the label 0 is not guaranteed to
# exist and ['URL'][0] could raise KeyError.
urlRepo = df_filtrado['URL'].iloc[0]

# "https://github.com/<owner>/<repo>" -> ["<owner>", "<repo>"]
parts = urlRepo.split('https://github.com/')[1].split('/')

owner = parts[0]
repo = parts[1]

headers = {
    "Accept": "application/vnd.github+json",
    "Authorization": f"Bearer {token}",
    "X-GitHub-Api-Version": "2022-11-28"
}

urlFinal = f"{urlApi}/{owner}/{repo}/commits"

# A timeout keeps a stalled connection from hanging the notebook.
response = requests.get(urlFinal, headers=headers, timeout=30)

# Bind commits before branching so a failed request leaves an empty list
# rather than an undefined (or stale) name for the following cells.
commits = []
if response.status_code == 200:
    commits = response.json()
else:
    print(f"Erro: {response.status_code}, {response.text}")

In [44]:
# Number of commits returned by the single request above (one page only).
len(commits)

13

In [46]:
# Fetch the complete commit history of the first filtered repository,
# following pagination through get_paginated_data.
urlApi = "https://api.github.com/repos"

# .iloc[0] picks the first remaining row by position. df_filtrado keeps the
# original index labels after filtering, so the label 0 is not guaranteed to
# exist and ['URL'][0] could raise KeyError.
urlRepo = df_filtrado['URL'].iloc[0]

# "https://github.com/<owner>/<repo>" -> ["<owner>", "<repo>"]
parts = urlRepo.split('https://github.com/')[1].split('/')

owner = parts[0]
repo = parts[1]

# Build the endpoint from urlApi instead of repeating the base URL literal.
url = f"{urlApi}/{owner}/{repo}/commits"
headers = {
    "Accept": "application/vnd.github+json",
    "Authorization": f"Bearer {token}",
    "X-GitHub-Api-Version": "2022-11-28"
}

data = get_paginated_data(url, headers)


In [48]:
# Total number of items collected by get_paginated_data for this repository.
len(data)

13309

In [51]:
def save_data_to_json(data, filename="data.json"):
    """Serialise ``data`` as indented JSON and write it to ``filename``.

    Args:
        data: Any JSON-serialisable object.
        filename: Destination path; an existing file is overwritten.
    """
    # Explicit encoding so the file does not depend on the platform's default
    # locale encoding. ensure_ascii is left at its default, so non-ASCII text
    # is written as \uXXXX escapes.
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)

# Persist the collected items to github_commits.json in the working directory.
save_data_to_json(data, "github_commits.json")