In [1]:
import os
import requests
import time
import pandas as pd
from collections import Counter

In [2]:
# Read the personal access token from the environment so it is never
# hardcoded in the notebook.
GITHUB_TOKEN = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")

# Headers sent with every GitHub API request.
headers = {"Accept": "application/vnd.github+json"}
if GITHUB_TOKEN:
    headers["Authorization"] = f"Bearer {GITHUB_TOKEN}"
else:
    # Without this guard the header would be the literal string "Bearer None",
    # which is not a valid credential. Fall back to unauthenticated requests
    # and make the situation visible to whoever runs the notebook.
    print(
        "⚠️ GITHUB_PERSONAL_ACCESS_TOKEN is not set; "
        "sending unauthenticated requests (stricter rate limits apply)."
    )

In [3]:
# Individual search qualifiers, written in GitHub search syntax.
created = "<2014-01-01"
pushed = ">2025-03-01"
stars = "500..1000"
forks = ">100"

# Assemble the qualifiers into the single space-separated query string that
# is passed as the `q` parameter of the search request.
qualifiers = [
    f"created:{created}",
    f"stars:{stars}",
    f"forks:{forks}",
    f"pushed:{pushed}",
    "mirror:false",
]
query = " ".join(qualifiers)

# Repository search endpoint of the GitHub REST API.
url = "https://api.github.com/search/repositories"


In [4]:
# Page through the repository search results and collect every item.
all_items = []
per_page = 100
max_pages = 10  # GitHub API search results are capped at 1000 results
request_timeout = 30  # seconds; without a timeout requests can block forever
max_rate_limit_retries = 3  # how often one page is retried after a rate-limit rejection


def _rate_limit_wait(response, max_wait=120):
    """Return the seconds to wait if `response` is a rate-limit rejection, else None.

    A response counts as rate limited when its status is 403 or 429 and it
    carries either a `Retry-After` header or `X-RateLimit-Remaining: 0`.
    The wait is clamped to the range [1, max_wait].
    """
    if response.status_code not in (403, 429):
        return None
    hdrs = response.headers
    try:
        if "Retry-After" in hdrs:
            wait = int(hdrs["Retry-After"])
        elif hdrs.get("X-RateLimit-Remaining") == "0":
            # X-RateLimit-Reset is an epoch timestamp; wait until just after it.
            wait = int(hdrs.get("X-RateLimit-Reset", 0)) - int(time.time()) + 1
        else:
            return None  # a 403/429 that is not about rate limiting
    except ValueError:
        wait = max_wait  # unparsable header: be conservative
    return min(max(wait, 1), max_wait)


for page in range(1, max_pages + 1):
    params = {
        "q": query,
        "sort": "stars",
        "order": "desc",
        "per_page": per_page,
        "page": page
    }

    try:
        for attempt in range(max_rate_limit_retries + 1):
            response = requests.get(
                url, headers=headers, params=params, timeout=request_timeout
            )
            wait_seconds = _rate_limit_wait(response)
            if wait_seconds is None or attempt == max_rate_limit_retries:
                break
            print(f"⏳ Rate limited on page {page}; retrying in {wait_seconds}s")
            time.sleep(wait_seconds)
    except requests.RequestException as exc:
        # Network failure or timeout: report it and keep what was collected so far.
        print(f"❌ Request failed on page {page}: {exc}")
        break

    if response.status_code != 200:
        print(f"❌ Error on page {page}: {response.status_code}")
        break

    payload = response.json()
    if payload.get("incomplete_results"):
        # The search timed out server-side; this page may be missing matches.
        print(f"⚠️ GitHub reported incomplete results on page {page}")

    page_items = payload.get("items", [])
    if not page_items:
        print(f"✅ No more results on page {page}. Stopping.")
        break

    all_items.extend(page_items)
    print(f"✅ Collected {len(page_items)} items from page {page}")

✅ Collected 100 items from page 1
✅ Collected 100 items from page 2
✅ Collected 100 items from page 3
✅ Collected 100 items from page 4
✅ Collected 100 items from page 5
✅ Collected 100 items from page 6
✅ Collected 100 items from page 7
✅ Collected 100 items from page 8
✅ Collected 1 items from page 9
✅ No more results on page 10. Stopping.


In [5]:
# Output column -> key of the search-result item it is copied from.
# NOTE(review): in the rendered preview the "watchers" and "stars" columns are
# identical for every row shown; presumably `watchers_count` mirrors
# `stargazers_count` in this payload — confirm before treating it as a
# separate metric.
field_map = {
    "repo_name": "full_name",
    "language": "language",
    "created_at": "created_at",
    "description": "description",
    "watchers": "watchers_count",
    "stars": "stargazers_count",
    "forks": "forks_count",
    "link": "html_url",
}

# Flatten every raw item into one record per repository.
data = []
for item in all_items:
    record = {column: item[key] for column, key in field_map.items()}
    # "topics" is the only field read leniently; it falls back to an empty list.
    record["topics"] = item.get("topics", [])
    data.append(record)

df = pd.DataFrame(data)
print(f"\n📦 Final total: Collected {df.shape[0]} repositories")



📦 Final total: Collected 801 repositories


In [6]:
# Persist the unfiltered search results; the index is not written.
output_path = "github_repositories.csv"
df.to_csv(output_path, index=False)
print(f"✅ Data saved to {output_path}")

✅ Data saved to github_repositories.csv


In [7]:
# Preview the collected repositories (long frames are truncated when rendered).
df


Unnamed: 0,repo_name,language,created_at,description,watchers,stars,forks,link,topics
0,jazzband/django-hosts,Python,2011-09-29T22:45:37Z,Dynamic and static host resolving for Django. ...,999,999,108,https://github.com/jazzband/django-hosts,[]
1,archlinuxarm/PKGBUILDs,Shell,2009-08-10T01:30:23Z,PKGBUILDs modified to build on Arch Linux ARM,998,998,615,https://github.com/archlinuxarm/PKGBUILDs,[]
2,neo4j-contrib/neomodel,Python,2012-09-12T16:17:06Z,An Object Graph Mapper (OGM) for the Neo4j gra...,996,996,232,https://github.com/neo4j-contrib/neomodel,"[neo4j, ogm, python]"
3,coderholic/pyradio,Python,2008-09-24T11:51:13Z,Curses based internet radio player,995,995,127,https://github.com/coderholic/pyradio,[]
4,EventSource/eventsource,TypeScript,2012-02-08T12:03:28Z,"EventSource client for Node.js, browsers and o...",994,994,255,https://github.com/EventSource/eventsource,"[eventsource, server-sent-events, sse]"
...,...,...,...,...,...,...,...,...,...
796,wxMaxima-developers/wxmaxima,C++,2011-02-28T20:25:57Z,A gui for the computer algebra system Maxima b...,502,502,103,https://github.com/wxMaxima-developers/wxmaxima,[]
797,omeka/Omeka,PHP,2011-07-01T02:10:58Z,A flexible web publishing platform for the dis...,501,501,198,https://github.com/omeka/Omeka,[]
798,FasterXML/jackson-module-scala,Scala,2010-11-29T23:01:08Z,Add-on module for Jackson (https://github.com/...,501,501,143,https://github.com/FasterXML/jackson-module-scala,"[cbor, hacktoberfest, jackson, json, scala, se..."
799,twilio/twilio-java,Java,2009-09-15T06:04:28Z,A Java library for communicating with the Twil...,500,500,440,https://github.com/twilio/twilio-java,"[api, mms, phone, sms, telephony, twilio, twim..."


In [8]:
# Topics a repository must carry (at least one of them) to be kept, grouped by
# the reason they were chosen.
interested_topics = [
    "cli", "compression", "library",  # xz-utils tags
    "open-source", "linux", "debian", "ubuntu",  # linux-related tags
    "firmware", "cybersecurity", "websockets", "network-protocols", "drivers",  # security-related tags
    # Fixed typo: this entry used to read "andrioid-library" and therefore
    # could never match a repository tagged "android-library".
    "embedded", "ios-library", "android-library",  # os-related tags
]
# Lower-cased copy used for case-insensitive comparison with repository topics.
interested_topics_lower = [topic.lower() for topic in interested_topics]

# Accumulators for repositories that pass / fail the topic filter.
filtered_repos, removed_repos = [], []

# Topic bookkeeping across the whole data set: distinct topics and how often
# each one occurs.
unique_topics, topic_counter = set(), Counter()

# One zero-initialised counter per removal reason.
removal_stats = dict.fromkeys(
    ("archived", "disabled", "fork", "no_interested_topics", "errors"), 0
)

# Number of repositories before any filtering.
initial_count = len(df.index)

In [9]:
# Filter the collected repositories down to those carrying at least one topic
# of interest, and record topic frequencies along the way.

# Reset the accumulators first so that re-running this cell does not append
# duplicate rows or double the counts left over from a previous run.
filtered_repos = []
removed_repos = []
unique_topics = set()
topic_counter = Counter()
removal_stats["no_interested_topics"] = 0

# Set lookup instead of scanning the list for every topic of every repo.
interested_topic_set = set(interested_topics_lower)

# Explicit column lists keep the resulting frames well-formed (and the later
# column selections valid) even when a list of records ends up empty.
filtered_columns = ["repo_name", "language", "created_at", "description",
                    "watchers", "stars", "forks", "link", "topics"]
removed_columns = ["Repo Name", "Reason"]

for _, row in df.iterrows():
    repo_topics = [topic.lower() for topic in row["topics"]]

    # Track every topic seen, regardless of whether the repo is kept.
    unique_topics.update(repo_topics)
    topic_counter.update(repo_topics)

    # Keep only the topics that are on the interest list.
    matching_topics = [topic for topic in repo_topics if topic in interested_topic_set]

    # No overlap with the interest list: record the removal and move on.
    if not matching_topics:
        removal_stats["no_interested_topics"] += 1
        removed_repos.append({
            "Repo Name": row["repo_name"],
            "Reason": "No matching topics"
        })
        continue

    # The kept record stores only the matching topics, not the full topic list.
    filtered_repos.append({
        "repo_name": row["repo_name"],
        "language": row["language"],
        "created_at": row["created_at"],
        "description": row["description"],
        "watchers": row["watchers"],
        "stars": row["stars"],
        "forks": row["forks"],
        "link": row["link"],
        "topics": matching_topics
    })

final_df = pd.DataFrame(filtered_repos, columns=filtered_columns)
final_df.to_csv("filtered_repositories.csv", index=False)
final_count = final_df.shape[0]

removed_df = pd.DataFrame(removed_repos, columns=removed_columns)
removed_df.to_csv("removed_repos.csv", index=False)

# Convert the topic counter to a DataFrame (sorted descending by count)
topic_counts_df = pd.DataFrame(topic_counter.items(), columns=["Topic", "Count"])
topic_counts_df = topic_counts_df.sort_values(by="Count", ascending=False)
topic_counts_df.to_csv("topic_counts.csv", index=False)

# Print summary of counts
print(f"Initial repository count: {df.shape[0]}")
print(f"Removed (no matching topics): {removal_stats['no_interested_topics']}")
print(f"Final repository count after filtering: {final_count}")

Initial repository count: 801
Removed (no matching topics): 763
Final repository count after filtering: 38


In [10]:
# Show the repositories that passed the topic filter.
final_df

Unnamed: 0,repo_name,language,created_at,description,watchers,stars,forks,link,topics
0,svinota/pyroute2,Python,2013-04-08T05:38:22Z,Python Netlink and PF_ROUTE library — network ...,987,987,251,https://github.com/svinota/pyroute2,[linux]
1,viblo/pymunk,Python,2013-10-02T14:36:46Z,Pymunk is a easy-to-use pythonic 2d physics li...,967,967,191,https://github.com/viblo/pymunk,[library]
2,nitrogen/nitrogen,Erlang,2008-10-28T01:21:03Z,Nitrogen Web Framework for Erlang (now with we...,960,960,159,https://github.com/nitrogen/nitrogen,[websockets]
3,openSUSE/open-build-service,Ruby,2011-07-14T06:52:57Z,Build and distribute Linux packages from sourc...,956,956,446,https://github.com/openSUSE/open-build-service,"[debian, ubuntu]"
4,apache/mina-sshd,Java,2010-05-26T23:38:08Z,Apache MINA sshd is a comprehensive Java libra...,954,954,369,https://github.com/apache/mina-sshd,[library]
5,drhelius/Gearboy,C++,2012-07-19T21:45:46Z,Game Boy / Gameboy Color emulator and debugger...,946,946,142,https://github.com/drhelius/Gearboy,[linux]
6,DSpace/DSpace,Java,2012-03-16T21:44:52Z,(Official) The DSpace digital asset management...,945,945,1331,https://github.com/DSpace/DSpace,[open-source]
7,cxong/cdogs-sdl,C,2013-01-12T05:08:52Z,Classic overhead run-and-gun game,939,939,122,https://github.com/cxong/cdogs-sdl,[open-source]
8,ninenines/gun,Erlang,2013-08-21T17:57:12Z,"HTTP/1.1, HTTP/2, Websocket client (and more) ...",909,909,232,https://github.com/ninenines/gun,[websockets]
9,commontk/CTK,C++,2010-06-14T15:23:41Z,A set of common support code for medical imagi...,888,888,502,https://github.com/commontk/CTK,[open-source]


In [11]:
# Mean and median of the numeric popularity columns of the filtered set.
stat_columns = ['watchers', 'stars', 'forks']
numeric_stats = final_df[stat_columns].agg(['mean', 'median'])
print("Mean and Median of numeric columns (watchers, stars, forks):", numeric_stats, sep="\n")

# How many filtered repositories use each primary language (missing values excluded).
language_counts = final_df['language'].value_counts(dropna=True)
print("Repository counts by language:", language_counts, sep="\n")

# How many filtered repositories carry each matching topic: explode the
# per-repo topic lists into one row per topic, then count the values.
topics_exploded = final_df['topics'].explode()
topic_counts = topics_exploded.value_counts(dropna=True)
print("\nRepository counts by topic:", topic_counts, sep="\n")




Mean and Median of numeric columns (watchers, stars, forks):
          watchers       stars       forks
mean    764.131579  764.131579  267.157895
median  768.000000  768.000000  213.000000
Repository counts by language:
language
C         9
C++       8
Java      6
PHP       4
Python    3
Erlang    2
HTML      2
Ruby      1
ABAP      1
Lua       1
Scala     1
Name: count, dtype: int64

Repository counts by topic:
topics
linux          12
open-source    11
library         7
websockets      3
cli             3
debian          1
ubuntu          1
compression     1
embedded        1
Name: count, dtype: int64


In [12]:
# Show the repositories dropped by the topic filter together with the reason.
removed_df

Unnamed: 0,Repo Name,Reason
0,jazzband/django-hosts,No matching topics
1,archlinuxarm/PKGBUILDs,No matching topics
2,neo4j-contrib/neomodel,No matching topics
3,coderholic/pyradio,No matching topics
4,EventSource/eventsource,No matching topics
...,...,...
758,wxMaxima-developers/wxmaxima,No matching topics
759,omeka/Omeka,No matching topics
760,FasterXML/jackson-module-scala,No matching topics
761,twilio/twilio-java,No matching topics


In [13]:
# Show how often each topic occurs across all collected repositories, most frequent first.
topic_counts_df


Unnamed: 0,Topic,Count
2,python,78
75,hacktoberfest,69
90,java,59
116,javascript,36
118,php,32
...,...,...
884,typelevel,1
883,scodec,1
882,encoding,1
881,decoding,1
