In [1]:
# Crawl GHArchive data (2023-07)

In [1]:
import requests

In [93]:
import datetime

In [118]:
# Scratch timestamp (1 Jan 2023, hour 0) for trying out the archive-name helper.
tmp_date = datetime.datetime(2023, 1, 1, 0)

In [119]:
# Echo the value to confirm the timestamp was built as intended.
tmp_date

datetime.datetime(2023, 1, 1, 0, 0)

In [121]:
def gharchive_path(dt):
    """Build the GH Archive file name for the hour described by ``dt``.

    The result has the form ``YYYY-MM-DD-H.json.gz``: year, month and day are
    zero-padded, the hour is written without padding
    (e.g. ``2023-01-01-0.json.gz``).

    Args:
        dt: Object exposing integer ``year``, ``month``, ``day`` and ``hour``
            attributes, such as a ``datetime.datetime``.

    Returns:
        The archive file name as a string.
    """
    day_part = f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}"
    return f"{day_part}-{dt.hour}.json.gz"

In [122]:
# Sanity check of the generated file name; note the hour is not zero-padded.
gharchive_path(tmp_date)

'2023-01-01-0.json.gz'

In [123]:
def crawl_gharchive(path, timeout=60):
    """Download one GH Archive file and save it locally under the same name.

    Args:
        path: Archive file name such as ``2023-06-26-19.json.gz``. It is used
            both as the URL suffix on ``https://data.gharchive.org/`` and as
            the local output path (relative to the current working directory).
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    archive_url = "https://data.gharchive.org/{}".format(path)
    # Stream the response so the whole archive is never held in memory at once.
    with requests.get(archive_url, stream=True, timeout=timeout) as resp:
        # Check the status before opening the output file; otherwise an error
        # page (e.g. a 404 body) would be saved as if it were a valid .json.gz.
        resp.raise_for_status()
        with open(path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [125]:
# Download the archive for 26 Jun 2023, hour 19, into the working directory.
tmp_dt = datetime.datetime(2023, 6, 26, 19)
gharchive_json_gz_path = gharchive_path(tmp_dt)
crawl_gharchive(gharchive_json_gz_path)

# ClickHouse

Attempt to store the downloaded `.json.gz` file on the `clickhouse-server` host.

# SPARK

In [5]:
import pyspark

In [36]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [126]:
# Load the downloaded archive into a DataFrame; no schema is passed, so Spark
# derives one from the JSON records.
df = spark.read.json(gharchive_json_gz_path)

In [127]:
# Preview the first rows; long cell values are truncated in the printed table.
df.show()

+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|               actor|          created_at|         id|                 org|             payload|public|                repo|                type|
+--------------------+--------------------+-----------+--------------------+--------------------+------+--------------------+--------------------+
|{https://avatars....|2023-06-26T19:00:00Z|30016285448|{https://avatars....|{created, null, n...|  true|{610919196, sora-...|PullRequestReview...|
|{https://avatars....|2023-06-26T19:00:00Z|30016285453|                null|{null, f7d36a024e...|  true|{657376817, Itsum...|           PushEvent|
|{https://avatars....|2023-06-26T19:00:00Z|30016285469|                null|{null, null, null...|  true|{658916140, sunsh...|         CreateEvent|
|{https://avatars....|2023-06-26T19:00:00Z|30016285480|                null|{null, db45eda74c...|  true|{639115888, al

In [128]:
# `repo.name` looks like "<owner>/<repository>"; split on "/" so each part can
# become its own column.
split_col = pyspark.sql.functions.split(df['repo.name'], '/')

In [129]:
# Element 0 of the split is stored as `repo_author`, element 1 as `repo_name`.
df = (
    df
    .withColumn('repo_author', split_col.getItem(0))
    .withColumn('repo_name', split_col.getItem(1))
)

In [130]:
# Register the DataFrame as the temporary view "activity" so it can be queried with Spark SQL.
df.createOrReplaceTempView("activity")

In [131]:
# Rank owners by the number of distinct repositories that appear in the view (top 50).
spark.sql("SELECT repo_author, count(DISTINCT repo_name) as total_repo_name FROM activity GROUP BY repo_author ORDER BY total_repo_name desc").show(50)

+--------------------+---------------+
|         repo_author|total_repo_name|
+--------------------+---------------+
|     direwolf-github|            145|
|           microsoft|            120|
|redhat-appstudio-...|            101|
|          tf-vcs-e2e|             96|
|       released-info|             85|
|          Automattic|             70|
|       gentoo-mirror|             65|
|               YNSTT|             56|
|               Azure|             55|
|              google|             53|
| learn-co-curriculum|             50|
|           openshift|             49|
|                Mu-L|             48|
|              apache|             45|
|                dvsa|             44|
|                Esri|             44|
|               bcgov|             43|
|                 aws|             42|
|        mate-academy|             39|
|          slskopytko|             38|
|             kkpan11|             35|
|         HUISSINSJKN|             34|
|         conda-forge|   

In [132]:
# List the distinct repository names recorded for one specific owner.
spark.sql("SELECT distinct repo.name FROM activity WHERE repo_author = 'SwatGetmann'").show(50)

+--------------------+
|                name|
+--------------------+
|SwatGetmann/ge_cu...|
+--------------------+



In [133]:
# Count events per repository; no ordering is applied, so the rows shown are arbitrary.
df.groupBy("repo.name").count().show()

+--------------------+-----+
|                name|count|
+--------------------+-----+
|PreTeXtBook/prete...|    1|
|Viveksati5143/Edu...|    3|
|mobilehackinglab/...|    1|
|ricardo-campos-or...|    4|
|EkkoG/openwrt-pac...|   64|
|   tony-eneh/nwaeneh|    1|
|         pddemo/demo|   58|
|wantonraven/alx-f...|    8|
|    hapiel/Acrobot-2|    1|
|iamtushar007/DSA-...|    1|
|toolforge/tool-sp...|    1|
|pmmonarrez/challenge|    1|
|anasilverio19/qua...|    2|
|samyakraka2908/Am...|    1|
|NicholaJoe/improv...|    2|
|ProdigyReloaded/d...|    1|
|   openshift/console|    7|
|quenktechnologies...|    3|
|microsoft/TypeScript|   23|
|trycourier/courie...|    1|
+--------------------+-----+
only showing top 20 rows



In [111]:
# Pull a handful of nested fields up next to the event type for a quick look.
df.select(
    "repo.name", "org.id", "org.login", "payload.action", "type"
).show()

+--------------------+---------+--------------------+-------+--------------------+
|                name|       id|               login| action|                type|
+--------------------+---------+--------------------+-------+--------------------+
|heng21/WinMTR-Simple|     null|                null|started|          WatchEvent|
|prawn-test-stagin...|104471395|prawn-test-stagin...|   null|         DeleteEvent|
|status-im/status-...| 11767950|           status-im|created|PullRequestReview...|
|em3ndez/nextrepor...|     null|                null| opened|    PullRequestEvent|
|           gocurr/go|     null|                null|   null|         CreateEvent|
|Floyd87297/Floyd8...|     null|                null|   null|           ForkEvent|
|marcelstoer/nodem...|     null|                null|   null|           PushEvent|
|Chukwu3meka/Socce...|     null|                null| closed|    PullRequestEvent|
|google-test2/sign...|  9579519|        google-test2|   null|         DeleteEvent|
|ser

In [112]:
# Same per-repository event count as the earlier groupBy cell.
# NOTE(review): this cell's execution count (112) is lower than the load cell's
# (126), so the saved output probably comes from a previously loaded archive —
# re-run or remove.
df.groupBy("repo.name").count().show()

+--------------------+-----+
|                name|count|
+--------------------+-----+
|trim21/bangumi-ep...|   24|
|    rust-lang/mdBook|    4|
|vipinms2/where-fo...|    1|
|CodeSystem2022/Pr...|    8|
|Luxiu123/GitAutoC...|    1|
|kerllouskhairy987...|    2|
|        liaogx/quant|    1|
|  gqylpy/gqylpy-dict|    2|
|lukasariel112/aul...|    3|
|rojasricor/expres...|    3|
|     veigasjeff/free|    2|
|AkantMalviya/Web_...|    3|
|   openshift/console|    5|
|   zhzhyi/jax-triton|    1|
|lmorchard/fossilizer|    1|
|   bytedeck/bytedeck|    4|
|microsoft/TypeScript|    8|
|       loxkim/clamav|    1|
|         pddemo/demo|   58|
|spgirard/rock-pap...|    4|
+--------------------+-----+
only showing top 20 rows



In [43]:
# Print the DataFrame's schema as a tree (nested structs are expanded).
df.printSchema()

root
 |-- actor: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- display_login: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- org: struct (nullable = true)
 |    |-- avatar_url: string (nullable = true)
 |    |-- gravatar_id: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- login: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- payload: struct (nullable = true)
 |    |-- action: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- comment: struct (nullable = true)
 |    |    |-- _links: struct (nullable = true)
 |    |    |    |-- html: struct (nullable = true)
 |    |    |    |    |-- href: string (nullable = true)
 |    |    |    |-- pull_request: struct (nul