In [0]:
pip install azure.storage.blob

Python interpreter will be restarted.
Collecting azure.storage.blob
  Downloading azure_storage_blob-12.16.0-py3-none-any.whl (387 kB)
Collecting isodate>=0.6.1
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Collecting azure-core<2.0.0,>=1.26.0
  Downloading azure_core-1.27.1-py3-none-any.whl (174 kB)
Collecting typing-extensions>=4.0.1
  Downloading typing_extensions-4.6.3-py3-none-any.whl (31 kB)
Installing collected packages: typing-extensions, isodate, azure-core, azure.storage.blob
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 4.1.1
    Not uninstalling typing-extensions at /databricks/python3/lib/python3.9/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-014eb887-9638-458f-9731-e828cc15af79
    Can't uninstall 'typing-extensions'. No files were found to uninstall.
Successfully installed azure-core-1.27.1 azure.storage.blob-12.16.0 isodate-0.6.1 typing-extensions-4.6.3
Python interpreter will be 

In [0]:
import glob
import gzip
import json
import os
import shutil
from pathlib import Path

import pandas as pd
import requests
from azure.core.pipeline.transport import HttpResponse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, coalesce, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType



In [0]:

def get_data(date, time, timeout=60):
    """Download one hourly GH Archive dump into the current working directory.

    The archive is fetched from
    ``https://data.gharchive.org/<date>-<time>.json.gz`` and written to
    ``<date>-<time>.json.gz`` (relative path, so it lands in the process's
    current working directory).

    Args:
        date: Date part of the archive name, e.g. ``"2023-05-01"``.
        time: Hour part of the archive name, e.g. ``0``; it is appended with
            ``str()`` and therefore not zero-padded.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the notebook forever.

    Returns:
        True if the server answered 200 and the file was written, False
        otherwise. Network-level errors raised by ``requests`` propagate.
    """
    filename = str(date) + '-' + str(time)
    url = "https://data.gharchive.org/" + filename + '.json.gz'
    print(url)

    # stream=True avoids holding the whole compressed archive in memory;
    # the context manager releases the connection when we are done.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print("Error downloading the file.")
            return False

        with open(filename + '.json.gz', "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)

    print("File downloaded successfully.")
    return True

In [0]:
def unzip_jsongz(filename, directory='/databricks/driver/'):
    """Decompress ``<directory>/<filename>.json.gz`` to ``<directory>/<filename>.json``.

    Args:
        filename: Base name of the archive, without the ``.json.gz`` suffix.
        directory: Folder that holds the archive and receives the output.
            Defaults to the driver directory the notebook has always used.

    Returns:
        The path of the decompressed ``.json`` file.

    Raises:
        FileNotFoundError: If the ``.json.gz`` input does not exist.
    """
    input_file = os.path.join(directory, filename + '.json.gz')
    output_file = os.path.join(directory, filename + '.json')
    with gzip.open(input_file, 'rb') as gz_file:
        with open(output_file, 'wb') as out_file:
            # Copy in chunks instead of gz_file.read(), which would load the
            # entire decompressed dump into memory at once.
            shutil.copyfileobj(gz_file, out_file)
    return output_file

In [0]:
def flatten_df(df):
    """Project an events DataFrame onto a fixed set of string payload columns.

    The result contains ``type``, ``public`` and ``created_at`` unchanged,
    followed by one column per entry in the payload field list below. Each
    payload column is cast to string and keeps its dotted path (for example
    ``payload.action``) as its literal column name.

    A payload field that is absent from ``df``'s schema is emitted as a null
    string column instead of raising, so frames whose inferred schemas differ
    still end up with the same column layout.

    Args:
        df: DataFrame with ``type``, ``public``, ``created_at`` columns and,
            normally, a struct column named ``payload``.

    Returns:
        The flattened DataFrame.
    """
    select_exprs = [
        col("type"),
        col("public"),
        col("created_at")
    ]
    payload_fields = [
        "action",
        "before",
        "comment",
        "commits",
        "description",
        "distinct_size",
        "forkee",
        "head",
        "issue",
        "master_branch",
        "member",
        "number",
        "release",
        "repository_id",
        "review",
        "pages",
        "pull_request",
        "push_id",
        "pusher_type",
        "ref",
        "size",
        "ref_type"
    ]

    # df.columns only lists top-level names, so nested fields have to be
    # looked up on the payload struct type itself.
    available_fields = set()
    if "payload" in df.columns:
        payload_type = df.schema["payload"].dataType
        if isinstance(payload_type, StructType):
            available_fields = set(payload_type.fieldNames())

    for field in payload_fields:
        column = "payload." + field
        if field in available_fields:
            select_expr = col(column).cast("string")
        else:
            # Keep the column so the output layout is stable across inputs.
            select_expr = lit(None).cast("string")
        select_exprs.append(select_expr.alias(column))

    flat_df = df.select(select_exprs)
    return flat_df

In [0]:
def flatten_issues_df(issues_df):
    """Select the issue-related fields of an events DataFrame as flat columns.

    ``type``, ``public`` and ``created_at`` are passed through; every nested
    field listed below is selected and aliased to its own dotted path, so the
    output column is literally named e.g. ``payload.issue.url``.

    Args:
        issues_df: DataFrame exposing the nested ``payload.issue`` fields
            listed below.

    Returns:
        The flattened DataFrame.
    """
    passthrough_fields = ("type", "public", "created_at")
    # payload.issue.milestone, .assignee, .assignees and
    # .performed_via_github_app are not selected.
    nested_fields = (
        "payload.action",
        "payload.issue.url",
        "payload.issue.number",
        "payload.issue.title",
        "payload.issue.user.login",
        "payload.issue.user.type",
        "payload.issue.user.site_admin",
        "payload.issue.labels",
        "payload.issue.state",
        "payload.issue.locked",
        "payload.issue.comments",
        "payload.issue.created_at",
        "payload.issue.updated_at",
        "payload.issue.closed_at",
        "payload.issue.author_association",
        "payload.issue.active_lock_reason",
        "payload.issue.body",
        "payload.issue.state_reason",
    )
    projection = [col(name) for name in passthrough_fields]
    projection += [col(path).alias(path) for path in nested_fields]
    return issues_df.select(*projection)

In [0]:
from pyspark.sql.functions import col, coalesce, lit

def flatten_push_df(push_df):
    """Select the push-related fields of an events DataFrame.

    ``type``, ``public``, ``created_at``, ``org``, ``actor`` and ``repo`` are
    passed through unchanged. Each payload field listed below is selected
    without any cast and aliased to its dotted path, so the output column is
    literally named e.g. ``payload.push_id``.

    Args:
        push_df: DataFrame exposing the top-level and ``payload`` fields
            listed below.

    Returns:
        The projected DataFrame.
    """
    passthrough_fields = ("type", "public", "created_at", "org", "actor", "repo")
    payload_fields = (
        "payload.action",
        "payload.before",
        "payload.commits",
        "payload.repository_id",
        "payload.review",
        "payload.pages",
        "payload.push_id",
        "payload.pusher_type",
        "payload.ref",
        "payload.size",
        "payload.ref_type",
    )
    projection = [col(name) for name in passthrough_fields]
    projection.extend(col(path).alias(path) for path in payload_fields)
    return push_df.select(projection)

In [0]:
def upload_to_blob_storage( connection_string, container_name, file_path, blob_name, overwrite=False):
    """Upload a local file to an Azure Blob Storage container.

    Args:
        connection_string: Azure Storage connection string.
        container_name: Name of the target container.
        file_path: Path of the local file to upload.
        blob_name: Name the blob gets inside the container.
        overwrite: Forwarded to ``upload_blob``. With the default ``False``
            the upload fails if a blob with this name already exists; pass
            ``True`` to replace it (useful when re-running the notebook).
    """
    # The context manager closes the client's underlying transport once the
    # upload has finished.
    with BlobServiceClient.from_connection_string(connection_string) as blob_service_client:
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

        with open(file_path, "rb") as data:
            blob_client.upload_blob(data, overwrite=overwrite)
     

In [0]:
# Obtain a SparkSession via the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()
# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

In [0]:
# Scratch folder for the per-day parquet output: the same location as a Spark
# "file://" URI and as a plain filesystem path.
cache_folder_path_spark = "file:///databricks/driver/cache"
cache_folder_path = "/databricks/driver/cache"
# Folder where the downloaded .json.gz / .json files are looked up and cleaned.
folder_path = "/databricks/driver/"

# SECURITY: the storage account key must never live in the notebook source.
# Read the connection string from the environment instead (for example a
# cluster environment variable backed by a secret scope). The key that was
# previously hard-coded here should be treated as leaked and rotated.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    # Fail before any download starts rather than at the first upload.
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; "
        "provide the Azure Storage connection string via the environment."
    )
container_name = "pod4-project-blob-container"

# For every day of May 2023: download the 24 hourly dumps, keep only the
# PushEvent rows, flatten them, write the day as a single parquet file into
# the scratch folder, upload that file to blob storage, then delete the
# downloaded JSON files.
for day in range(1,32):
    day_str = "{:02}".format(day)
    date_str = "2023-05-" + day_str
    main_df = None
    parquet_filename = None
    for hour in range(0,24):
        filename = date_str + "-" + str(hour)
        get_data(date_str, hour)
        unzip_jsongz(filename)
        df = spark.read.json("file:///databricks/driver/"+filename+".json")
        push_df = df.filter(df.type == "PushEvent")
        # Flatten the filtered frame; passing `df` here would export every
        # event type and leave the PushEvent filter without effect.
        flat_df = flatten_push_df(push_df)
        main_df = flat_df if main_df is None else main_df.union(flat_df)

    # Start from an empty scratch folder; the parquet writer refuses to write
    # into an existing path by default.
    if os.path.exists(cache_folder_path):
        shutil.rmtree(cache_folder_path)
        print("cache cleared")
    main_df.coalesce(1).write.parquet(cache_folder_path_spark)

    # coalesce(1) is expected to leave a single part file in the folder.
    parquet_files = sorted(Path(cache_folder_path).glob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError(
            "No parquet file was written to {}".format(cache_folder_path)
        )
    parquet_filename = parquet_files[0]
    print(parquet_filename)

    blob_name = date_str+".snappy.parquet"
    upload_to_blob_storage( connection_string, container_name, parquet_filename, blob_name)
    print("uploaded {} to blob storage".format(blob_name))

    # Remove the day's downloads (.json and .json.gz) to free driver disk.
    json_files = glob.glob(os.path.join(folder_path, "*.json*"))
    for file_path in json_files:
        os.remove(file_path)
        print("File '{}' removed.".format(file_path))
    

https://data.gharchive.org/2023-05-01-0.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-1.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-2.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-3.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-4.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-5.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-6.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-7.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-8.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-9.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-10.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-11.json.gz
File downloaded successfully.
https://data.gharchive.org/2023-05-01-12.json.gz
File download

In [0]:
# Download hours 0-3 of 2023-06-22, flatten each hour with flatten_df and
# accumulate the rows in main_df, printing the running row count per hour.
main_df = None
for hour in range(0,4):
    filename = "2023-06-22-" + str(hour)
    get_data("2023-06-22", hour)
    unzip_jsongz(filename)
    df = spark.read.json("file:///databricks/driver/"+filename+".json")
    flat_df = flatten_df(df)
    # The first hour seeds the accumulator; later hours are appended to it.
    main_df = flat_df if main_df is None else main_df.unionAll(flat_df)
    print(main_df.count())

In [0]:
ls

2023-05-01-0.json     2023-05-02-0.json     [0m[01;34mazure[0m/
2023-05-01-0.json.gz  2023-05-02-0.json.gz  [01;34mcache[0m/
2023-05-01-1.json     2023-05-02-1.json     [01;34mconf[0m/
2023-05-01-1.json.gz  2023-05-02-1.json.gz  [01;34meventlogs[0m/
2023-05-01-2.json     2023-05-02-2.json     [01;34mganglia[0m/
2023-05-01-2.json.gz  2023-05-02-2.json.gz  [01;32mhadoop_accessed_config.lst[0m*
2023-05-01-3.json     2023-05-02-3.json     [01;34mlogs[0m/
2023-05-01-3.json.gz  2023-05-02-3.json.gz  [01;32mpreload_class.lst[0m*


In [0]:
# Dump the schema of df's payload column, in Spark's compact simpleString
# form, to a text file. NOTE(review): `df` is whatever frame the last loop
# iteration of a previously run cell left behind.
output_file = "schema.txt"
with open(output_file, "w") as file:
    payload_schema = df.select(col("payload")).schema
    file.write(payload_schema.simpleString())

In [0]:
# Download hours 10-11 of 2023-06-22, keep only the IssuesEvent rows, flatten
# them with flatten_issues_df and accumulate them in main_df, printing the
# running row count per hour.
main_df = None
for hour in range(10,12):
    filename = "2023-06-22-" + str(hour)
    get_data("2023-06-22", hour)
    unzip_jsongz(filename)
    df = spark.read.json("file:///databricks/driver/"+filename+".json")
    issues_df = df.filter(df.type == "IssuesEvent")
    issues_flat_df = flatten_issues_df(issues_df)
    # The first hour seeds the accumulator; later hours are appended to it.
    main_df = issues_flat_df if main_df is None else main_df.unionAll(issues_flat_df)
    print(main_df.count())

In [0]:
# Reduce main_df to one partition and write it as parquet to a file:// path
# under /databricks/driver. NOTE(review): no write mode is set, so this
# presumably fails if the target path already exists — confirm before re-running.
main_df.coalesce(1).write.parquet("file:///databricks/driver/cache.parquet")