### Config

In [0]:
# Ensure the pipeline is idempotent: with partitionOverwriteMode set to
# "dynamic", an overwrite write replaces data at the partition level only
# (the partitions being written) rather than the whole target.
# NOTE(review): `spark` is not defined in the visible code; presumably it is
# the SparkSession provided by the notebook runtime.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

### Imports

In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.functions import coalesce, lit, row_number, col
from pyspark.sql.window import Window
from functools import reduce
from pyspark.sql.types import *

### Globals

In [0]:
# Target file size, expressed in megabytes and then in bytes.
File_Size = 128
File_Bytes = File_Size * 1024 ** 2

# Base abfss:// URI of the "gharchive-silver" storage location.
Base_Silver = "abfss://team1-project2@20251124eyproject2.dfs.core.windows.net/gharchive-silver"

# NOTE(review): usage of the three values below is not visible in this part
# of the notebook; presumably Write_Parts is an output partition count,
# Chunk_Size a per-batch input size, and first_batch a first-iteration flag.
# Confirm against the code that reads them.
Write_Parts, Chunk_Size = 96, 288
first_batch = True

# Disabled write-tuning settings, kept for reference:
# # spark.conf.set("spark.sql.files.maxRecordsPerFile", 6_000_000)
# spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
# spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")

### Schemas

In [0]:
# Schema for user DataFrames: id, login, site_admin and type, all nullable.
user_df_schema = StructType(
    [
        StructField("id", LongType(), nullable=True),
        StructField("login", StringType(), nullable=True),
        StructField("site_admin", BooleanType(), nullable=True),
        StructField("type", StringType(), nullable=True),
    ]
)

# Explicit schema for the raw event records. Top-level fields: actor,
# created_at, id, org, payload, public, repo and type. `payload` nests the
# event-specific structures (action, comment, commits, forkee, issue, member,
# pages, pull_request, release, ...). Every field is declared nullable, and
# timestamp-like fields such as created_at are kept as strings.
# NOTE(review): the literal is wrapped mid-expression across physical lines;
# this is valid only because it sits inside the open StructType([...]) call.
raw_data_schema = StructType([StructField('actor', StructType([StructField('avatar_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('url', StringType(), True)]), True), StructField('created_at', StringType(), True), StructField('id', StringType(), True), StructField('org', StructType([StructField('avatar_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('url', StringType(), True)]), True), StructField('payload', StructType([StructField('action', StringType(), True), StructField('before', StringType(), True), StructField('comment', StructType([StructField('_links', StructType([StructField('html', StructType([StructField('href', StringType(), True)]), True), StructField('pull_request', StructType([StructField('href', StringType(), True)]), True), StructField('self', StructType([StructField('href', StringType(), True)]), True)]), True), StructField('body', StringType(), True), StructField('commit_id', StringType(), True), StructField('created_at', StringType(), True), StructField('diff_hunk', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('issue_url', StringType(), True), StructField('line', LongType(), True), StructField('original_commit_id', StringType(), True), StructField('original_position', LongType(), True), StructField('path', StringType(), True), StructField('position', LongType(), True), StructField('pull_request_url', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('user', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), 
StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True)]), True), StructField('commits', ArrayType(StructType([StructField('author', StructType([StructField('email', StringType(), True), StructField('name', StringType(), True)]), True), StructField('distinct', BooleanType(), True), StructField('message', StringType(), True), StructField('sha', StringType(), True), StructField('url', StringType(), True)]), True), True), StructField('description', StringType(), True), StructField('distinct_size', LongType(), True), StructField('forkee', StructType([StructField('archive_url', StringType(), True), StructField('assignees_url', StringType(), True), StructField('blobs_url', StringType(), True), StructField('branches_url', StringType(), True), StructField('clone_url', StringType(), True), StructField('collaborators_url', StringType(), True), StructField('comments_url', StringType(), True), StructField('commits_url', StringType(), True), StructField('compare_url', StringType(), True), StructField('contents_url', StringType(), True), StructField('contributors_url', StringType(), True), StructField('created_at', StringType(), True), StructField('default_branch', StringType(), True), StructField('description', StringType(), True), StructField('downloads_url', StringType(), True), StructField('events_url', StringType(), True), StructField('fork', BooleanType(), True), StructField('forks', LongType(), True), 
StructField('forks_count', LongType(), True), StructField('forks_url', StringType(), True), StructField('full_name', StringType(), True), StructField('git_commits_url', StringType(), True), StructField('git_refs_url', StringType(), True), StructField('git_tags_url', StringType(), True), StructField('git_url', StringType(), True), StructField('has_downloads', BooleanType(), True), StructField('has_issues', BooleanType(), True), StructField('has_pages', BooleanType(), True), StructField('has_wiki', BooleanType(), True), StructField('homepage', StringType(), True), StructField('hooks_url', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('issue_comment_url', StringType(), True), StructField('issue_events_url', StringType(), True), StructField('issues_url', StringType(), True), StructField('keys_url', StringType(), True), StructField('labels_url', StringType(), True), StructField('language', StringType(), True), StructField('languages_url', StringType(), True), StructField('merges_url', StringType(), True), StructField('milestones_url', StringType(), True), StructField('mirror_url', StringType(), True), StructField('name', StringType(), True), StructField('notifications_url', StringType(), True), StructField('open_issues', LongType(), True), StructField('open_issues_count', LongType(), True), StructField('owner', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), 
StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('private', BooleanType(), True), StructField('public', BooleanType(), True), StructField('pulls_url', StringType(), True), StructField('pushed_at', StringType(), True), StructField('releases_url', StringType(), True), StructField('size', LongType(), True), StructField('ssh_url', StringType(), True), StructField('stargazers_count', LongType(), True), StructField('stargazers_url', StringType(), True), StructField('statuses_url', StringType(), True), StructField('subscribers_url', StringType(), True), StructField('subscription_url', StringType(), True), StructField('svn_url', StringType(), True), StructField('tags_url', StringType(), True), StructField('teams_url', StringType(), True), StructField('trees_url', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('watchers', LongType(), True), StructField('watchers_count', LongType(), True)]), True), StructField('head', StringType(), True), StructField('issue', StructType([StructField('assignee', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), 
True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('body', StringType(), True), StructField('closed_at', StringType(), True), StructField('comments', LongType(), True), StructField('comments_url', StringType(), True), StructField('created_at', StringType(), True), StructField('events_url', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('labels', ArrayType(StructType([StructField('color', StringType(), True), StructField('name', StringType(), True), StructField('url', StringType(), True)]), True), True), StructField('labels_url', StringType(), True), StructField('locked', BooleanType(), True), StructField('milestone', StructType([StructField('closed_at', StringType(), True), StructField('closed_issues', LongType(), True), StructField('created_at', StringType(), True), StructField('creator', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('description', StringType(), True), StructField('due_on', StringType(), True), StructField('id', LongType(), True), StructField('labels_url', StringType(), True), StructField('number', LongType(), True), StructField('open_issues', 
LongType(), True), StructField('state', StringType(), True), StructField('title', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True)]), True), StructField('number', LongType(), True), StructField('pull_request', StructType([StructField('diff_url', StringType(), True), StructField('html_url', StringType(), True), StructField('patch_url', StringType(), True), StructField('url', StringType(), True)]), True), StructField('state', StringType(), True), StructField('title', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('user', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True)]), True), StructField('master_branch', StringType(), True), StructField('member', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), 
StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('number', LongType(), True), StructField('pages', ArrayType(StructType([StructField('action', StringType(), True), StructField('html_url', StringType(), True), StructField('page_name', StringType(), True), StructField('sha', StringType(), True), StructField('summary', StringType(), True), StructField('title', StringType(), True)]), True), True), StructField('pull_request', StructType([StructField('_links', StructType([StructField('comments', StructType([StructField('href', StringType(), True)]), True), StructField('commits', StructType([StructField('href', StringType(), True)]), True), StructField('html', StructType([StructField('href', StringType(), True)]), True), StructField('issue', StructType([StructField('href', StringType(), True)]), True), StructField('review_comment', StructType([StructField('href', StringType(), True)]), True), StructField('review_comments', StructType([StructField('href', StringType(), True)]), True), StructField('self', StructType([StructField('href', StringType(), True)]), True), StructField('statuses', StructType([StructField('href', StringType(), True)]), True)]), True), StructField('additions', LongType(), True), StructField('assignee', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', 
StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('base', StructType([StructField('label', StringType(), True), StructField('ref', StringType(), True), StructField('repo', StructType([StructField('archive_url', StringType(), True), StructField('assignees_url', StringType(), True), StructField('blobs_url', StringType(), True), StructField('branches_url', StringType(), True), StructField('clone_url', StringType(), True), StructField('collaborators_url', StringType(), True), StructField('comments_url', StringType(), True), StructField('commits_url', StringType(), True), StructField('compare_url', StringType(), True), StructField('contents_url', StringType(), True), StructField('contributors_url', StringType(), True), StructField('created_at', StringType(), True), StructField('default_branch', StringType(), True), StructField('description', StringType(), True), StructField('downloads_url', StringType(), True), StructField('events_url', StringType(), True), StructField('fork', BooleanType(), True), StructField('forks', LongType(), True), StructField('forks_count', LongType(), True), StructField('forks_url', StringType(), True), StructField('full_name', StringType(), True), StructField('git_commits_url', StringType(), True), StructField('git_refs_url', StringType(), True), StructField('git_tags_url', StringType(), True), StructField('git_url', StringType(), True), StructField('has_downloads', BooleanType(), True), StructField('has_issues', BooleanType(), True), StructField('has_pages', BooleanType(), True), StructField('has_wiki', BooleanType(), True), StructField('homepage', StringType(), True), 
StructField('hooks_url', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('issue_comment_url', StringType(), True), StructField('issue_events_url', StringType(), True), StructField('issues_url', StringType(), True), StructField('keys_url', StringType(), True), StructField('labels_url', StringType(), True), StructField('language', StringType(), True), StructField('languages_url', StringType(), True), StructField('merges_url', StringType(), True), StructField('milestones_url', StringType(), True), StructField('mirror_url', StringType(), True), StructField('name', StringType(), True), StructField('notifications_url', StringType(), True), StructField('open_issues', LongType(), True), StructField('open_issues_count', LongType(), True), StructField('owner', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('private', BooleanType(), True), StructField('pulls_url', StringType(), True), StructField('pushed_at', StringType(), True), StructField('releases_url', StringType(), True), StructField('size', LongType(), True), StructField('ssh_url', StringType(), True), StructField('stargazers_count', LongType(), True), StructField('stargazers_url', 
StringType(), True), StructField('statuses_url', StringType(), True), StructField('subscribers_url', StringType(), True), StructField('subscription_url', StringType(), True), StructField('svn_url', StringType(), True), StructField('tags_url', StringType(), True), StructField('teams_url', StringType(), True), StructField('trees_url', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('watchers', LongType(), True), StructField('watchers_count', LongType(), True)]), True), StructField('sha', StringType(), True), StructField('user', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True)]), True), StructField('body', StringType(), True), StructField('changed_files', LongType(), True), StructField('closed_at', StringType(), True), StructField('comments', LongType(), True), StructField('comments_url', StringType(), True), StructField('commits', LongType(), True), StructField('commits_url', StringType(), True), StructField('created_at', StringType(), True), StructField('deletions', LongType(), True), StructField('diff_url', StringType(), True), StructField('head', StructType([StructField('label', StringType(), True), StructField('ref', StringType(), True), 
StructField('repo', StructType([StructField('archive_url', StringType(), True), StructField('assignees_url', StringType(), True), StructField('blobs_url', StringType(), True), StructField('branches_url', StringType(), True), StructField('clone_url', StringType(), True), StructField('collaborators_url', StringType(), True), StructField('comments_url', StringType(), True), StructField('commits_url', StringType(), True), StructField('compare_url', StringType(), True), StructField('contents_url', StringType(), True), StructField('contributors_url', StringType(), True), StructField('created_at', StringType(), True), StructField('default_branch', StringType(), True), StructField('description', StringType(), True), StructField('downloads_url', StringType(), True), StructField('events_url', StringType(), True), StructField('fork', BooleanType(), True), StructField('forks', LongType(), True), StructField('forks_count', LongType(), True), StructField('forks_url', StringType(), True), StructField('full_name', StringType(), True), StructField('git_commits_url', StringType(), True), StructField('git_refs_url', StringType(), True), StructField('git_tags_url', StringType(), True), StructField('git_url', StringType(), True), StructField('has_downloads', BooleanType(), True), StructField('has_issues', BooleanType(), True), StructField('has_pages', BooleanType(), True), StructField('has_wiki', BooleanType(), True), StructField('homepage', StringType(), True), StructField('hooks_url', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('issue_comment_url', StringType(), True), StructField('issue_events_url', StringType(), True), StructField('issues_url', StringType(), True), StructField('keys_url', StringType(), True), StructField('labels_url', StringType(), True), StructField('language', StringType(), True), StructField('languages_url', StringType(), True), StructField('merges_url', StringType(), True), 
StructField('milestones_url', StringType(), True), StructField('mirror_url', StringType(), True), StructField('name', StringType(), True), StructField('notifications_url', StringType(), True), StructField('open_issues', LongType(), True), StructField('open_issues_count', LongType(), True), StructField('owner', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('private', BooleanType(), True), StructField('pulls_url', StringType(), True), StructField('pushed_at', StringType(), True), StructField('releases_url', StringType(), True), StructField('size', LongType(), True), StructField('ssh_url', StringType(), True), StructField('stargazers_count', LongType(), True), StructField('stargazers_url', StringType(), True), StructField('statuses_url', StringType(), True), StructField('subscribers_url', StringType(), True), StructField('subscription_url', StringType(), True), StructField('svn_url', StringType(), True), StructField('tags_url', StringType(), True), StructField('teams_url', StringType(), True), StructField('trees_url', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('watchers', LongType(), True), StructField('watchers_count', 
LongType(), True)]), True), StructField('sha', StringType(), True), StructField('user', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True)]), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('issue_url', StringType(), True), StructField('locked', BooleanType(), True), StructField('merge_commit_sha', StringType(), True), StructField('mergeable', BooleanType(), True), StructField('mergeable_state', StringType(), True), StructField('merged', BooleanType(), True), StructField('merged_at', StringType(), True), StructField('merged_by', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), 
StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('milestone', StructType([StructField('closed_at', StringType(), True), StructField('closed_issues', LongType(), True), StructField('created_at', StringType(), True), StructField('creator', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('description', StringType(), True), StructField('due_on', StringType(), True), StructField('id', LongType(), True), StructField('labels_url', StringType(), True), StructField('number', LongType(), True), StructField('open_issues', LongType(), True), StructField('state', StringType(), True), StructField('title', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True)]), True), StructField('number', LongType(), True), StructField('patch_url', StringType(), True), StructField('review_comment_url', StringType(), True), StructField('review_comments', LongType(), True), StructField('review_comments_url', StringType(), True), StructField('state', StringType(), True), StructField('statuses_url', StringType(), True), 
StructField('title', StringType(), True), StructField('updated_at', StringType(), True), StructField('url', StringType(), True), StructField('user', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True)]), True), StructField('push_id', LongType(), True), StructField('pusher_type', StringType(), True), StructField('ref', StringType(), True), StructField('ref_type', StringType(), True), StructField('release', StructType([StructField('assets', ArrayType(StructType([StructField('browser_download_url', StringType(), True), StructField('content_type', StringType(), True), StructField('created_at', StringType(), True), StructField('download_count', LongType(), True), StructField('id', LongType(), True), StructField('label', StringType(), True), StructField('name', StringType(), True), StructField('size', LongType(), True), StructField('state', StringType(), True), StructField('updated_at', StringType(), True), StructField('uploader', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), 
True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('url', StringType(), True)]), True), True), StructField('assets_url', StringType(), True), StructField('author', StructType([StructField('avatar_url', StringType(), True), StructField('events_url', StringType(), True), StructField('followers_url', StringType(), True), StructField('following_url', StringType(), True), StructField('gists_url', StringType(), True), StructField('gravatar_id', StringType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('login', StringType(), True), StructField('organizations_url', StringType(), True), StructField('received_events_url', StringType(), True), StructField('repos_url', StringType(), True), StructField('site_admin', BooleanType(), True), StructField('starred_url', StringType(), True), StructField('subscriptions_url', StringType(), True), StructField('type', StringType(), True), StructField('url', StringType(), True)]), True), StructField('body', StringType(), True), StructField('created_at', StringType(), True), StructField('draft', BooleanType(), True), StructField('html_url', StringType(), True), StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('prerelease', BooleanType(), True), StructField('published_at', StringType(), True), StructField('tag_name', StringType(), True), StructField('tarball_url', StringType(), True), StructField('target_commitish', StringType(), True), StructField('upload_url', 
StringType(), True), StructField('url', StringType(), True), StructField('zipball_url', StringType(), True)]), True), StructField('size', LongType(), True)]), True), StructField('public', BooleanType(), True), StructField('repo', StructType([StructField('id', LongType(), True), StructField('name', StringType(), True), StructField('url', StringType(), True)]), True), StructField('type', StringType(), True)])


### Methods

In [0]:
def _select_user_fields(df, struct_path):
    """Project the user struct found at ``struct_path`` onto the actor table columns."""
    return df.select(
        col(f"{struct_path}.id").cast("bigint").alias("id"),
        col(f"{struct_path}.login").alias("login"),
        col(f"{struct_path}.type").alias("type"),
        col(f"{struct_path}.site_admin").cast("boolean").alias("site_admin"),
    )


def _select_repo_fields(df, struct_path):
    """Project the repository struct found at ``struct_path`` onto the repo table columns."""
    return df.select(
        col(f"{struct_path}.id").alias("repo_id"),
        col(f"{struct_path}.full_name").alias("repo_name"),
        col(f"{struct_path}.owner.id").alias("owner_id"),
        col(f"{struct_path}.owner.login").alias("owner_login"),
        col(f"{struct_path}.created_at").alias("created_at"),
        col(f"{struct_path}.default_branch").alias("default_branch"),
        col(f"{struct_path}.fork").alias("is_fork"),
        col(f"{struct_path}.forks").alias("forks_count"),
        col(f"{struct_path}.open_issues").alias("open_issues"),
        col(f"{struct_path}.private").alias("private"),
        col(f"{struct_path}.size").alias("size"),
        col(f"{struct_path}.stargazers_count").alias("stargazers_count"),
    )


def extract_non_event_tables(df):
    """
    Extracts the tables shared by every event type from one batch of raw events.

    Args:
        df: Raw events DataFrame; the code reads the ``actor``, ``org``, ``repo``,
            ``payload``, ``id``, ``type``, ``created_at`` and ``public`` columns.

    Returns:
        A tuple ``(actor_df, org_df, repo_df, event_df)``:
        - actor_df: one row per user ``id`` (columns id, login, type, site_admin),
          gathered from the top-level actor and the user structs nested in the payload.
        - org_df: one row per ``org_id`` (columns org_id, login).
        - repo_df: one row per ``repo_id``, gathered from the top-level repo, the
          forkee and the pull-request base/head repos.
        - event_df: one row per ``event_id`` with the event type, timestamp,
          public flag and the actor/org/repo ids.
    """
    # --- 1. Actor table ---
    # The top-level actor struct is not read for `type` / `site_admin` here, so
    # both are emitted as explicitly typed nulls to line up with the nested sources.
    main_actor_df = df.select(
        col("actor.id").cast("bigint").alias("id"),
        col("actor.login").alias("login"),
        F.lit(None).cast("string").alias("type"),
        F.lit(None).cast("boolean").alias("site_admin"),
    )

    # Every payload location that holds a full user struct.
    nested_user_paths = [
        "payload.issue.assignee",
        "payload.pull_request.assignee",
        "payload.member",
        "payload.comment.user",
        "payload.forkee.owner",
        "payload.pull_request.head.repo.owner",
    ]
    actor_sources = [main_actor_df] + [
        _select_user_fields(df, path) for path in nested_user_paths
    ]

    # Union all actor sources; rows without an id cannot be keyed, so drop them
    # before ranking.
    actor_df = (
        reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), actor_sources)
        .filter(col("id").isNotNull())
    )

    # 1.5. Deduplication: keep one row per id, preferring rows where `type`
    # (then `site_admin`) is populated.
    actor_window = Window.partitionBy("id").orderBy(
        col("type").isNotNull().desc_nulls_last(),
        col("site_admin").isNotNull().desc_nulls_last(),
    )
    actor_df = (
        actor_df.withColumn("row_num", row_number().over(actor_window))
        .filter(col("row_num") == 1)
        .drop("row_num")
    )

    # --- 2. Org table ---
    org_df = (
        df.selectExpr("org.id as org_id", "org.login")
        .dropna(how="any", subset=["org_id"])
        .dropDuplicates(["org_id"])
    )

    # --- 3. Repo table ---
    # The top-level repo struct only contributes id and name; the remaining
    # columns are filled with nulls by unionByName(allowMissingColumns=True).
    repo_base_df = df.select(
        col("repo.id").alias("repo_id"),
        col("repo.name").alias("repo_name"),
    )
    nested_repo_paths = [
        "payload.forkee",
        "payload.pull_request.base.repo",
        "payload.pull_request.head.repo",
    ]
    repo_sources = [repo_base_df] + [
        _select_repo_fields(df, path) for path in nested_repo_paths
    ]

    # Rows with no repo id are removed before the window so they are not ranked
    # only to be thrown away afterwards.
    repo_df = (
        reduce(lambda a, b: a.unionByName(b, allowMissingColumns=True), repo_sources)
        .filter(col("repo_id").isNotNull())
    )

    # 3.5. Deduplication: keep one row per repo_id, preferring rows that carry
    # an owner id.
    repo_window = Window.partitionBy("repo_id").orderBy(
        col("owner_id").isNotNull().desc_nulls_last()
    )
    repo_df = (
        repo_df.withColumn("row_num", row_number().over(repo_window))
        .filter(col("row_num") == 1)
        .drop("row_num")
    )

    # --- 4. Main event fact table ---
    event_df = (
        df.select(
            F.col("id").cast("bigint").alias("event_id"),
            F.col("type").alias("event_type"),
            F.col("created_at"),
            F.col("public"),
            F.col("actor.id").cast("bigint").alias("actor_id"),
            F.col("org.id").cast("bigint").alias("org_id"),
            F.col("repo.id").cast("bigint").alias("repo_id"),
        )
        .dropDuplicates(["event_id"])
    )

    return actor_df, org_df, repo_df, event_df

In [0]:
def estimate_avg_row_size_bytes(df, sample_rows=20):
    """
    Summarise the JSON-encoded size of a small sample of rows.

    Each of the first ``sample_rows`` rows (as returned by ``df.limit``) is
    serialised to JSON with all of its columns, encoded as UTF-8 and measured.

    Args:
        df: DataFrame to sample.
        sample_rows: Maximum number of rows to measure.

    Returns:
        A single Row with the fields ``n`` (rows measured), ``avg_bytes``,
        ``min_bytes`` and ``max_bytes``.
    """
    whole_row = F.struct(*[F.col(name) for name in df.columns])
    encoded_row = F.encode(F.to_json(whole_row), "UTF-8")

    sampled_sizes = df.limit(sample_rows).select(
        F.length(encoded_row).alias("row_bytes")
    )

    summary = sampled_sizes.agg(
        F.count("*").alias("n"),
        F.avg("row_bytes").alias("avg_bytes"),
        F.min("row_bytes").alias("min_bytes"),
        F.max("row_bytes").alias("max_bytes"),
    )
    return summary.collect()[0]

In [0]:
def write_partition_by_month(
    event_data: DataFrame,
    base_silver_path: str,
    created_at_col: str = "created_at",
    mode: str = "overwrite",
    write_parts: int = 96,
    max_records_per_file: int = 12000000,
):
    """
    Write the event fact table as parquet, partitioned by event month.

    An ``event_year_month`` column ("yyyy-MM") is derived from ``created_at_col``
    and used as the partition column under ``<base_silver_path>/event_data``.

    Args:
        event_data: Event rows to write.
        base_silver_path: Root folder of the silver layer.
        created_at_col: Column the month label is formatted from.
        mode: Spark save mode passed to the writer.
        write_parts: Accepted for call-site symmetry; not used by this function.
        max_records_per_file: Value for the ``maxRecordsPerFile`` writer option.
    """
    partition_col = "event_year_month"
    month_label = F.date_format(F.col(created_at_col), "yyyy-MM")
    partitioned = event_data.withColumn(partition_col, month_label)

    writer = (
        partitioned.write
        .mode(mode)
        .partitionBy(partition_col)
        .option("maxRecordsPerFile", max_records_per_file)
    )
    writer.parquet(f"{base_silver_path}/event_data")


In [0]:
def write_table(
    df: DataFrame,
    base_silver_path: str,
    table_name: str,
    ingest_year_month: str,
    mode: str = "overwrite",
    write_parts: int = 96,
    max_records_per_file: int = 12000000,
):
    """
    Write one table as parquet into its month folder.

    The target is
    ``<base_silver_path>/<table_name>/event_year_month=<ingest_year_month>``.
    The ``actor``, ``org`` and ``repo`` tables are coalesced to a single
    partition before writing; every other table is written as-is.

    Args:
        df: Rows to write.
        base_silver_path: Root folder of the silver layer.
        table_name: Folder name of the table.
        ingest_year_month: Month label used in the partition folder name.
        mode: Spark save mode passed to the writer.
        write_parts: Accepted for call-site symmetry; not used by this function.
        max_records_per_file: Value for the ``maxRecordsPerFile`` writer option.
    """
    single_partition_tables = ("actor", "org", "repo")
    frame = df.coalesce(1) if table_name in single_partition_tables else df

    target = f"{base_silver_path}/{table_name}/event_year_month={ingest_year_month}"
    writer = frame.write.mode(mode).option("maxRecordsPerFile", max_records_per_file)
    writer.parquet(target)

In [0]:
def chunk_list(xs, n):
    """
    Split a sequence into consecutive slices of at most ``n`` items.

    Args:
        xs: Sequence supporting ``len()`` and slicing (e.g. a list).
        n: Maximum chunk length; must be at least 1.

    Returns:
        An iterator over ``xs[0:n]``, ``xs[n:2n]``, ...; the last chunk may be
        shorter. An empty ``xs`` produces no chunks.

    Raises:
        ValueError: If ``n`` is smaller than 1. The check runs when the function
            is called, so a bad chunk size cannot silently yield zero chunks.
    """
    if n < 1:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    return (xs[start:start + n] for start in range(0, len(xs), n))

### Create the 16 Tables For Events

In [0]:
def extract_commit_comment_events(df):
    """
    Build the CommitCommentEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "CommitCommentEvent" and returns the columns
    event_id, comment_id, body, action and user_id. Rows without a comment id
    are dropped.
    """
    commit_comments = df.filter(F.col("type") == "CommitCommentEvent")
    projected = commit_comments.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.comment.id").alias("comment_id"),
        F.col("payload.comment.body").alias("body"),
        F.col("payload.action").alias("action"),
        F.col("payload.comment.user.id").alias("user_id"),
    )
    return projected.where(F.col("comment_id").isNotNull())

In [0]:
def extract_pull_request_events(df):
    """
    Build the PullRequestEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "PullRequestEvent" and returns the columns
    event_id, action, pull_request_number, pull_request_id and assignee_id.
    Rows without a pull request id are dropped.
    """
    columns = [
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").alias("action"),
        F.col("payload.pull_request.number").alias("pull_request_number"),
        F.col("payload.pull_request.id").alias("pull_request_id"),
        F.col("payload.pull_request.assignee.id").alias("assignee_id"),
    ]
    pull_requests = df.filter(F.col("type") == "PullRequestEvent").select(*columns)
    return pull_requests.where(F.col("pull_request_id").isNotNull())

In [0]:
def extract_member_events(df):
    """
    Build the MemberEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "MemberEvent" and returns event_id, action and
    member_id; only the member's id is kept here. Rows without a member id are
    dropped.
    """
    member_events = df.filter(F.col("type") == "MemberEvent")
    projected = member_events.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").alias("action"),
        F.col("payload.member.id").alias("member_id"),
    )
    return projected.where(F.col("member_id").isNotNull())

In [0]:
def extract_public_events(df):
    """
    Build the PublicEvent table: the bigint event_id of every row whose
    ``type`` is "PublicEvent".
    """
    is_public_event = F.col("type") == "PublicEvent"
    event_id = F.col("id").cast("bigint").alias("event_id")
    return df.filter(is_public_event).select(event_id)

In [0]:
def extract_issues_events(df):
    """
    Build the IssuesEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "IssuesEvent" and returns event_id, action,
    issue_id, is_open (whether the issue state equals "open") and assignee_id.
    Rows without an issue id are dropped.
    """
    issue_events = df.filter(F.col("type") == "IssuesEvent")
    state_is_open = F.col("payload.issue.state") == "open"
    projected = issue_events.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").alias("action"),
        F.col("payload.issue.id").alias("issue_id"),
        state_is_open.alias("is_open"),
        F.col("payload.issue.assignee.id").alias("assignee_id"),
    )
    return projected.where(F.col("issue_id").isNotNull())

In [0]:
def extract_issue_comment_events(df):
    """
    Build the IssueCommentEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "IssueCommentEvent" and returns event_id,
    action, issue_id and issue_body (the body of the issue, not of the
    comment). Rows without an issue id are dropped.
    """
    issue_comments = df.filter(F.col("type") == "IssueCommentEvent")
    projected = issue_comments.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").alias("action"),
        F.col("payload.issue.id").alias("issue_id"),
        F.col("payload.issue.body").alias("issue_body"),
    )
    return projected.where(F.col("issue_id").isNotNull())

In [0]:
def extract_create_events(df):
    """
    Build the CreateEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "CreateEvent" and returns event_id plus the
    string columns ref, ref_type, master_branch, description and pusher_type,
    with one row kept per event_id.
    """
    create_events = df.filter(F.col("type") == "CreateEvent")

    # Payload fields copied as strings under their own names.
    string_fields = ["ref", "ref_type", "master_branch", "description", "pusher_type"]
    payload_columns = [
        F.col(f"payload.{field}").cast("string").alias(field)
        for field in string_fields
    ]

    projected = create_events.select(
        F.col("id").cast("bigint").alias("event_id"),
        *payload_columns,
    )
    return projected.dropDuplicates(["event_id"])

In [0]:
def extract_delete_events(df):
    """
    Build the DeleteEvent table from a batch of raw events.

    Keeps rows whose ``type`` is "DeleteEvent" and returns event_id plus the
    string columns ref, ref_type and pusher_type, with one row kept per
    event_id.
    """
    delete_events = df.filter(F.col("type") == "DeleteEvent")

    # Payload fields copied as strings under their own names.
    string_fields = ["ref", "ref_type", "pusher_type"]
    payload_columns = [
        F.col(f"payload.{field}").cast("string").alias(field)
        for field in string_fields
    ]

    projected = delete_events.select(
        F.col("id").cast("bigint").alias("event_id"),
        *payload_columns,
    )
    return projected.dropDuplicates(["event_id"])

In [0]:
def extract_discussion_events(df):
    """
    Build the DiscussionEvent table: event_id and action for every row whose
    ``type`` is "DiscussionEvent", with one row kept per event_id.
    """
    discussions = df.filter(F.col("type") == "DiscussionEvent")
    projected = discussions.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").cast("string").alias("action"),
    )
    return projected.dropDuplicates(["event_id"])

In [0]:
def extract_fork_events(df):
    """
    Build the ForkEvent table: event_id, action and forked_id (the forkee's id)
    for every row whose ``type`` is "ForkEvent", with one row kept per event_id.
    """
    forks = df.filter(F.col("type") == "ForkEvent")
    projected = forks.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").cast("string").alias("action"),
        F.col("payload.forkee.id").cast("bigint").alias("forked_id"),
    )
    return projected.dropDuplicates(["event_id"])

In [0]:
def extract_gollum_events(df):
    """
    Build the GollumEvent table: the bigint event_id of every row whose
    ``type`` is "GollumEvent", with one row kept per event_id.
    """
    is_gollum_event = F.col("type") == "GollumEvent"
    event_id = F.col("id").cast("bigint").alias("event_id")
    return df.filter(is_gollum_event).select(event_id).dropDuplicates(["event_id"])

In [0]:
def extract_pull_request_reviews(df):
    """
    Build the PullRequestReviewEvent table: event_id, pull_request_id and
    action for every row whose ``type`` is "PullRequestReviewEvent".
    """
    reviews = df.where(F.col("type") == "PullRequestReviewEvent")
    return reviews.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.pull_request.id").alias("pull_request_id"),
        F.col("payload.action").alias("action"),
    )

In [0]:
def extract_pull_request_review_comments(df):
    """
    Build the PullRequestReviewCommentEvent table: event_id, pull_request_id,
    action and comment (the comment body) for every row whose ``type`` is
    "PullRequestReviewCommentEvent".
    """
    review_comments = df.where(F.col("type") == "PullRequestReviewCommentEvent")
    return review_comments.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.pull_request.id").alias("pull_request_id"),
        F.col("payload.action").alias("action"),
        F.col("payload.comment.body").alias("comment"),
    )

In [0]:
def extract_push_events(df):
    """
    Build the PushEvent table: event_id, repo_id, push_id, ref, head_sha and
    before_sha for every row whose ``type`` is "PushEvent".
    """
    pushes = df.where(F.col("type") == "PushEvent")
    return pushes.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("repo.id").alias("repo_id"),
        F.col("payload.push_id").alias("push_id"),
        F.col("payload.ref").alias("ref"),
        F.col("payload.head").alias("head_sha"),
        F.col("payload.before").alias("before_sha"),
    )

In [0]:
def extract_release_events(df):
    """
    Build the ReleaseEvent table: event_id, release_id and action for every
    row whose ``type`` is "ReleaseEvent".
    """
    releases = df.where(F.col("type") == "ReleaseEvent")
    return releases.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.release.id").alias("release_id"),
        F.col("payload.action").alias("action"),
    )

In [0]:
def extract_watch_events(df):
    """
    Build the WatchEvent table: event_id and action for every row whose
    ``type`` is "WatchEvent".
    """
    watches = df.where(F.col("type") == "WatchEvent")
    return watches.select(
        F.col("id").cast("bigint").alias("event_id"),
        F.col("payload.action").alias("action"),
    )

### Main Method Setup

In [0]:
raw_path = "abfss://gharchive-raw@20251124eyproject2.dfs.core.windows.net/"

# Every gzipped JSON file directly under the raw container, in sorted path
# order; this is the full input of the pipeline.
all_files = sorted(
    entry.path
    for entry in dbutils.fs.ls(raw_path)
    if entry.path.endswith(".json.gz")
)
print("total files:", len(all_files))


In [0]:
# Group the raw files by month: the first seven characters of each file name
# ("YYYY-MM") are the key, the full paths are the values.
files_by_month = {}
for fp in all_files:
    file_name = fp.rsplit("/", 1)[-1]
    ingest_month = file_name[:7]
    files_by_month.setdefault(ingest_month, []).append(fp)

# Keep each month's paths in sorted order.
for m, paths_in_month in files_by_month.items():
    paths_in_month.sort()

### Main Method

In [0]:

# Process the raw files one month at a time, in month order.
for ingest_month in sorted(files_by_month):

    # All raw files that belong to the current month (ingest_month)
    month_files = files_by_month[ingest_month]
    # The first batch of a month is written with "overwrite", every later batch
    # of the same month with "append".
    first_batch = True

    # Process the month in fixed-size chunks of files to help control cache size
    for batch_files in chunk_list(month_files, Chunk_Size):

        Write_Mode = "overwrite" if first_batch else "append"

        # 1. Read the batch once and cache it; every extractor below reuses it.
        df = spark.read.json(batch_files, schema=raw_data_schema)
        df.cache()

        try:
            # count() is an action, so the batch is read here, up front.
            df.count()

            # 2. Extract Dimensional Tables
            actor_df, org_df, repo_df, event_data = extract_non_event_tables(df)

            # 3. Extract Standalone Events
            issue_event_df            = extract_issues_events(df)
            issue_comment_event_df    = extract_issue_comment_events(df)
            member_df                 = extract_member_events(df)
            pr_df                     = extract_pull_request_events(df)
            commit_comm_df            = extract_commit_comment_events(df)
            watch_df                  = extract_watch_events(df)
            release_df                = extract_release_events(df)
            push_df                   = extract_push_events(df)
            pr_comm_df                = extract_pull_request_review_comments(df)
            pr_review_df              = extract_pull_request_reviews(df)
            gollum_df                 = extract_gollum_events(df)
            fork_df                   = extract_fork_events(df)
            discussion_df             = extract_discussion_events(df)
            delete_df                 = extract_delete_events(df)
            create_df                 = extract_create_events(df)
            public_df                 = extract_public_events(df)

            # 4. Write the results to disk.
            # The event fact table is partitioned by the month of created_at.
            write_partition_by_month(event_data, Base_Silver, mode=Write_Mode, write_parts=Write_Parts)

            # Every other table goes to <table>/event_year_month=<ingest_month>.
            # The folder names are part of the output layout, so they are kept
            # exactly as they were (including "IssueEvent").
            _batch_tables = [
                (actor_df, "actor"),
                (org_df, "org"),
                (repo_df, "repo"),
                (create_df, "CreateEvent"),
                (delete_df, "DeleteEvent"),
                (discussion_df, "DiscussionEvent"),
                (fork_df, "ForkEvent"),
                (gollum_df, "GollumEvent"),
                (issue_event_df, "IssueEvent"),
                (member_df, "MemberEvent"),
                (public_df, "PublicEvent"),
                (pr_df, "PullRequestEvent"),
                (pr_comm_df, "PullRequestReviewCommentEvent"),
                (pr_review_df, "PullRequestReviewEvent"),
                (push_df, "PushEvent"),
                (release_df, "ReleaseEvent"),
                (watch_df, "WatchEvent"),
                (commit_comm_df, "CommitCommentEvent"),
                (issue_comment_event_df, "IssueCommentEvent"),
            ]
            for _table_df, _table_name in _batch_tables:
                write_table(_table_df, Base_Silver, _table_name, ingest_month, mode=Write_Mode, write_parts=Write_Parts)

            # 5. Toggle batch: later batches of this month must append.
            first_batch = False

        finally:
            # 6. Memory Cleanup: release the cached batch even when an extract
            # or write step fails.
            df.unpersist()


# Deduplicate Actor, Org, Repo

In [0]:
#schemas
# Each schema lists the columns of one silver dimension table plus the
# `event_year_month` partition column of its month folders.
actor_df_schema = StructType([
    StructField('id', LongType(), True),
    StructField('login', StringType(), True),
    StructField('type', StringType(), True),
    StructField('site_admin', BooleanType(), True),
    StructField('event_year_month', StringType(), True),
])

repo_df_schema = StructType([
    StructField('repo_id', LongType(), True),
    StructField('repo_name', StringType(), True),
    StructField('owner_id', LongType(), True),
    StructField('owner_login', StringType(), True),
    StructField('created_at', StringType(), True),
    StructField('default_branch', StringType(), True),
    StructField('is_fork', BooleanType(), True),
    StructField('forks_count', LongType(), True),
    StructField('open_issues', LongType(), True),
    StructField('private', BooleanType(), True),
    StructField('size', LongType(), True),
    StructField('stargazers_count', LongType(), True),
    StructField('event_year_month', StringType(), True),
])

# The org table is written with the columns `org_id` and `login`; the previous
# definition was a copy of the repo schema.
org_df_schema = StructType([
    StructField('org_id', LongType(), True),
    StructField('login', StringType(), True),
    StructField('event_year_month', StringType(), True),
])

In [0]:
# Repo
# Collapse the per-month repo folders to one row per repo_id and rewrite the
# table in place, partitioned by event_year_month.
out_path = f"{Base_Silver}/repo"

# NOTE(review): `schema=` is passed as a keyword to DataFrameReader.parquet;
# confirm it is honoured. The documented way to supply a schema is
# spark.read.schema(...).parquet(...).
repo_df = spark.read.parquet(out_path, schema=repo_df_schema)

# Deduplicate: one row per repo_id, preferring rows that carry an owner id.
repo_window = Window.partitionBy("repo_id").orderBy(col("owner_id").isNotNull().desc_nulls_last())
repo_df = (
    repo_df.dropna(subset=["repo_id"])
    .withColumn("row_num", row_number().over(repo_window))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

# Write to output path.
# This previously wrote `actor_df`, so the deduplicated repo rows were never
# what was saved to the repo path.
writer_df = repo_df.coalesce(1)

(writer_df
    .write
    .mode("overwrite")
    .option("maxRecordsPerFile", 6000000)
    .partitionBy("event_year_month")
    .parquet(out_path)
)

In [0]:
# Actor
# Collapse the per-month actor folders to one row per id and rewrite the table
# in place, partitioned by event_year_month.
out_path = f"{Base_Silver}/actor"
actor_df = spark.read.parquet(out_path, schema=actor_df_schema)

# Deduplicate: one row per id, preferring rows where `type` (then
# `site_admin`) is populated.
actor_window = Window.partitionBy("id").orderBy(
    F.col("type").isNotNull().desc_nulls_last(),
    F.col("site_admin").isNotNull().desc_nulls_last(),
)
ranked_actors = actor_df.withColumn("row_num", F.row_number().over(actor_window))
actor_df = (
    ranked_actors
    .filter(F.col("row_num") == 1)
    .drop("row_num")
    .dropna(subset=["id"])
)

# Write to output path as a single partition
writer_df = actor_df.coalesce(1)

actor_writer = (
    writer_df.write
    .mode("overwrite")
    .option("maxRecordsPerFile", 6000000)
    .partitionBy("event_year_month")
)
actor_writer.parquet(out_path)

In [0]:
# Org
# Collapse the per-month org folders to one row per org_id and rewrite the
# table in place, partitioned by event_year_month.
out_path = f"{Base_Silver}/org"
org_df = spark.read.parquet(out_path, schema=org_df_schema)

# Deduplicate
org_df = org_df.dropDuplicates(["org_id"])

# Write to output path as a single partition
writer_df = org_df.coalesce(1)

org_writer = (
    writer_df.write
    .mode("overwrite")
    .option("maxRecordsPerFile", 6000000)
    .partitionBy("event_year_month")
)
org_writer.parquet(out_path)

<img src="https://github.com/TashawnD/gh-data-pipeline-docs/blob/main/docs/erd/Silver%20Layer%20ERD%20-%20edit.png?raw=true"
     style="max-width:100%; border:1px solid #ccc;">

<img src="https://github.com/TashawnD/gh-data-pipeline-docs/blob/main/docs/erd/Silver%20Layer%20ERD%20-%20part%202.png?raw=true"
     style="max-width:100%; border:1px solid #ccc;">