In [1]:
!pip install pyspark
!pip install python-dotenv



In [2]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os

In [3]:
# Initialize variables
load_dotenv("azure_connection.env")

storage_account_name = os.getenv("AZURE_ACCOUNT_NAME")
storage_account_key = os.getenv("AZURE_STORAGE_KEY")
storage_container_name = "kaggle-datasets"
parquet_blob_name = "github-dataset-full.parquet"

In [4]:
# Creating Spark session
spark = SparkSession.builder \
    .appName("Read Parquet from Azure Blob Storage") \
    .config("spark.hadoop.fs.azure.account.key.<your-storage-account>.blob.core.windows.net", storage_account_key) \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.2,com.microsoft.azure:azure-storage:8.6.6") \
    .getOrCreate()

# Remove garbage error texts
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/opt/anaconda3/envs/naturalistvenv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthewleffler/.ivy2/cache
The jars for the packages stored in: /Users/matthewleffler/.ivy2/jars
org.apache.hadoop#hadoop-azure added as a dependency
com.microsoft.azure#azure-storage added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c21013d0-e2e1-46ef-b1ae-00f1e7c17772;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-azure;3.3.2 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in local-m2-cache
	found org.apache.httpcomponents#httpcore;4.4.13 in local-m2-cache
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.11 in local-m2-cache
	found org.apache.hadoop.thirdparty#hadoop-shaded-guava;1.1.1 in central
	found org.eclipse.jetty#jetty-util-ajax;9.4.43.v20210629 in central
	found org.eclipse.jetty#jetty-util;9.4.43.v20210629 in central
	found org.codehaus.jackson#jackson-mapper-asl;1.9.13 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in centr

In [5]:
# Step 3: (Optional) Set Hadoop configurations if not already set during builder
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    storage_account_key
)

In [6]:
# Step 4: Define path to the Parquet
parquet_path = f"wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net/{parquet_blob_name}"

In [6]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType

schema = StructType([
    StructField("bio", StringType(), True),
    StructField("blog", StringType(), True),
    StructField("commit_list", ArrayType(
        StructType([
            StructField("author_id", LongType(), True),
            StructField("commit_at", StringType(), True),
            StructField("committer_id", LongType(), True),
            StructField("generate_at", StringType(), True),
            StructField("message", StringType(), True),
            StructField("repo_description", StringType(), True),
            StructField("repo_id", LongType(), True),
            StructField("repo_name", StringType(), True),
            StructField("repo_owner_id", LongType(), True)
        ])
    ), True),
    StructField("commits", LongType(), True),
    StructField("company", StringType(), True),
    StructField("created_at", StringType(), True),
    StructField("email", StringType(), True),
    StructField("follower_list", ArrayType(LongType(), True), True),
    StructField("followers", LongType(), True),
    StructField("following", LongType(), True),
    StructField("following_list", ArrayType(LongType(), True), True),
    StructField("hirable", BooleanType(), True),
    StructField("id", LongType(), True),
    StructField("is_suspicious", BooleanType(), True),
    StructField("location", StringType(), True),
    StructField("login", StringType(), True),
    StructField("name", StringType(), True),
    StructField("public_gists", LongType(), True),
    StructField("public_repos", LongType(), True),
    StructField("repo_list", ArrayType(
        StructType([
            StructField("created_at", StringType(), True),
            StructField("default_branch", StringType(), True),
            StructField("description", StringType(), True),
            StructField("fork", BooleanType(), True),
            StructField("forks_count", LongType(), True),
            StructField("full_name", StringType(), True),
            StructField("has_wiki", BooleanType(), True),
            StructField("id", LongType(), True),
            StructField("language", StringType(), True),
            StructField("license", StringType(), True),
            StructField("open_issues", LongType(), True),
            StructField("owner_id", LongType(), True),
            StructField("pushed_at", StringType(), True),
            StructField("size", LongType(), True),
            StructField("stargazers_count", LongType(), True),
            StructField("updated_at", StringType(), True)
        ])
    ), True),
    StructField("type", StringType(), True),
    StructField("updated_at", StringType(), True)
])

In [10]:

# Step 5: Read the Parquet file
df = spark.read.schema(schema).parquet(parquet_blob_name)

# Step 6: Preview
df.printSchema()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/matthewleffler/Documents/DATA 228/Group Project/github-dataset-full.parquet.

In [9]:
df.createOrReplaceTempView("github_user_view")

In [10]:
spark.sql("""SELECT id, size(commit_list) AS num_commits FROM github_user_view""").show()

                                                                                

+--------+-----------+
|      id|num_commits|
+--------+-----------+
|14413602|         -1|
| 9025223|          0|
|17626302|         -1|
|16860856|          0|
|15806633|          9|
| 1151203|          1|
|  141210|         59|
|16686692|          0|
|18952046|         -1|
| 6932921|          0|
|22808018|          0|
| 2058695|         -1|
| 8110867|        265|
|17561063|          0|
|16121097|          2|
|25191693|          0|
|28223563|          0|
| 4119857|        176|
|12907200|          0|
| 6862511|          0|
+--------+-----------+
only showing top 20 rows



In [8]:
from pyspark.sql.functions import size, avg

# Step 1: Create a new column first
df_with_size = df.withColumn("commit_list_size", size("commit_list"))

# Step 2: Then compute the average
df_with_size.agg(avg("commit_list_size")).show()

NameError: name 'df' is not defined