In [14]:
!pip install pyspark



In [15]:
!pip install python-dotenv



In [20]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os
from pyspark.sql.functions import explode, col, to_timestamp, substring
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType

In [21]:
from google.colab import userdata


# Azure Blob Storage credentials come from the notebook's secret store so
# they never appear in the notebook itself.
storage_account_name = userdata.get("AZURE_ACCOUNT_NAME")
storage_account_key = userdata.get("AZURE_STORAGE_KEY")

# Fail fast with an actionable message. Without this check a missing secret
# only surfaces later, while the Spark session is being built, as a JVM
# IllegalArgumentException about the property
# "spark.hadoop.fs.azure.account.key.None.blob.core.windows.net".
_missing_secrets = [
    secret_name
    for secret_name, secret_value in (
        ("AZURE_ACCOUNT_NAME", storage_account_name),
        ("AZURE_STORAGE_KEY", storage_account_key),
    )
    if not secret_value
]
if _missing_secrets:
    raise ValueError(
        "Missing notebook secret(s): "
        + ", ".join(_missing_secrets)
        + ". Add them to the notebook's secrets and grant this notebook access."
    )

# Container and blob that hold the source dataset.
storage_container_name = "kaggle-datasets"
parquet_blob_name = "github-dataset-full.parquet"



In [22]:
# Build (or reuse) the Spark session used by the rest of the notebook.
# The builder registers the storage account key under the Hadoop Azure
# property for this account, requests the hadoop-azure / azure-storage jars,
# and asks for 8g of driver memory.
spark = (
    SparkSession.builder
    .appName("Read Parquet from Azure Blob Storage")
    .config(
        f"spark.hadoop.fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
        storage_account_key,
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.2,com.microsoft.azure:azure-storage:8.6.6",
    )
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Keep cell output readable: only log messages at ERROR level are emitted.
spark.sparkContext.setLogLevel("ERROR")

IllegalArgumentException: The value of property spark.hadoop.fs.azure.account.key.None.blob.core.windows.net must not be null

JVM stacktrace:
java.lang.IllegalArgumentException: The value of property spark.hadoop.fs.azure.account.key.None.blob.core.windows.net must not be null
	at org.apache.hadoop.thirdparty.com.google.common.base.Preconditions.checkArgument(Preconditions.java:219)
	at org.apache.hadoop.conf.Configuration.set(Configuration.java:1403)
	at org.apache.hadoop.conf.Configuration.set(Configuration.java:1384)
	at org.apache.spark.sql.internal.SharedState.$anonfun$x$1$2(SharedState.scala:77)
	at scala.collection.immutable.Map$Map4.foreach(Map.scala:493)
	at org.apache.spark.sql.internal.SharedState.<init>(SharedState.scala:69)
	at org.apache.spark.sql.SparkSession.$anonfun$sharedState$1(SparkSession.scala:143)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sharedState$lzycompute(SparkSession.scala:143)
	at org.apache.spark.sql.SparkSession.sharedState(SparkSession.scala:142)
	at org.apache.spark.sql.SparkSession.$anonfun$sessionState$2(SparkSession.scala:162)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:160)
	at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:157)
	at org.apache.spark.sql.SparkSession$.conf$lzycompute$1(SparkSession.scala:1213)
	at org.apache.spark.sql.SparkSession$.conf$1(SparkSession.scala:1213)
	at org.apache.spark.sql.SparkSession$.applyModifiableSettings(SparkSession.scala:1216)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Register the storage account key on the live session's runtime configuration.
account_key_property = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(account_key_property, storage_account_key)


# wasbs:// URI of the source parquet blob inside the configured container.
parquet_path = f"wasbs://{storage_container_name}@{storage_account_name}.blob.core.windows.net/{parquet_blob_name}"


In [None]:
# Root of the cleaned datasets. The URI is built from the configured container
# and storage account (rather than a hard-coded account name) so the reads go
# to the same account whose access key was registered with Spark.
clean_data_root = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    ".blob.core.windows.net/clean_data"
)

# Parquet dataset stored under clean_data/non_list_data.
users_df = spark.read.parquet(f"{clean_data_root}/non_list_data")

# Parquet dataset stored under clean_data/repo_list_data.
repo_list_df = spark.read.parquet(f"{clean_data_root}/repo_list_data")

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, coalesce, lit


# The SQL below reads from a view named `repo_list`. Register the repository
# DataFrame under that name first so the query also resolves on a fresh kernel;
# createOrReplaceTempView is safe to call again when the cell is re-run.
repo_list_df.createOrReplaceTempView("repo_list")

# SQL: License Trend by Language
# One output row per (license, language) pair, with the number of repositories
# and the summed stargazer count. Rows with a NULL or empty license are dropped.
license_trend_df = spark.sql("""
    SELECT
        license,
        language,
        COUNT(*) AS repo_count,
        SUM(CAST(stargazers_count AS INT)) AS total_stars
    FROM repo_list
    WHERE license IS NOT NULL AND license != ''
    GROUP BY license, language
    ORDER BY repo_count DESC
""")




In [None]:
# Preview the first 20 (license, language) rows without truncating cell values.
license_trend_df.show(n=20, truncate=False)

In [None]:
# Same preview, re-sorted so the rows with the most stars come first.
license_trend_by_stars = license_trend_df.orderBy("total_stars", ascending=False)
license_trend_by_stars.show(n=20, truncate=False)

In [None]:
# Collect the aggregated result to the driver as a pandas DataFrame; the
# plotting cells below work on this local copy.
license_trend_pd = license_trend_df.toPandas()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _plot_top_licenses(frame, metric, title, xlabel, palette, top_n=10):
    """Draw a horizontal bar chart of the `top_n` licenses ranked by `metric`.

    `frame` has one row per (license, language) pair, so it is first collapsed
    to one row per license by summing the numeric columns. Taking the top rows
    of the un-aggregated frame would rank pairs rather than licenses, and
    seaborn would then average the repeated licenses and draw error bars.

    Parameters:
        frame: pandas DataFrame with `license`, `repo_count` and `total_stars`.
        metric: column to rank and plot (`"repo_count"` or `"total_stars"`).
        title: figure title.
        xlabel: label of the x axis.
        palette: seaborn palette name.
        top_n: number of licenses to show.

    Returns:
        The per-license DataFrame that was plotted, sorted by `metric`
        in descending order.
    """
    per_license = (
        frame.groupby("license", as_index=False)[["repo_count", "total_stars"]]
        .sum()
        .sort_values(by=metric, ascending=False)
        .head(top_n)
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=metric, y="license", data=per_license, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("License")
    fig.tight_layout()
    plt.show()
    return per_license


# Top 10 licenses by repo count
top_license_trend = _plot_top_licenses(
    license_trend_pd,
    metric="repo_count",
    title="Top 10 Open Source Licenses by Repository Count",
    xlabel="Repository Count",
    palette="viridis",
)

# Top 10 licenses by total stars
top_stars_trend = _plot_top_licenses(
    license_trend_pd,
    metric="total_stars",
    title="Top 10 Open Source Licenses by Total Stargazers",
    xlabel="Total Stars",
    palette="magma",
)

In [None]:
# Snowflake credentials are read from the notebook's secret store (the same
# mechanism used for the Azure credentials) instead of being written into the
# notebook in plain text.
# NOTE(review): the password that was previously hard-coded here should be
# treated as leaked and rotated.
sf_user = userdata.get("SNOWFLAKE_USER")
sf_password = userdata.get("SNOWFLAKE_PASSWORD")
if not sf_user or not sf_password:
    raise ValueError(
        "Missing notebook secret(s) SNOWFLAKE_USER and/or SNOWFLAKE_PASSWORD; "
        "add them to the notebook's secrets before writing to Snowflake."
    )

# Connection options passed to the Snowflake Spark connector.
sfOptions = {
    "sfURL" : "https://bfdeidg-mvb00946.snowflakecomputing.com",
    "sfDatabase" : "BIGDATA_GITHUB",
    "sfSchema" : "ANALYTICS",
    "sfWarehouse" : "COMPUTE_WH",
    "sfRole": "ACCOUNTADMIN",  # or appropriate role
    "sfUser" : sf_user,
    "sfPassword" : sf_password
}

# Write to Snowflake
# mode("overwrite") replaces the contents of the target table on each run.
# NOTE(review): the "snowflake" format needs the Snowflake Spark connector on
# the classpath; the session's spark.jars.packages does not list it — confirm
# how it is provided.
license_trend_df.write \
    .format("snowflake") \
    .options(**sfOptions) \
    .option("dbtable", "license_trends_summary") \
    .mode("overwrite") \
    .save()
