In [1]:

# Install necessary libraries
!pip install pyspark
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [18]:

# Import necessary libraries
from pyspark.sql import SparkSession
from dotenv import load_dotenv
import os
from pyspark.sql.functions import explode, col, to_timestamp, substring
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType

In [3]:

# Load environment variables for Azure access information
load_dotenv("/content/azure_connection.env")

storage_account_name = os.getenv("AZURE_ACCOUNT_NAME")
storage_account_key = os.getenv("AZURE_STORAGE_KEY")
storage_container_name = "kaggle-datasets"
parquet_blob_name = "github-dataset-full.parquet"

In [4]:

# Creating Spark session
spark = SparkSession.builder \
    .appName("Read Parquet from Azure Blob Storage") \
    .config(f"spark.hadoop.fs.azure.account.key.{storage_account_name}.blob.core.windows.net", storage_account_key) \
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.2,com.microsoft.azure:azure-storage:8.6.6") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()

# Remove garbage error texts
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Set authentification for Spark to connect to Azure
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    storage_account_key
)


In [6]:
# Read data to ensure data was properly saved
repo_list_df = spark.read.parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/clean_data/repo_list_data"
)

In [7]:

# Ensure data was saved
repo_list_df.show(10, truncate=False)

+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------------------+-------------+--------+-------------+---------------------------------------+----------------+-------------+-------------------+---------+---------------------+-------------------+
|user_id|user_login|repo_created_at    |repo_default_branch|repo_description                                                                         |repo_fork|repo_forks_count|repo_full_name                         |repo_has_wiki|repo_id |repo_language|repo_license                           |repo_open_issues|repo_owner_id|repo_pushed_at     |repo_size|repo_stargazers_count|repo_updated_at    |
+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------

In [8]:

# Read data to ensure data was properly saved
following_list_df = spark.read.parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/clean_data/following_list_data"
)


In [9]:

# Ensure data was saved
following_list_df.show(10, truncate=False)

+--------+---------------+------------+
|user_id |user_login     |following_id|
+--------+---------------+------------+
|3103473 |ashubhadani    |401908      |
|16760183|jandersonaraujo|16691304    |
|167845  |joelennon      |46539       |
|167845  |joelennon      |643307      |
|167845  |joelennon      |1073533     |
|167845  |joelennon      |2650468     |
|167845  |joelennon      |2650485     |
|167845  |joelennon      |2656153     |
|167845  |joelennon      |2656266     |
|167845  |joelennon      |2789596     |
+--------+---------------+------------+
only showing top 10 rows



In [10]:

# Read data to ensure data was properly saved
follower_list_df = spark.read.parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/clean_data/follower_list_data"
)

In [11]:
# Ensure data was saved
follower_list_df.show(10, truncate=False)

+--------+---------------+--------------+
|user_id |user_login     |follower_login|
+--------+---------------+--------------+
|15133929|blackstonep    |15222051      |
|15133929|blackstonep    |17273091      |
|15133929|blackstonep    |9455290       |
|15133929|blackstonep    |6808931       |
|6819477 |smartraysam    |25648077      |
|11713529|avielmenter    |15976384      |
|11713529|avielmenter    |28541828      |
|16760183|jandersonaraujo|1280437       |
|16760183|jandersonaraujo|26728605      |
|167845  |joelennon      |643307        |
+--------+---------------+--------------+
only showing top 10 rows



In [12]:

# Read data to ensure data was properly saved
commit_list_df = spark.read.parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/clean_data/commit_list_data"
)

In [13]:

# Ensure data was saved
repo_list_df.show(10, truncate=False)

+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------------------+-------------+--------+-------------+---------------------------------------+----------------+-------------+-------------------+---------+---------------------+-------------------+
|user_id|user_login|repo_created_at    |repo_default_branch|repo_description                                                                         |repo_fork|repo_forks_count|repo_full_name                         |repo_has_wiki|repo_id |repo_language|repo_license                           |repo_open_issues|repo_owner_id|repo_pushed_at     |repo_size|repo_stargazers_count|repo_updated_at    |
+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------

In [14]:

# Read data to ensure data was properly saved
non_list_df = spark.read.parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/clean_data/non_list_data"
)

In [15]:

# Ensure data was saved
repo_list_df.show(10, truncate=False)

+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------------------+-------------+--------+-------------+---------------------------------------+----------------+-------------+-------------------+---------+---------------------+-------------------+
|user_id|user_login|repo_created_at    |repo_default_branch|repo_description                                                                         |repo_fork|repo_forks_count|repo_full_name                         |repo_has_wiki|repo_id |repo_language|repo_license                           |repo_open_issues|repo_owner_id|repo_pushed_at     |repo_size|repo_stargazers_count|repo_updated_at    |
+-------+----------+-------------------+-------------------+-----------------------------------------------------------------------------------------+---------+----------------+---------------------------

✅ Query 1: Most Active Repositories by Commit Volume

Goal: Identify which repositories have the most commits — great for finding high-activity projects.

🔧 SQL Query (based on commit_list_df):

In [20]:

# Register as SQL view
commit_list_df.createOrReplaceTempView("commits")

# Run SQL query
active_repos_df = spark.sql("""
    SELECT
        repo_name,
        COUNT(*) AS total_commits
    FROM commits
    GROUP BY repo_name
    ORDER BY total_commits DESC
    LIMIT 50
""")

# Save to new Parquet file
active_repos_df.write.mode("overwrite").parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/analytics/most_active_repos"
)


 ✅ Query 2: Average Followers per Developer

 Goal: Measure developer reach/influence based on how many followers they have.

 🔧 SQL Query (based on follower_list_df):


In [23]:

# Register as temp view
follower_list_df.createOrReplaceTempView("followers")

# Corrected SQL Query
influence_df = spark.sql("""
    SELECT
        user_login AS developer,
        COUNT(follower_login) AS follower_count
    FROM followers
    GROUP BY user_login
    ORDER BY follower_count DESC
    LIMIT 50
""")

# Save to Parquet
influence_df.write.mode("overwrite").parquet(
    "wasbs://kaggle-datasets@matthewleffler1.blob.core.windows.net/analytics/top_influential_developers"
)
