In [1]:
!pip install pyspark
!pip install python-dotenv



In [2]:
import os

from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Step 1: Initialize variables
from google.colab import userdata

# Constant names: the blob container and the parquet blob.
storage_container_name = "kaggle-datasets"
parquet_blob_name = "github-dataset-full.parquet"

# Load the entries of the local env file into the process environment.
load_dotenv("azure_connection.env")

# The storage account name and its access key come from the Colab secrets store.
storage_account_name = userdata.get("AZURE_ACCOUNT_NAME")
storage_account_key = userdata.get("AZURE_STORAGE_KEY")

In [4]:
# Step 2: Creating Spark session

# Hadoop property (passed through Spark) that holds the access key for the account.
hadoop_key_conf = f"spark.hadoop.fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
# Maven coordinates of the Azure connector jars Spark is asked to pull in.
azure_packages = "org.apache.hadoop:hadoop-azure:3.3.2,com.microsoft.azure:azure-storage:8.6.6"

session_builder = SparkSession.builder.appName("Read Parquet from Azure Blob Storage")
session_builder = session_builder.config(hadoop_key_conf, storage_account_key)
session_builder = session_builder.config("spark.jars.packages", azure_packages)
spark = session_builder.getOrCreate()

# Restrict Spark logging to ERROR so the cell outputs stay readable
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Step 3: (Optional) Set Hadoop configurations if not already set during builder
# Same account-key property as in the builder, this time set on the live session.
runtime_key_conf = f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net"
spark.conf.set(runtime_key_conf, storage_account_key)

In [6]:
# Step 4: Read the parquet dataset that holds the non-nested columns

# Build the location from the Step 1 settings instead of a hard-coded account and
# container, so the URL always points at the account whose key was registered
# with Spark.
clean_data_root = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    ".blob.core.windows.net/clean_data"
)

non_list_df = spark.read.parquet(f"{clean_data_root}/non_list_data")

# Ensure data was saved
non_list_df.show(10, truncate=False)

+----+----------------+-------+----------+-------------------+-----+---------+---------+-------+--------+-------------+--------+----------------------+-----------+------------+------------+----+-------------------+
|bio |blog            |commits|company   |created_at         |email|followers|following|hirable|id      |is_suspicious|location|login                 |name       |public_gists|public_repos|type|updated_at         |
+----+----------------+-------+----------+-------------------+-----+---------+---------+-------+--------+-------------+--------+----------------------+-----------+------------+------------+----+-------------------+
|NULL|                |NULL   |NULL      |2015-09-21 02:52:29|NULL |0        |0        |NULL   |14413602|true         |NULL    |llciq992              |NULL       |0           |0           |User|2016-02-28 18:26:34|
|NULL|                |0      |NULL      |2014-10-05 17:46:27|NULL |0        |0        |NULL   |9025223 |false        |NULL    |cymssss45   

In [7]:
# Step 5: Read the parquet datasets that were split out based on nesting

# Build the location from the Step 1 settings instead of repeating a hard-coded
# account and container in every path, so all reads target the account whose key
# was registered with Spark.
clean_data_root = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    ".blob.core.windows.net/clean_data"
)

# Read each dataset back to ensure it was properly saved
repo_list_df = spark.read.parquet(f"{clean_data_root}/repo_list_data")
following_list_df = spark.read.parquet(f"{clean_data_root}/following_list_data")
follower_list_df = spark.read.parquet(f"{clean_data_root}/follower_list_data")
commit_list_df = spark.read.parquet(f"{clean_data_root}/commit_list_data")

In [10]:
# Step 6: Join and aggregate logic to compute influence score
# (`F`, i.e. pyspark.sql.functions, is imported in the notebook's import cell.)

# Columns that identify a user in every list dataset; used as group and join keys.
user_keys = ["user_id", "user_login"]

# 1. Compute follower counts per user (every row of follower_list_df counts once)
followers_agg = (
    follower_list_df
    .groupBy(*user_keys)
    .agg(F.count("*").alias("follower_count"))
)

# 2. Compute total stargazers and total forks per user in ONE aggregation.
#    Both metrics come from repo_list_df grouped on the same keys, so aggregating
#    them together avoids a second shuffle and a self-join. It also avoids
#    splitting a user into two rows when a key is NULL, because a plain join
#    never matches NULL keys.
repo_agg = (
    repo_list_df
    .groupBy(*user_keys)
    .agg(
        F.sum("repo_stargazers_count").alias("total_stars"),
        F.sum("repo_forks_count").alias("total_forks"),
    )
)

# Single-metric views kept under their original names for any cell that uses them.
stars_agg = repo_agg.select(*user_keys, "total_stars")
forks_agg = repo_agg.select(*user_keys, "total_forks")

# 3. Compute total commits per user (every row of commit_list_df counts once)
commits_agg = (
    commit_list_df
    .groupBy(*user_keys)
    .agg(F.count("*").alias("total_commits"))
)

# 4. Join all aggregates together.
#    Full outer joins keep users that appear in only some of the datasets; the
#    metrics they are missing come back as NULL and are replaced with 0.
metric_columns = ["follower_count", "total_stars", "total_commits", "total_forks"]

influence_df = (
    followers_agg
    .join(repo_agg, on=user_keys, how="full_outer")
    .join(commits_agg, on=user_keys, how="full_outer")
    .na.fill(0, metric_columns)
    # Fix the column order explicitly so it does not depend on the join order.
    .select(*user_keys, *metric_columns)
)


In [11]:
# Step 7: Compute a weighted influence score
# followers: 40%, stars: 25%, forks: 20%, commits: 15%

follower_term = 0.4 * F.col("follower_count")
star_term = 0.25 * F.col("total_stars")
fork_term = 0.20 * F.col("total_forks")
commit_term = 0.15 * F.col("total_commits")

# The terms are added left to right in this fixed order.
weighted_score = follower_term + star_term + fork_term + commit_term

influence_df = influence_df.withColumn("influence_score", weighted_score)


In [12]:
# Step 8. Get Top N developers by influence

# Number of developers to keep and to print.
top_n = 100

# Highest influence score first, then cut the ranking off after top_n rows.
ranked_by_score = influence_df.orderBy(F.col("influence_score").desc())
top_developers = ranked_by_score.limit(top_n)

top_developers.show(top_n, truncate=False)

+--------+-----------------+--------------+-----------+-------------+-----------+------------------+
|user_id |user_login       |follower_count|total_stars|total_commits|total_forks|influence_score   |
+--------+-----------------+--------------+-----------+-------------+-----------+------------------+
|6154722 |Microsoft        |0             |415789     |0            |103979     |124743.05         |
|6128107 |vuejs            |0             |243181     |0            |41423      |69079.85          |
|82592   |square           |2             |205975     |0            |36317      |58757.950000000004|
|18461506|Tencent          |0             |149961     |0            |33305      |44151.25          |
|3006190 |shadowsocks      |0             |125507     |0            |61991      |43774.95          |
|1136800 |h5bp             |3             |110190     |0            |22505      |32049.7           |
|10639145|apple            |0             |99580      |1            |14436      |27782.3500

In [13]:
# Step 9: Load data into Parquet files

# Build the destination from the Step 1 settings instead of a hard-coded account
# and container, so the write targets the account whose key was registered with
# Spark.
influence_top_path = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    ".blob.core.windows.net/clean_data/Influence_Top"
)

# Create parquet file for Top-100 influencers.
# coalesce(1) brings the rows into a single partition before writing, and
# mode("overwrite") replaces whatever already exists at the destination.
(
    top_developers
    .coalesce(1)
    .write
    .mode("overwrite")
    .parquet(influence_top_path)
)

In [None]:
# Step 10: Read parquet file with top-100 data and get sample data

# Build the location from the Step 1 settings instead of a hard-coded account and
# container, so the read targets the account whose key was registered with Spark.
influence_top_path = (
    f"wasbs://{storage_container_name}@{storage_account_name}"
    ".blob.core.windows.net/clean_data/Influence_Top"
)

Influence_Top_df = spark.read.parquet(influence_top_path)

# Ensure data was saved
Influence_Top_df.show(100, truncate=False)