In [7]:
# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import month,year,col,sum,row_number
from snowflake.snowpark.version import VERSION
from snowflake.snowpark import Window

# Misc
import json
import logging 
# Only surface ERROR-level (and above) messages from the Snowpark session logger
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

In [8]:
# Create Snowflake Session object
# Read the connection settings inside a context manager so the file handle is
# closed as soon as the JSON has been parsed (a bare open() leaks the handle).
with open('connection.json') as connection_file:
    connection_parameters = json.load(connection_file)
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# One round trip to fetch the current user and the Snowflake server version
snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

User                        : VINOD
Role                        : "ACCOUNTADMIN"
Database                    : "GITHUB_ARCHIVE_EVENTS"
Schema                      : "CYBERSYN"
Warehouse                   : "VINO_L"
Snowflake version           : 7.28.0
Snowpark for Python version : 1.5.1


In [9]:
# Create DataFrame objects for the three Snowflake tables used in this notebook
events_df, repos_df, stars_df = (
    session.table(table_name)
    for table_name in (
        "github_archive_events.cybersyn.gh_events",
        "github_archive_events.cybersyn.gh_repos",
        "github_archive_events.cybersyn.gh_stars",
    )
)

In [10]:
# Run each count query once and reuse the result; calling .count() twice per
# line would execute the same count against the table twice.
for label, frame in (("events", events_df), ("repos", repos_df), ("stars", stars_df)):
    row_count = frame.count()
    print(f"{label}: {row_count}. That is, {row_count//(10**6)}M records")

events: 6347782847. That is, 6347M records
repos: 352889466. That is, 352M records
stars: 190902947. That is, 190M records


In [11]:
# print schema of events_df
events_df.schema

StructType([StructField('ID', StringType(16777216), nullable=True), StructField('CREATED_AT', StringType(16777216), nullable=True), StructField('ACTOR_AVATAR_URL', StringType(16777216), nullable=True), StructField('ACTOR_DISPLAY_LOGIN', StringType(16777216), nullable=True), StructField('ACTOR_GRAVATAR_ID', StringType(16777216), nullable=True), StructField('ACTOR_ID', LongType(), nullable=True), StructField('ACTOR_LOGIN', StringType(16777216), nullable=True), StructField('ACTOR_URL', StringType(16777216), nullable=True), StructField('ORG_AVATAR_URL', StringType(16777216), nullable=True), StructField('ORG_GRAVATAR_ID', StringType(16777216), nullable=True), StructField('ORG_ID', LongType(), nullable=True), StructField('ORG_LOGIN', StringType(16777216), nullable=True), StructField('ORG_URL', StringType(16777216), nullable=True), StructField('PAYLOAD', VariantType(), nullable=True), StructField('PAYLOAD_ACTION', StringType(16777216), nullable=True), StructField('PAYLOAD_DESCRIPTION', String

In [12]:
# Print one schema field per line so the schema is easier to read
for field in events_df.schema:
    print(field)

StructField('ID', StringType(16777216), nullable=True)
StructField('CREATED_AT', StringType(16777216), nullable=True)
StructField('ACTOR_AVATAR_URL', StringType(16777216), nullable=True)
StructField('ACTOR_DISPLAY_LOGIN', StringType(16777216), nullable=True)
StructField('ACTOR_GRAVATAR_ID', StringType(16777216), nullable=True)
StructField('ACTOR_ID', LongType(), nullable=True)
StructField('ACTOR_LOGIN', StringType(16777216), nullable=True)
StructField('ACTOR_URL', StringType(16777216), nullable=True)
StructField('ORG_AVATAR_URL', StringType(16777216), nullable=True)
StructField('ORG_GRAVATAR_ID', StringType(16777216), nullable=True)
StructField('ORG_ID', LongType(), nullable=True)
StructField('ORG_LOGIN', StringType(16777216), nullable=True)
StructField('ORG_URL', StringType(16777216), nullable=True)
StructField('PAYLOAD', VariantType(), nullable=True)
StructField('PAYLOAD_ACTION', StringType(16777216), nullable=True)
StructField('PAYLOAD_DESCRIPTION', StringType(16777216), nullable=Tr

In [14]:
# Inspect the column names and types of the repos table
repos_df.schema

StructType([StructField('REPO_ID', LongType(), nullable=True), StructField('REPO_NAME', StringType(16777216), nullable=True), StructField('FIRST_SEEN', DateType(), nullable=True)])

In [15]:
# Inspect the column names and types of the stars table
stars_df.schema

StructType([StructField('REPO_ID', LongType(), nullable=True), StructField('DATE', DateType(), nullable=True), StructField('COUNT', LongType(), nullable=True)])

In [18]:
# Fetch a single event to see what a record looks like; with no argument,
# take() returned one Row in the recorded output of this cell.
event_sample = events_df.limit(1).take()
event_sample

Row(ID='22532153225', CREATED_AT='2022-06-24T21:49:26Z', ACTOR_AVATAR_URL='https://avatars.githubusercontent.com/u/87029524?', ACTOR_DISPLAY_LOGIN='indivp21', ACTOR_GRAVATAR_ID='', ACTOR_ID=87029524, ACTOR_LOGIN='indivp21', ACTOR_URL='https://api.github.com/users/indivp21', ORG_AVATAR_URL=None, ORG_GRAVATAR_ID=None, ORG_ID=None, ORG_LOGIN=None, ORG_URL=None, PAYLOAD='{\n  "before": "5ef08eb0745d6478ae7f9314efaf5b339e8d44ef",\n  "commits": [\n    {\n      "author": {\n        "email": "87029524+indivp21@users.noreply.github.com",\n        "name": "indivp21"\n      },\n      "distinct": true,\n      "message": "20220624",\n      "sha": "eb16fc7dac8e64c369921b7f12328b51fd1e9830",\n      "url": "https://api.github.com/repos/indivp21/indivp_front/commits/eb16fc7dac8e64c369921b7f12328b51fd1e9830"\n    }\n  ],\n  "distinct_size": 1,\n  "head": "eb16fc7dac8e64c369921b7f12328b51fd1e9830",\n  "push_id": 10259675844,\n  "ref": "refs/heads/main",\n  "size": 1\n}', PAYLOAD_ACTION=None, PAYLOAD_DESC

In [19]:
# Preview five rows of the repos table
repos_df.limit(5).show()

-----------------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                         |"FIRST_SEEN"  |
-----------------------------------------------------------------
|69068314   |dgonzalez/ammo                      |2016-09-23    |
|25273737   |ksmyth/webgme-cyphy                 |2015-06-11    |
|37967662   |Linosh/liquibase-nochangelogupdate  |2015-06-24    |
|30379511   |Linell/grand_padrino                |2015-02-05    |
|30379886   |kushkalra/cs925                     |2015-02-05    |
-----------------------------------------------------------------



In [20]:
# Preview five rows of the stars table
stars_df.limit(5).show()

------------------------------------
|"REPO_ID"  |"DATE"      |"COUNT"  |
------------------------------------
|677577973  |2023-08-13  |1        |
|561463664  |2023-08-13  |1        |
|677856804  |2023-08-13  |2        |
|215804347  |2023-08-13  |1        |
|671576165  |2023-08-13  |1        |
------------------------------------



 # Let's analyze the stars dataframe

### Let's find the all-time top 10 repositories by total number of stars on each repo:

In [37]:
# Total stars per repository across all dates, highest total first.
# The aggregate is aliased directly instead of renaming the auto-generated
# "SUM(COUNT)" column, so this cell does not depend on how Snowpark names
# aggregate output columns.
total_stars_per_repo = (
    stars_df.group_by("repo_id")
    .agg(sum("COUNT").alias("sum_of_stars"))
    .sort("sum_of_stars", ascending=False)
)

In [38]:
# Display the 10 repositories with the highest all-time star totals
total_stars_per_repo.show(10)

------------------------------
|"REPO_ID"  |"SUM_OF_STARS"  |
------------------------------
|28457823   |447314          |
|177736533  |380031          |
|60493101   |289722          |
|21737465   |288695          |
|13491895   |287963          |
|85077558   |266189          |
|54346799   |265323          |
|11730342   |244857          |
|10270250   |240837          |
|83222441   |236998          |
------------------------------



In [39]:
# TODO: popular repo with names 
# Join repos with stars to display names alongside the repo_id here.

### For each repository, for every year, what is the total number of stars added?

In [40]:
# Total stars per repository per calendar year, highest total first.
# The year is materialised as an explicitly named column and the aggregate is
# aliased, so the result columns (REPO_ID, YEAR, SUM_OF_STARS) do not depend on
# the auto-generated names "YEAR(DATE)" and "SUM(COUNT)".
total_stars_repo_year = (
    stars_df.with_column("year", year("DATE"))
    .group_by("REPO_ID", "year")
    .agg(sum("COUNT").alias("sum_of_stars"))
    .sort("sum_of_stars", ascending=False)
)

In [41]:
# Summary statistics (count, mean, stddev, min, max) of the per-repo, per-year totals
total_stars_repo_year.describe().show()

---------------------------------------------------------------------------
|"SUMMARY"  |"REPO_ID"          |"YEAR"              |"SUM_OF_STARS"      |
---------------------------------------------------------------------------
|max        |678178357.0        |2023.0              |345750.0            |
|mean       |240194043.314612   |2020.119233         |8.286336            |
|stddev     |187747220.1552129  |2.4269439630943275  |150.43484356690772  |
|min        |1.0                |2011.0              |1.0                 |
|count      |44560490.0         |44560490.0          |44560490.0          |
---------------------------------------------------------------------------



In [42]:
# Display the 10 largest per-repo, per-year star totals
total_stars_repo_year.show(10)

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|177736533  |2019    |345750          |
|28457823   |2016    |182175          |
|614765452  |2023    |139325          |
|28457823   |2017    |96359           |
|211104957  |2022    |92884           |
|241576270  |2020    |81986           |
|34526884   |2021    |79855           |
|123458551  |2019    |76845           |
|604826790  |2023    |75869           |
|574523116  |2023    |71645           |
---------------------------------------



### Let's peek into the repos from the first 5 rows of the previous output (note that 28457823 appears twice):

177736533,
28457823,
614765452,
28457823,
211104957

In [43]:
# REPO_ID is a LongType column, so compare against an integer literal rather
# than a string to avoid relying on an implicit string-to-number cast.
repos_df.filter(col("repo_id") == 177736533).show()

---------------------------------------------
|"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
---------------------------------------------
|177736533  |996icu/996.ICU  |2019-03-26    |
|177736533  |997icu/996.ICU  |2019-03-28    |
---------------------------------------------



In [44]:
# REPO_ID is a LongType column, so compare against an integer literal rather
# than a string to avoid relying on an implicit string-to-number cast.
repos_df.filter(col("repo_id") == 28457823).show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------



In [45]:
# REPO_ID is a LongType column, so compare against an integer literal rather
# than a string to avoid relying on an implicit string-to-number cast.
repos_df.filter(col("repo_id") == 614765452).show()

------------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                    |"FIRST_SEEN"  |
------------------------------------------------------------
|614765452  |Torantulino/entrepreneur-gpt   |2023-03-24    |
|614765452  |Torantulino/Entrepreneur-GPT   |2023-03-16    |
|614765452  |Significant-Gravitas/Auto-GPT  |2023-04-15    |
|614765452  |Torantulino/auto-gpt           |2023-03-28    |
|614765452  |Torantulino/Auto-GPT           |2023-03-24    |
------------------------------------------------------------



In [46]:
# Same repository as an earlier lookup: 28457823 occupies two of the top five
# (repo, year) rows. REPO_ID is a LongType column, so compare against an
# integer literal rather than a string.
repos_df.filter(col("repo_id") == 28457823).show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------



In [47]:
# REPO_ID is a LongType column, so compare against an integer literal rather
# than a string to avoid relying on an implicit string-to-number cast.
repos_df.filter(col("repo_id") == 211104957).show()

--------------------------------------------------
|"REPO_ID"  |"REPO_NAME"          |"FIRST_SEEN"  |
--------------------------------------------------
|211104957  |aplus-framework/app  |2021-07-22    |
|211104957  |natanfelles/app      |2019-09-26    |
--------------------------------------------------



### For each repository, in 2023, what is the total number of stars added?

In [49]:
# Keep only the 2023 totals, highest first. YEAR is a numeric column, so the
# filter uses an integer literal instead of the string "2023" to avoid an
# implicit string-to-number cast.
repos_2023 = (
    total_stars_repo_year.filter(col("year") == 2023)
    .sort("sum_of_stars", ascending=False)
)

In [50]:
# Display up to 25 rows of the 2023 totals, highest star count first
repos_2023.show(25)

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|614765452  |2023    |139325          |
|604826790  |2023    |75869           |
|574523116  |2023    |71645           |
|527591471  |2023    |67624           |
|596115135  |2023    |65466           |
|619859161  |2023    |58624           |
|552661142  |2023    |54182           |
|619959033  |2023    |47126           |
|620936652  |2023    |42897           |
|577116112  |2023    |40797           |
|468576060  |2023    |39837           |
|634224458  |2023    |39361           |
|612344730  |2023    |38147           |
|616372661  |2023    |37183           |
|575340621  |2023    |37098           |
|13491895   |2023    |36987           |
|21737465   |2023    |36502           |
|601538369  |2023    |36392           |
|612354784  |2023    |36036           |
|618058471  |2023    |35555           |
|635240594  |2023    |32935           |
|155220641  |2023    |32531           |


In [51]:
# Inspect the column names and types of the 2023 result
repos_2023.schema

StructType([StructField('REPO_ID', LongType(), nullable=True), StructField('YEAR', LongType(), nullable=True), StructField('SUM_OF_STARS', LongType(), nullable=True)])

In [54]:
# Rename REPO_ID to ID so the join below has distinct key names on each side.
# The frame is rebuilt from the source table rather than from the current value
# of `repos_df`, so re-running this cell does not fail once "repo_id" has
# already been renamed.
repos_df = session.table("github_archive_events.cybersyn.gh_repos").with_column_renamed("repo_id", "id")

In [55]:
# Attach repository names to the 2023 star totals, ordered by stars (highest first).
# A repo_id that maps to several names in the repos table yields one row per name,
# each carrying the same star total.
join_condition = repos_2023.repo_id == repos_df.id
top_repos_2023 = (
    repos_2023.join(repos_df, join_condition)
    .select(repos_df.repo_name, repos_df.id, repos_2023.sum_of_stars)
    .sort("sum_of_stars", ascending=False)
)

In [56]:
# Display up to 25 rows of the named 2023 ranking
top_repos_2023.show(25)

-----------------------------------------------------------------------
|"REPO_NAME"                             |"ID"       |"SUM_OF_STARS"  |
-----------------------------------------------------------------------
|Torantulino/Entrepreneur-GPT            |614765452  |139325          |
|Torantulino/Auto-GPT                    |614765452  |139325          |
|Significant-Gravitas/Auto-GPT           |614765452  |139325          |
|Torantulino/auto-gpt                    |614765452  |139325          |
|Torantulino/entrepreneur-gpt            |614765452  |139325          |
|base-org/chains                         |604826790  |75869           |
|f/awesome-chatgpt-prompts               |574523116  |71645           |
|AUTOMATIC1111/stable-diffusion-webui    |527591471  |67624           |
|base-org/node                           |596115135  |65466           |
|twitter/the-algorithm                   |619859161  |58624           |
|hwchase17/langchain                     |552661142  |54182     

# Writing SQL queries in Snowpark

### Now, let's find the top 25 popular repositories in 2023 based on the total number of stars added, but this time using SQL instead.

In [58]:
# session.sql() only builds a lazy DataFrame; the USE statement needs an action
# (.collect()) to actually run and switch the current database.
session.sql("""use github_archive_events""").collect()

# Top 25 (repo name, repo id) pairs by stars added during 2023.
# The date range is bounded on both sides so the query stays "2023 only" once
# later data is loaded, and LIMIT matches the 25 rows that are displayed.
session.sql("""
SELECT repo.repo_name,
       repo.repo_id,
       SUM(stars.count) AS sum_stars
FROM cybersyn.gh_stars AS stars
JOIN cybersyn.gh_repos AS repo
    ON (repo.repo_id = stars.repo_id)
WHERE stars.date >= '2023-01-01'
  AND stars.date < '2024-01-01'
GROUP BY repo.repo_name, repo.repo_id
ORDER BY sum_stars DESC NULLS LAST
LIMIT 25""").show(25)

--------------------------------------------------------------------
|"REPO_NAME"                             |"REPO_ID"  |"SUM_STARS"  |
--------------------------------------------------------------------
|Significant-Gravitas/Auto-GPT           |614765452  |139325       |
|Torantulino/Entrepreneur-GPT            |614765452  |139325       |
|Torantulino/auto-gpt                    |614765452  |139325       |
|Torantulino/Auto-GPT                    |614765452  |139325       |
|Torantulino/entrepreneur-gpt            |614765452  |139325       |
|base-org/chains                         |604826790  |75869        |
|f/awesome-chatgpt-prompts               |574523116  |71645        |
|AUTOMATIC1111/stable-diffusion-webui    |527591471  |67624        |
|base-org/node                           |596115135  |65466        |
|twitter/the-algorithm                   |619859161  |58624        |
|langchain-ai/langchain                  |552661142  |54182        |
|hwchase17/langchain              

In [59]:
# Close the Snowpark session now that the analysis is finished
session.close()

In [60]:
# END