In [1]:
# Snowpark for Python
from snowflake.snowpark.session import Session
from snowflake.snowpark.functions import month,year,col,sum,row_number
from snowflake.snowpark.version import VERSION
from snowflake.snowpark import Window

# Misc
import json
import logging 
logger = logging.getLogger("snowflake.snowpark.session")
logger.setLevel(logging.ERROR)

In [2]:
# Create Snowflake Session object
connection_parameters = json.load(open('connection.json'))
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

snowflake_environment = session.sql('select current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

User                        : VINOD
Role                        : "ACCOUNTADMIN"
Database                    : "GITHUB_ARCHIVE_EVENTS"
Schema                      : "CYBERSYN"
Warehouse                   : "VINO_L"
Snowflake version           : 7.29.1
Snowpark for Python version : 1.5.1


In [3]:
# create dataframe objects from snowflake tables
events_df = session.table("github_archive_events.cybersyn.gh_events")
repos_df = session.table("github_archive_events.cybersyn.gh_repos")
stars_df = session.table("github_archive_events.cybersyn.gh_stars")

In [4]:
events_df.schema

StructType([StructField('ID', StringType(16777216), nullable=True), StructField('CREATED_AT', StringType(16777216), nullable=True), StructField('ACTOR_AVATAR_URL', StringType(16777216), nullable=True), StructField('ACTOR_DISPLAY_LOGIN', StringType(16777216), nullable=True), StructField('ACTOR_GRAVATAR_ID', StringType(16777216), nullable=True), StructField('ACTOR_ID', LongType(), nullable=True), StructField('ACTOR_LOGIN', StringType(16777216), nullable=True), StructField('ACTOR_URL', StringType(16777216), nullable=True), StructField('ORG_AVATAR_URL', StringType(16777216), nullable=True), StructField('ORG_GRAVATAR_ID', StringType(16777216), nullable=True), StructField('ORG_ID', LongType(), nullable=True), StructField('ORG_LOGIN', StringType(16777216), nullable=True), StructField('ORG_URL', StringType(16777216), nullable=True), StructField('PAYLOAD', VariantType(), nullable=True), StructField('PAYLOAD_ACTION', StringType(16777216), nullable=True), StructField('PAYLOAD_DESCRIPTION', String

In [5]:
repos_df.schema

StructType([StructField('REPO_ID', LongType(), nullable=True), StructField('REPO_NAME', StringType(16777216), nullable=True), StructField('FIRST_SEEN', DateType(), nullable=True)])

In [6]:
repos_df_columns = repos_df.schema.names
repos_df_columns

['REPO_ID', 'REPO_NAME', 'FIRST_SEEN']

In [9]:
stars_df.schema

StructType([StructField('REPO_ID', LongType(), nullable=True), StructField('DATE', DateType(), nullable=True), StructField('COUNT', LongType(), nullable=True)])

 # Let's analyze `stars_df` dataframe

For each repository, for every year, what is the total number of stars added?

In [10]:
total_stars_by_repo_year = stars_df.group_by("REPO_ID", year("DATE"))\
                                .agg(sum("COUNT"))\
                                .with_column_renamed('"SUM(COUNT)"', "sum_of_stars")\
                                .with_column_renamed('"YEAR(DATE)"', "year")\
                                .sort("sum_of_stars", ascending=False)

In [11]:
total_stars_by_repo_year.describe().show()

---------------------------------------------------------------------------
|"SUMMARY"  |"REPO_ID"           |"YEAR"             |"SUM_OF_STARS"      |
---------------------------------------------------------------------------
|max        |681399568.0         |2023.0             |345750.0            |
|mean       |241157453.879243    |2020.129013        |8.292049            |
|count      |44712281.0          |44712281.0         |44712281.0          |
|min        |1.0                 |2011.0             |1.0                 |
|stddev     |188549228.93505043  |2.428608449297663  |150.60252150611555  |
---------------------------------------------------------------------------



In [None]:
# total_stars_by_repo_year.describe().show()
"""
SELECT  *  
FROM 
(
      ( SELECT 'count' AS "SUMMARY", 
                "COUNT(REPO_ID)" AS "REPO_ID", 
                "COUNT(YEAR)" AS "YEAR", 
                "COUNT(SUM_OF_STARS)" AS "SUM_OF_STARS" 
        FROM ( SELECT count("REPO_ID") AS "COUNT(REPO_ID)", 
                    count("YEAR") AS "COUNT(YEAR)", 
                    count("SUM_OF_STARS") AS "COUNT(SUM_OF_STARS)" 
                FROM ( SELECT  *  
                        FROM ( SELECT "REPO_ID", 
                                        "YEAR(DATE)" AS "YEAR", 
                                        "SUM(COUNT)" AS "SUM_OF_STARS" 
                                FROM ( SELECT "REPO_ID", 
                                                year("DATE") AS "YEAR(DATE)", 
                                                sum("COUNT") AS "SUM(COUNT)"
                                        FROM ( SELECT  *  
                                                FROM github_archive_events.cybersyn.gh_stars
                                             ) 
                                        GROUP BY "REPO_ID", year("DATE")
                                      )
                             )
                        ORDER BY "SUM_OF_STARS" DESC NULLS LAST
                     ) 
                LIMIT 1
            )
        ) 
        
        UNION 
        
        ( SELECT 'mean' AS "SUMMARY", 
                "AVG(REPO_ID)" AS "REPO_ID", 
                "AVG(YEAR)" AS "YEAR", 
                "AVG(SUM_OF_STARS)" AS "SUM_OF_STARS" 
            FROM ( SELECT avg("REPO_ID") AS "AVG(REPO_ID)", 
                            avg("YEAR") AS "AVG(YEAR)", 
                            avg("SUM_OF_STARS") AS "AVG(SUM_OF_STARS)" 
                    FROM ( SELECT  *  
                            FROM ( SELECT "REPO_ID", 
                                            "YEAR(DATE)" AS "YEAR", 
                                            "SUM(COUNT)" AS "SUM_OF_STARS" 
                                    FROM ( SELECT "REPO_ID", 
                                                    year("DATE") AS "YEAR(DATE)", 
                                                    sum("COUNT") AS "SUM(COUNT)" 
                                            FROM ( SELECT  *  
                                                    FROM github_archive_events.cybersyn.gh_stars
                                                 ) 
                                            GROUP BY "REPO_ID", year("DATE")
                                         )
                                 ) ORDER BY "SUM_OF_STARS" DESC NULLS LAST
                         ) 
                    LIMIT 1
                )
        ) 
        
        UNION 
        
        ( SELECT 'stddev' AS "SUMMARY", 
                    "STDDEV(REPO_ID)" AS "REPO_ID", 
                    "STDDEV(YEAR)" AS "YEAR", 
                    "STDDEV(SUM_OF_STARS)" AS "SUM_OF_STARS" 
            FROM ( SELECT stddev("REPO_ID") AS "STDDEV(REPO_ID)", 
                            stddev("YEAR") AS "STDDEV(YEAR)", 
                            stddev("SUM_OF_STARS") AS "STDDEV(SUM_OF_STARS)" 
                    FROM ( SELECT  *  
                            FROM ( SELECT "REPO_ID", 
                                            "YEAR(DATE)" AS "YEAR", 
                                            "SUM(COUNT)" AS "SUM_OF_STARS" 
                                    FROM ( SELECT "REPO_ID", 
                                                    year("DATE") AS "YEAR(DATE)", 
                                                    sum("COUNT") AS "SUM(COUNT)" 
                                            FROM ( SELECT  *  
                                                    FROM github_archive_events.cybersyn.gh_stars
                                                 ) 
                                            GROUP BY "REPO_ID", year("DATE")
                                         )
                                 ) ORDER BY "SUM_OF_STARS" DESC NULLS LAST
                         ) 
                    LIMIT 1
                )
        ) 
        
        UNION 
        
        ( SELECT 'min' AS "SUMMARY", 
                    "MIN(REPO_ID)" AS "REPO_ID", 
                    "MIN(YEAR)" AS "YEAR", 
                    "MIN(SUM_OF_STARS)" AS "SUM_OF_STARS" 
         FROM ( SELECT min("REPO_ID") AS "MIN(REPO_ID)", 
                         min("YEAR") AS "MIN(YEAR)", 
                         min("SUM_OF_STARS") AS "MIN(SUM_OF_STARS)" 
                FROM ( SELECT  *  
                        FROM ( SELECT "REPO_ID", 
                                        "YEAR(DATE)" AS "YEAR", 
                                        "SUM(COUNT)" AS "SUM_OF_STARS" 
                                FROM ( SELECT "REPO_ID", 
                                                year("DATE") AS "YEAR(DATE)", 
                                                sum("COUNT") AS "SUM(COUNT)" 
                                        FROM ( SELECT  *  
                                                FROM github_archive_events.cybersyn.gh_stars
                                             ) 
                                        GROUP BY "REPO_ID", year("DATE")
                                     )
                             ) ORDER BY "SUM_OF_STARS" DESC NULLS LAST
                    ) 
                LIMIT 1
                )
        ) 
        
        UNION 
        
        ( SELECT 'max' AS "SUMMARY", 
                    "MAX(REPO_ID)" AS "REPO_ID", 
                    "MAX(YEAR)" AS "YEAR", 
                    "MAX(SUM_OF_STARS)" AS "SUM_OF_STARS"
          FROM ( SELECT max("REPO_ID") AS "MAX(REPO_ID)", 
                          max("YEAR") AS "MAX(YEAR)", 
                          max("SUM_OF_STARS") AS "MAX(SUM_OF_STARS)" 
                  FROM ( SELECT  *  
                          FROM ( SELECT "REPO_ID", 
                                          "YEAR(DATE)" AS "YEAR", 
                                          "SUM(COUNT)" AS "SUM_OF_STARS" 
                                 FROM ( SELECT "REPO_ID", 
                                                 year("DATE") AS "YEAR(DATE)", 
                                                 sum("COUNT") AS "SUM(COUNT)" 
                                        FROM ( SELECT  *  
                                                FROM github_archive_events.cybersyn.gh_stars
                                             ) 
                                        GROUP BY "REPO_ID", year("DATE")
                                        )
                                ) ORDER BY "SUM_OF_STARS" DESC NULLS LAST
                        ) 
                 LIMIT 1
                 )
        )
) 
LIMIT 10
"""

### NOTE:
1. `describe()` is a very expensive operation. Use sparingly.

2. Even if we include a specific column in the `describe()` argument, it is still an expensive operation. So, beware!

In [12]:
total_stars_by_repo_year.describe("year").show()

---------------------------------
|"SUMMARY"  |"YEAR"             |
---------------------------------
|stddev     |2.428608449297663  |
|mean       |2020.129013        |
|max        |2023.0             |
|min        |2011.0             |
|count      |44712281.0         |
---------------------------------



In [13]:
total_stars_by_repo_year.select('year').distinct().show()

----------
|"YEAR"  |
----------
|2011    |
|2017    |
|2020    |
|2021    |
|2019    |
|2022    |
|2015    |
|2023    |
|2012    |
|2018    |
----------



In [None]:
# total_stars_by_repo_year.select('year').distinct().show()

"""
SELECT  *  
FROM ( SELECT "YEAR" 
        FROM ( SELECT "YEAR" 
                FROM ( SELECT "REPO_ID", 
                                "YEAR(DATE)" AS "YEAR", 
                                "SUM(COUNT)" AS "SUM_OF_STARS" 
                        FROM ( SELECT "REPO_ID", 
                                        year("DATE") AS "YEAR(DATE)", 
                                        sum("COUNT") AS "SUM(COUNT)" 
                                FROM ( SELECT  *  
                                        FROM github_archive_events.cybersyn.gh_stars
                                     ) 
                                GROUP BY "REPO_ID", year("DATE")
                             )
                     ) 
                 ORDER BY "SUM_OF_STARS" DESC NULLS LAST
             ) 
        GROUP BY "YEAR"
    ) 
LIMIT 10
"""

# Caching the dataframe for faster querying

Since Snowpark executes its commands lazily, there are times when the same transformation on a DataFrame is used multiple times. In this instance, the Snowpark DataFrame can be cached. In the background, Snowpark will create a temporary table and load the transformed data into that table. When that base DataFrame is used, later on, that table will be used instead of recalculating the transformations.

In the above example, we are performing multiple operations on the `total_stars_by_repo_year` dataframe. However, since we are not caching the dataframe, you can see in the SQL query that the same set of operations are being done repeatedly everytime we run transformations on the dataframe.

In [14]:
total_stars_by_repo_year = total_stars_by_repo_year.cache_result()

SnowparkSQLException: (1304): 01ae7b4d-0603-7de8-001e-2483015af6b6: 003540 (42501): SQL execution error: Creating table on shared database 'GITHUB_ARCHIVE_EVENTS' is not allowed.

In [17]:
total_stars_by_repo_year.write.mode("overwrite").save_as_table("total_stars_by_repo_year")

SnowparkSQLException: (1304): 01ae7b4d-0603-7dad-001e-2483015ac8ca: 003540 (42501): SQL execution error: Creating table on shared database 'GITHUB_ARCHIVE_EVENTS' is not allowed.

#### SnowparkSQLException: SQL execution error: 
Creating table on shared database `GITHUB_ARCHIVE_EVENTS` is not allowed. Let's copy the data into our own tables for faster querying.

## Save the `total_stars_by_repo_year` dataframe into a table in `vino_db` and `vino_schema` for faster query execution.

In [18]:
session.sql("USE VINO_DB")

<snowflake.snowpark.dataframe.DataFrame at 0x147be4ee0>

In [19]:
total_stars_by_repo_year.write.mode("overwrite").save_as_table("VINO_DB.VINO_SCHEMA.total_stars_by_repo_year")

In [20]:
total_stars_by_repo_year_table = session.table("VINO_DB.VINO_SCHEMA.total_stars_by_repo_year")
total_stars_by_repo_year_table.count()

44712281

### Let's peek into the top 5 repos:
### 1. using `take()`:

In [21]:
# query from the cybersyn github archive 
total_stars_by_repo_year.take(5)

[Row(REPO_ID=177736533, YEAR=2019, SUM_OF_STARS=345750),
 Row(REPO_ID=28457823, YEAR=2016, SUM_OF_STARS=182175),
 Row(REPO_ID=614765452, YEAR=2023, SUM_OF_STARS=139979),
 Row(REPO_ID=28457823, YEAR=2017, SUM_OF_STARS=96359),
 Row(REPO_ID=211104957, YEAR=2022, SUM_OF_STARS=92884)]

In [None]:
"""
SELECT  *  
FROM ( SELECT "REPO_ID", 
                "YEAR(DATE)" AS "YEAR", 
                "SUM(COUNT)" AS "SUM_OF_STARS" 
        FROM ( SELECT "REPO_ID", 
                        year("DATE") AS "YEAR(DATE)", 
                        sum("COUNT") AS "SUM(COUNT)" 
                FROM ( SELECT  *  
                        FROM github_archive_events.cybersyn.gh_stars
                    ) 
                GROUP BY "REPO_ID", year("DATE")
            )
        ) 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5

"""

In [24]:
# query from the locally saved table
total_stars_by_repo_year_table.sort(col("sum_of_stars").desc()).take(5)

[Row(REPO_ID=177736533, YEAR=2019, SUM_OF_STARS=345750),
 Row(REPO_ID=28457823, YEAR=2016, SUM_OF_STARS=182175),
 Row(REPO_ID=614765452, YEAR=2023, SUM_OF_STARS=139979),
 Row(REPO_ID=28457823, YEAR=2017, SUM_OF_STARS=96359),
 Row(REPO_ID=211104957, YEAR=2022, SUM_OF_STARS=92884)]

In [None]:
"""
SELECT  *  
FROM VINO_DB.VINO_SCHEMA.total_stars_by_repo_year 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5
"""

### NOTE: 
Caching or saving intermediate dataframes that will be queried frequently into a separate table improves the query performance.

### 2. Using `limit()`: 

In [25]:
total_stars_by_repo_year.limit(5).show()

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|177736533  |2019    |345750          |
|28457823   |2016    |182175          |
|614765452  |2023    |139979          |
|28457823   |2017    |96359           |
|211104957  |2022    |92884           |
---------------------------------------



In [None]:
"""
SELECT  *  
FROM ( SELECT "REPO_ID", 
                "YEAR(DATE)" AS "YEAR", 
                "SUM(COUNT)" AS "SUM_OF_STARS" 
        FROM ( SELECT "REPO_ID", 
                        year("DATE") AS "YEAR(DATE)", 
                        sum("COUNT") AS "SUM(COUNT)" 
                FROM ( SELECT  *  
                        FROM github_archive_events.cybersyn.gh_stars
                    ) 
                GROUP BY "REPO_ID", year("DATE")
            )
        ) 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5

"""

In [27]:
total_stars_by_repo_year_table.sort(col("sum_of_stars").desc()).limit(5).show()

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|177736533  |2019    |345750          |
|28457823   |2016    |182175          |
|614765452  |2023    |139979          |
|28457823   |2017    |96359           |
|211104957  |2022    |92884           |
---------------------------------------



In [None]:
"""
SELECT  *  
FROM VINO_DB.VINO_SCHEMA.total_stars_by_repo_year 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5
"""

### 3. Using `show()`:

In [28]:
total_stars_by_repo_year.show(5)

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|177736533  |2019    |345750          |
|28457823   |2016    |182175          |
|614765452  |2023    |139979          |
|28457823   |2017    |96359           |
|211104957  |2022    |92884           |
---------------------------------------



In [None]:
"""
SELECT  *  
FROM ( SELECT "REPO_ID", 
                "YEAR(DATE)" AS "YEAR", 
                "SUM(COUNT)" AS "SUM_OF_STARS" 
        FROM ( SELECT "REPO_ID", 
                        year("DATE") AS "YEAR(DATE)", 
                        sum("COUNT") AS "SUM(COUNT)" 
                FROM ( SELECT  *  
                        FROM github_archive_events.cybersyn.gh_stars
                    ) 
                GROUP BY "REPO_ID", year("DATE")
            )
        ) 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5

"""

In [29]:
total_stars_by_repo_year_table.sort(col("sum_of_stars").desc()).show(5)

---------------------------------------
|"REPO_ID"  |"YEAR"  |"SUM_OF_STARS"  |
---------------------------------------
|177736533  |2019    |345750          |
|28457823   |2016    |182175          |
|614765452  |2023    |139979          |
|28457823   |2017    |96359           |
|211104957  |2022    |92884           |
---------------------------------------



In [None]:
"""
SELECT  *  
FROM VINO_DB.VINO_SCHEMA.total_stars_by_repo_year 
ORDER BY "SUM_OF_STARS" DESC NULLS LAST 
LIMIT 5
"""

### NOTE: 
You can use the functions `limit()`, `take()` interchangeably. `take()` returns a list of Rows. `limit()` and `show()` returns a dataframe.

## The top 5 repositories are: 
`177736533`, `28457823`, `614765452`, `28457823`, `211104957`. Let's dive deep.

In [30]:
repos_df.filter(col("repo_id")=="177736533").show()

---------------------------------------------
|"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
---------------------------------------------
|177736533  |997icu/996.ICU  |2019-03-28    |
|177736533  |996icu/996.ICU  |2019-03-26    |
---------------------------------------------



In [31]:
repos_df.filter(col("repo_id")=="28457823").show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------



In [32]:
repos_df.filter(col("repo_id")=="614765452").show()

------------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                    |"FIRST_SEEN"  |
------------------------------------------------------------
|614765452  |Significant-Gravitas/Auto-GPT  |2023-04-15    |
|614765452  |Torantulino/auto-gpt           |2023-03-28    |
|614765452  |Torantulino/Entrepreneur-GPT   |2023-03-16    |
|614765452  |Torantulino/Auto-GPT           |2023-03-24    |
|614765452  |Torantulino/entrepreneur-gpt   |2023-03-24    |
------------------------------------------------------------



In [33]:
repos_df.filter(col("repo_id")=="28457823").show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------



In [34]:
repos_df.filter(col("repo_id")=="211104957").show()

--------------------------------------------------
|"REPO_ID"  |"REPO_NAME"          |"FIRST_SEEN"  |
--------------------------------------------------
|211104957  |natanfelles/app      |2019-09-26    |
|211104957  |aplus-framework/app  |2021-07-22    |
--------------------------------------------------



## OOPS! Looks like there are duplicate entries in the `repos_df`. Notice that `repo_id` appears in multiple rows in the dataframe.

In [None]:
# TODO: Count repo_id, count distinct repo_id. How many duplicates are there? How many distinct repo_id are there?
# TODO: How to deal with duplicates? 
    # drop duplicates?
    # drop duplicates by any specific condition?
    # keep the latest record based on first_seen column?

# Dealing with Duplicates

In [35]:
stars_df.count()

191614275

In [36]:
stars_df.distinct().count()

191614275

There are no duplicate rows in `stars_df` dataframe. Let's check the total number of unique `repo_id`

In [37]:
# number of unique repo_id in stars_df
stars_df.drop_duplicates('repo_id').count()

30383934

Let's check `repos_df` now. 

In [38]:
repos_df.count()

354483346

#### 1. What happens if we drop all the duplicate rows?

In [39]:
repos_dedup_1 = repos_df.drop_duplicates()
repos_dedup_1.count()

354482846

#### 2. What is the difference between passing the columns as arguments vs simply calling the `drop_duplicates()` function without arguments?

In [40]:
repos_dedup_2 = repos_df.drop_duplicates("repo_id", "repo_name", "first_seen")
repos_dedup_2.count()

354482846

#### 3. What is the difference between `distinct()` vs `drop_duplicates()` functions?

In [41]:
repos_dedup_3 = repos_df.distinct()
repos_dedup_3.count()

354482846

#### 4. Let's drop all the duplicate rows based on `repo_id`. That is, let's keep only one row per `repo_id`.

In [42]:
repos_dedup_4 = repos_df.drop_duplicates("repo_id")
repos_dedup_4.count()

339596086

### `repos_dedup_4` is the clean dataframe and we will use this for the rest of the analysis. 

Note: It has one row for each `repo_id` value.

As you can see from the above commands, you can remove duplicates from a dataframe using different functions. But which one is more optimal? Let's dive into the query profile in the Snowsight UI, and understand the SQL query generated by each API.

#### 1. SQL query for `repos_df.drop_duplicates().count()`

In [None]:
# repos_df.drop_duplicates().count()
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "REPO_ID", "REPO_NAME", "FIRST_SEEN" 
        FROM ( SELECT  *  FROM github_archive_events.cybersyn.gh_repos) 
        GROUP BY "REPO_ID", "REPO_NAME", "FIRST_SEEN"
    ) 
LIMIT 1

"""

#### 2. SQL query for `repos_df.drop_duplicates("repo_id", "repo_name", "first_seen").count()`

In [None]:
# repos_df.drop_duplicates("repo_id", "repo_name", "first_seen").count()
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "REPO_ID", "REPO_NAME", "FIRST_SEEN" 
        FROM ( SELECT "REPO_ID", 
                        "REPO_NAME", 
                        "FIRST_SEEN", 
                        row_number() OVER (PARTITION BY "REPO_ID", 
                                                        "REPO_NAME", 
                                                        "FIRST_SEEN"  ORDER BY "REPO_ID" ASC NULLS FIRST, 
                                                        "REPO_NAME" ASC NULLS FIRST, 
                                                        "FIRST_SEEN" ASC NULLS FIRST ) AS "69vt86qrwf" 
                FROM github_archive_events.cybersyn.gh_repos
            ) 
        WHERE ("69vt86qrwf" = 1 :: INT)
        ) 
LIMIT 1
"""

#### 3. SQL query for `repos_df.distinct().count()`

In [None]:
# repos_df.distinct().count()
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "REPO_ID", "REPO_NAME", "FIRST_SEEN" 
        FROM ( SELECT  *  FROM github_archive_events.cybersyn.gh_repos) 
        GROUP BY "REPO_ID", "REPO_NAME", "FIRST_SEEN"
    ) 
LIMIT 1
"""

#### 4. SQL query for `repos_df.drop_duplicates("repo_id").count()`

In [None]:
# repos_df.drop_duplicates("repo_id").count()
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "REPO_ID", "REPO_NAME", "FIRST_SEEN" 
        FROM ( SELECT "REPO_ID", 
                        "REPO_NAME", 
                        "FIRST_SEEN", 
                        row_number() OVER (PARTITION BY "REPO_ID"  ORDER BY "REPO_ID" ASC NULLS FIRST ) AS "4zgjpf60dd" 
                FROM github_archive_events.cybersyn.gh_repos
                ) 
        WHERE ("4zgjpf60dd" = 1 :: INT)
    ) 
LIMIT 1

"""

## Let's verify the top 5 repositories again: 
`177736533`, `28457823`, `614765452`, `28457823`, `211104957`. 

In [43]:
repos_df.filter(col("repo_id")=="177736533").show(),
repos_dedup_4.filter(col("repo_id")=="177736533").show()

---------------------------------------------
|"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
---------------------------------------------
|177736533  |997icu/996.ICU  |2019-03-28    |
|177736533  |996icu/996.ICU  |2019-03-26    |
---------------------------------------------

---------------------------------------------
|"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
---------------------------------------------
|177736533  |997icu/996.ICU  |2019-03-28    |
---------------------------------------------



In [44]:
repos_df.filter(col("repo_id")=="28457823").show(),
repos_dedup_4.filter(col("repo_id")=="28457823").show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
-----------------------------------------------------------



In [45]:
repos_df.filter(col("repo_id")=="614765452").show(),
repos_dedup_4.filter(col("repo_id")=="614765452").show()

------------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                    |"FIRST_SEEN"  |
------------------------------------------------------------
|614765452  |Significant-Gravitas/Auto-GPT  |2023-04-15    |
|614765452  |Torantulino/auto-gpt           |2023-03-28    |
|614765452  |Torantulino/Entrepreneur-GPT   |2023-03-16    |
|614765452  |Torantulino/Auto-GPT           |2023-03-24    |
|614765452  |Torantulino/entrepreneur-gpt   |2023-03-24    |
------------------------------------------------------------

---------------------------------------------------
|"REPO_ID"  |"REPO_NAME"           |"FIRST_SEEN"  |
---------------------------------------------------
|614765452  |Torantulino/Auto-GPT  |2023-03-24    |
---------------------------------------------------



In [46]:
repos_df.filter(col("repo_id")=="211104957").show(),
repos_dedup_4.filter(col("repo_id")=="211104957").show()

--------------------------------------------------
|"REPO_ID"  |"REPO_NAME"          |"FIRST_SEEN"  |
--------------------------------------------------
|211104957  |natanfelles/app      |2019-09-26    |
|211104957  |aplus-framework/app  |2021-07-22    |
--------------------------------------------------

--------------------------------------------------
|"REPO_ID"  |"REPO_NAME"          |"FIRST_SEEN"  |
--------------------------------------------------
|211104957  |aplus-framework/app  |2021-07-22    |
--------------------------------------------------



## GREAT!!!! NO MORE DUPLICATES!

**BUT....** instead of simply dropping duplicates based on `repo_id`, we want to keep the row with the recent value for `FIRST_SEEN` column. How can we do that? We can use **window functions** for that.

In [None]:
# TODO: Keep the record with latest date for `FIRST_SEEN` column

In [47]:
window1 = Window.partition_by("repo_id").order_by(col("first_seen").desc())

In [48]:
repos_dedup_df = repos_df.select(row_number().over(window1).as_("rnum"), 
                                     "repo_id", 
                                     "repo_name", 
                                     "first_seen")\
                         .filter(col("rnum")=="1")
repos_dedup_df.count()

339596086

### SQL query for `repos_dedup_df.count()`

In [None]:
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT  *  
        FROM ( SELECT row_number() OVER (PARTITION BY "REPO_ID"  ORDER BY "FIRST_SEEN" ASC NULLS FIRST ) AS "RNUM", 
                    "REPO_ID", 
                    "REPO_NAME", 
                    "FIRST_SEEN" 
                FROM github_archive_events.cybersyn.gh_repos
            ) 
        WHERE ("RNUM" = '1')
    ) 
LIMIT 1
"""

## Let's verify the top 5 repositories again: `177736533`, `28457823`, `614765452`, `28457823`, `211104957`. 

In [49]:
repos_df.filter(col("repo_id")=="177736533").show(),
repos_dedup_df.filter(col("repo_id")=="177736533").show()

---------------------------------------------
|"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
---------------------------------------------
|177736533  |997icu/996.ICU  |2019-03-28    |
|177736533  |996icu/996.ICU  |2019-03-26    |
---------------------------------------------

------------------------------------------------------
|"RNUM"  |"REPO_ID"  |"REPO_NAME"     |"FIRST_SEEN"  |
------------------------------------------------------
|1       |177736533  |997icu/996.ICU  |2019-03-28    |
------------------------------------------------------



In [50]:
repos_df.filter(col("repo_id")=="28457823").show(),
repos_dedup_df.filter(col("repo_id")=="28457823").show()

-----------------------------------------------------------
|"REPO_ID"  |"REPO_NAME"                   |"FIRST_SEEN"  |
-----------------------------------------------------------
|28457823   |FreeCodeCampers/freecodecamp  |2015-04-20    |
|28457823   |freeCodeCamp/                 |2018-06-01    |
|28457823   |FreeCodeCamp/freeCodeCamp     |2017-01-10    |
|28457823   |FreeCodeCamp/FreeCodeCamp     |2015-08-13    |
|28457823   |FreeCodeCamp/freecodecamp     |2015-01-01    |
|28457823   |freeCodeCamp/freeCodeCamp     |2017-01-11    |
-----------------------------------------------------------

-----------------------------------------------------
|"RNUM"  |"REPO_ID"  |"REPO_NAME"    |"FIRST_SEEN"  |
-----------------------------------------------------
|1       |28457823   |freeCodeCamp/  |2018-06-01    |
-----------------------------------------------------



## PHEW! Unique records with latest values for `FIRST_SEEN` column are retained.

We don't need the `rnum` column, let's delete it.

In [51]:
repos_dedup_df = repos_dedup_df.drop('rnum')

In [52]:
repos_dedup_df.columns

['REPO_ID', 'REPO_NAME', 'FIRST_SEEN']

## Let's write this clean `repos_df` to our table.

In [53]:
session.sql("USE VINO_DB")

<snowflake.snowpark.dataframe.DataFrame at 0x1688cbee0>

In [54]:
repos_dedup_df.write.mode("overwrite").save_as_table("VINO_DB.VINO_SCHEMA.repos_deduped")

In [56]:
repos_dedup_table = session.table("VINO_DB.VINO_SCHEMA.repos_deduped")
repos_dedup_table.count()

339596086

### Now calculate the all-time top 50 repositories based on the number of stars. 

In [57]:
repos_dedup_table.count()

339596086

In [58]:
total_stars_by_repo_year_table.count()

44712281

In [59]:
repos_dedup_table.columns

['REPO_ID', 'REPO_NAME', 'FIRST_SEEN']

In [60]:
total_stars_by_repo_year_table.columns

['REPO_ID', 'YEAR', 'SUM_OF_STARS']

#### `repo_id` column is present in both dataframes. If we join two dataframes with the same column name, with the join key being `repo_id`, Snowpark automatically appends a random prefix. 

See below!

In [62]:
temp_df = total_stars_by_repo_year_table.join(repos_dedup_table, 
                                              total_stars_by_repo_year_table['repo_id']==repos_dedup_table['repo_id'])\
                                        .select(total_stars_by_repo_year_table['repo_id'],
                                                 "year",
                                                 "repo_name",
                                                 "sum_of_stars",
                                                 "first_seen")
temp_df.columns

['"l_yecd_REPO_ID"', 'YEAR', 'REPO_NAME', 'SUM_OF_STARS', 'FIRST_SEEN']

#### Let us `alias()` one of the columns and then join the dataframes to avoid this. 

In [63]:
top_repos = total_stars_by_repo_year_table.join(repos_dedup_table, 
                                                total_stars_by_repo_year_table['repo_id'] == repos_dedup_table['repo_id'])\
                                          .select(total_stars_by_repo_year_table['repo_id'].as_("repo_id"),
                                                 "year",
                                                 "repo_name",
                                                 "sum_of_stars",
                                                 "first_seen")
top_repos.columns

['REPO_ID', 'YEAR', 'REPO_NAME', 'SUM_OF_STARS', 'FIRST_SEEN']

In [64]:
top_repos.count()

44712281

By default, the join operation is `inner join`. Can we look at the SQL query that runs under the hood?

In [None]:
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "l_csuo_REPO_ID" AS "REPO_ID", 
                "YEAR", 
                "REPO_NAME", 
                "SUM_OF_STARS", 
                "FIRST_SEEN" 
        FROM ( 
                SELECT  *  
                FROM (( SELECT "REPO_ID" AS "l_csuo_REPO_ID", 
                                "YEAR" AS "YEAR", 
                                "SUM_OF_STARS" AS "SUM_OF_STARS" 
                        FROM VINO_DB.VINO_SCHEMA.total_stars_by_repo_year
                       ) AS SNOWPARK_LEFT 
                       
                       INNER JOIN 
                       
                       ( SELECT "REPO_ID" AS "r_lxll_REPO_ID", 
                                   "REPO_NAME" AS "REPO_NAME", 
                                   "FIRST_SEEN" AS "FIRST_SEEN" 
                         FROM VINO_DB.VINO_SCHEMA.repos_deduped
                       ) AS SNOWPARK_RIGHT 
                       
                       ON 
                       
                       ("l_csuo_REPO_ID" = "r_lxll_REPO_ID")
                      )
             )
    ) 
LIMIT 1

"""

#### In this dataset using two dataframes, `stars_df` is a smaller dataframe and `repos_df` is the larger one.  The output of this join operation will only have the repositories with `total_stars`.

That is, the join output will be of the same size as the `stars_df` dataframe. So, can we use a left join instead? `stars_df` on the `left`, and `repos_df` on the `right`.

In [68]:
top_repos_1 = total_stars_by_repo_year_table.join(repos_dedup_table, 
                                                    total_stars_by_repo_year_table['repo_id']==repos_dedup_table['repo_id'],
                                                    join_type='left')\
                                            .select(total_stars_by_repo_year_table['repo_id'].as_("repo_id"),
                                                    "year",
                                                     "repo_name",
                                                     "sum_of_stars",
                                                     "first_seen")
top_repos_1.columns

['REPO_ID', 'YEAR', 'REPO_NAME', 'SUM_OF_STARS', 'FIRST_SEEN']

In [69]:
top_repos_1.count()

44712281

Can we look at the SQL query that runs under the hood for this `left join`?

In [None]:
"""
SELECT count(1) AS "COUNT(LITERAL())" 
FROM ( SELECT "l_3qvm_REPO_ID" AS "REPO_ID", 
                "YEAR", 
                "REPO_NAME", 
                "SUM_OF_STARS", 
                "FIRST_SEEN" 
        FROM ( SELECT  *  
                FROM (
                        ( SELECT "REPO_ID" AS "l_3qvm_REPO_ID", 
                                    "YEAR" AS "YEAR", 
                                    "SUM_OF_STARS" AS "SUM_OF_STARS" 
                           FROM VINO_DB.VINO_SCHEMA.total_stars_by_repo_year
                        ) AS SNOWPARK_LEFT 
                        
                        LEFT OUTER JOIN 
                        
                        ( SELECT "REPO_ID" AS "r_zw73_REPO_ID", 
                                    "REPO_NAME" AS "REPO_NAME", 
                                    "FIRST_SEEN" AS "FIRST_SEEN" 
                          FROM VINO_DB.VINO_SCHEMA.repos_deduped
                         ) AS SNOWPARK_RIGHT 
                         
                         ON 
                         
                         ("l_3qvm_REPO_ID" = "r_zw73_REPO_ID")
                     )
            )
    ) 
LIMIT 1
 
"""

In [None]:
# TODO: Which join operation is efficient? How can we verify it?

### Top 50 repos: 

In [72]:
top_repos.sort(col("sum_of_stars").desc()).limit(50).show(50)

-----------------------------------------------------------------------------------------------
|"REPO_ID"  |"YEAR"  |"REPO_NAME"                             |"SUM_OF_STARS"  |"FIRST_SEEN"  |
-----------------------------------------------------------------------------------------------
|177736533  |2019    |997icu/996.ICU                          |345750          |2019-03-28    |
|28457823   |2016    |freeCodeCamp/                           |182175          |2018-06-01    |
|614765452  |2023    |Significant-Gravitas/Auto-GPT           |139979          |2023-04-15    |
|28457823   |2017    |freeCodeCamp/                           |96359           |2018-06-01    |
|211104957  |2022    |aplus-framework/app                     |92884           |2021-07-22    |
|241576270  |2020    |labuladong/fucking-algorithm            |81986           |2020-02-19    |
|34526884   |2021    |ant-design/                             |79855           |2018-05-02    |
|123458551  |2019    |jackfrued/Python-1

In [73]:
top_repos.sort(col("first_seen").desc()).limit(20).show(20)

--------------------------------------------------------------------------------------------------------
|"REPO_ID"  |"YEAR"  |"REPO_NAME"                                      |"SUM_OF_STARS"  |"FIRST_SEEN"  |
--------------------------------------------------------------------------------------------------------
|490627258  |2022    |Stella2Aurora/Raycity                            |4               |2023-08-21    |
|570370576  |2022    |ssbanjo/Keyword-pinger                           |2               |2023-08-21    |
|681301245  |2023    |Ratulbaba7/Halfrost-Field                        |1               |2023-08-21    |
|681349607  |2023    |beratflixc/Voicemod-Free-Download-2023           |38              |2023-08-21    |
|678810547  |2023    |HaryKali/null-js                                 |1               |2023-08-21    |
|680996550  |2023    |seanwallawalla/AI2001_API                        |1               |2023-08-21    |
|555892056  |2022    |Octoping925/MobileEntropyHelper-C

### Top 50 repos in 2023 only

In [74]:
# aggregate stars count by repo_id, and year=2023
top_repos_2023 = top_repos.filter(col("year")==2023)
top_repos_2023.count()

8202855

In [75]:
top_repos_2023.sort(col("sum_of_stars").desc()).limit(50).show(50)

---------------------------------------------------------------------------------------------------
|"REPO_ID"  |"YEAR"  |"REPO_NAME"                                 |"SUM_OF_STARS"  |"FIRST_SEEN"  |
---------------------------------------------------------------------------------------------------
|614765452  |2023    |Significant-Gravitas/Auto-GPT               |139979          |2023-04-15    |
|604826790  |2023    |base-org/chains                             |76626           |2023-02-23    |
|574523116  |2023    |f/awesome-chatgpt-prompts                   |72424           |2022-12-05    |
|527591471  |2023    |AUTOMATIC1111/stable-diffusion-webui        |69028           |2022-08-22    |
|596115135  |2023    |base-org/node                               |66117           |2023-02-23    |
|619859161  |2023    |twitter/the-algorithm                       |58771           |2023-03-31    |
|552661142  |2023    |langchain-ai/langchain                      |55116           |2023-07-23    |


In [76]:
top_repos_2023.sort(col("first_seen").desc()).limit(20).show(20)

-----------------------------------------------------------------------------------------------------------
|"REPO_ID"  |"YEAR"  |"REPO_NAME"                                         |"SUM_OF_STARS"  |"FIRST_SEEN"  |
-----------------------------------------------------------------------------------------------------------
|681242538  |2023    |romoguilhem/los-pimientos                           |1               |2023-08-21    |
|680969651  |2023    |smartdev6/marketplace                               |1               |2023-08-21    |
|681297664  |2023    |neoimpulse/Match3Game                               |1               |2023-08-21    |
|5504383    |2023    |mostlyserious/kadyns                                |1               |2023-08-21    |
|600460842  |2023    |Esildo/State_Machine_Project                        |1               |2023-08-21    |
|681267131  |2023    |shital0101/Searching                                |2               |2023-08-21    |
|680965354  |2023    |coding

In [88]:
#END