<a href="https://colab.research.google.com/github/Sankytanky100/Data-Engineering/blob/main/Pyspark_in_Colab_Tutorial_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
!pip install pyspark




In [2]:
# Check Java version (confirms a `java` executable is reachable for Spark)
!java -version

# Check PySpark version (prints the Spark banner, including the Spark and
# Scala versions of the installed distribution)
!pyspark --version


openjdk version "11.0.25" 2024-10-15
OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.3
      /_/
                        
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 11.0.25
Branch HEAD
Compiled by user haejoon.lee on 2024-09-09T05:20:05Z
Revision 32232e9ed33bb16b93ad58cfde8b82e0f07c0970
Url https://github.com/apache/spark
Type --help for more information.


In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession that the rest of the notebook works through.
# getOrCreate() reuses a session that is already running, if there is one.
spark = (
    SparkSession.builder
    .appName("PySpark on Google Colab")
    .getOrCreate()
)

# Print the session object as a quick sanity check that it exists.
print("SparkSession created:", spark)


SparkSession created: <pyspark.sql.session.SparkSession object at 0x79f81054e800>


In [6]:
# Distribute a handful of hand-written page-view records as an RDD.
# Each record is a list of four values; the notebook later names them
# language_code, title, date and count.
sample_page_views = spark.sparkContext.parallelize(
    [
        ["en", "Statue_of_Liberty", "2022-01-01", 263],
        ["en", "Replicas_of_the_Statue_of_Liberty", "2022-01-01", 11],
        ["en", "Statue_of_Lucille_Ball", "2022-01-01", 6],
        ["en", "Statue_of_Liberty_National_Monument", "2022-01-01", 4],
        ["en", "Statue_of_Liberty_play", "2022-01-01", 3],
    ]
)

In [7]:
# Turn the RDD into a DataFrame by supplying one column name per position.
sample_page_views_df = sample_page_views.toDF(
    schema=['language_code', 'title', 'date', 'count']
)

# Display the first 5 rows without shortening long cell values.
sample_page_views_df.show(n=5, truncate=False)

+-------------+-----------------------------------+----------+-----+
|language_code|title                              |date      |count|
+-------------+-----------------------------------+----------+-----+
|en           |Statue_of_Liberty                  |2022-01-01|263  |
|en           |Replicas_of_the_Statue_of_Liberty  |2022-01-01|11   |
|en           |Statue_of_Lucille_Ball             |2022-01-01|6    |
|en           |Statue_of_Liberty_National_Monument|2022-01-01|4    |
|en           |Statue_of_Liberty_play             |2022-01-01|3    |
+-------------+-----------------------------------+----------+-----+



In [8]:
# Go back from the DataFrame to an RDD; each element is a Row carrying the
# DataFrame's column names (language_code, title, date, count).
sample_page_views_rdd_restored = sample_page_views_df.rdd

# show restored RDD
# collect() brings every element back to the driver; fine here because the
# sample holds only five rows.
sample_page_views_rdd_restored.collect()

[Row(language_code='en', title='Statue_of_Liberty', date='2022-01-01', count=263),
 Row(language_code='en', title='Replicas_of_the_Statue_of_Liberty', date='2022-01-01', count=11),
 Row(language_code='en', title='Statue_of_Lucille_Ball', date='2022-01-01', count=6),
 Row(language_code='en', title='Statue_of_Liberty_National_Monument', date='2022-01-01', count=4),
 Row(language_code='en', title='Statue_of_Liberty_play', date='2022-01-01', count=3)]

**Spark DataFrames from External Sources**

In [9]:
# Load the CSV with the reader's default options (no header or schema
# options are set here).
# The relative path is resolved against the working directory; the recorded
# run looked for it under /content, so the file has to be placed there
# before this cell is executed.
wiki_uniq_df = spark.read.csv('wiki_uniq_march_2022.csv')

# Display the first 10 rows without shortening long cell values.
wiki_uniq_df.show(n=10, truncate=False)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/wiki_uniq_march_2022.csv.

In [None]:
# Read the CSV again, this time treating its first line as column names.
# No other cell in the notebook assigns wiki_uniq_w_header_df, so it is
# defined here; without this the cell raises NameError on a fresh kernel.
# NOTE(review): the header-only read is inferred from the variable name and
# from the next cell, which adds inferSchema on top — confirm.
wiki_uniq_w_header_df = (
    spark.read
    .option('header', True)
    .csv('wiki_uniq_march_2022.csv')
)

# show the data types
wiki_uniq_w_header_df.dtypes

In [None]:
# Read the CSV with a header row and let Spark infer each column's type
# from the data.
wiki_uniq_w_schema_df = spark.read.csv(
    'wiki_uniq_march_2022.csv',
    header=True,
    inferSchema=True,
)

# Inspect the resulting (column name, type) pairs.
wiki_uniq_w_schema_df.dtypes

In [None]:
# Obtain the SparkSession; getOrCreate() hands back the running session
# when one already exists.
spark = (
    SparkSession.builder
    .appName("learning_spark_sql")
    .getOrCreate()
)

# Load the Wikipedia unique-visitors dataset that includes site_type:
# comma-delimited, first line is the header, column types inferred.
wiki_uniq_df = (
    spark.read
    .options(header=True, delimiter=',', inferSchema=True)
    .csv("wiki_uniq_march_2022_w_site_type.csv")
)

# Register the DataFrame as a temporary view so it can be queried with SQL
# under the name uniq_visitors_march.
wiki_uniq_df.createOrReplaceTempView('uniq_visitors_march')

In [None]:
# uniq_views_df is read by this cell and the cells that follow, but nothing
# in the notebook assigns it, so a fresh kernel fails here with NameError.
# Bind it to the visitors DataFrame loaded earlier.
# NOTE(review): assumes wiki_uniq_df is the intended source frame — confirm.
uniq_views_df = wiki_uniq_df

# Compute summary statistics for the DataFrame's columns.
uniq_views_df_desc = uniq_views_df.describe()

# show summary
uniq_views_df_desc.show()

In [None]:
# Keep only the human-visitor figures by dropping the total and bot columns.
uniq_counts_human_df = uniq_views_df.drop(
    'total_visitor_count',
    'uniq_bot_visitors',
)

# Display the first 5 rows.
uniq_counts_human_df.show(n=5)

In [None]:
# Give the remaining visitor column a more descriptive name.
uniq_counts_final_df = uniq_counts_human_df.withColumnRenamed(
    'uniq_human_visitors',
    'unique_site_visitors',
)

# Display the first 5 rows.
uniq_counts_final_df.show(n=5)

In [None]:
# Print the schema (column names and data types) of uniq_views_df.
uniq_views_df.printSchema()

In [None]:
# Obtain the SparkSession. getOrCreate() returns the session that is already
# running when there is one, so this does not start a second session.
spark = (
    SparkSession.builder
    .appName("learning_spark_sql")
    .getOrCreate()
)



In [None]:
# Keep only the rows whose language_code is 'ar'.
ar_site_visitors = wiki_uniq_df.filter(wiki_uniq_df['language_code'] == 'ar')

# Display the filtered rows.
ar_site_visitors.show()

In [None]:
# Domain and human-visitor count for the 'ar' rows only.
# The filter runs before the projection: language_code is not one of the
# selected columns, so the predicate is applied while that column is still
# part of the DataFrame instead of relying on Spark to resolve a column the
# select has already dropped.
ar_visitors_slim = (
    wiki_uniq_df
    .filter(wiki_uniq_df.language_code == 'ar')
    .select(['domain', 'uniq_human_visitors'])
)

# show the DataFrame
ar_visitors_slim.show()


In [None]:
# Total human visitors per site_type, largest total first.
# The grouped sum() produces a column named 'sum(uniq_human_visitors)',
# which is the name the ordering step refers to.
top_visitors_site_type = (
    wiki_uniq_df
    .select(['site_type', 'uniq_human_visitors'])
    .groupBy('site_type')
    .sum()
    .orderBy('sum(uniq_human_visitors)', ascending=False)
)

# Display the aggregated result.
top_visitors_site_type.show()

**Querying with SQL & Saving DataFrames with PySpark**

In [None]:
# SQL counterpart of the DataFrame filter on language_code == 'ar'.
# uniq_visitors_march is the temporary view registered from wiki_uniq_df.
ar_site_visitors_qry = """
    SELECT * FROM uniq_visitors_march
    WHERE language_code = 'ar';
"""

# Run the query and display the result without shortening cell values.
spark.sql(ar_site_visitors_qry).show(truncate=False)

In [None]:
# Same 'ar' filter as a SQL query, returning only the domain and
# human-visitor columns from the uniq_visitors_march temporary view.
ar_site_visitors_slim_qry = """
    SELECT domain, uniq_human_visitors
    FROM uniq_visitors_march
    WHERE language_code = 'ar';
"""

# Run the query and display the result without shortening cell values.
spark.sql(ar_site_visitors_slim_qry).show(truncate=False)

In [None]:
# Total human visitors per site_type in SQL, largest total first, read from
# the uniq_visitors_march temporary view.
site_top_type_qry = """
    SELECT site_type, SUM(uniq_human_visitors)
    FROM uniq_visitors_march
    GROUP BY site_type
    ORDER BY SUM(uniq_human_visitors) DESC;
"""

# Run the query and display the result without shortening cell values.
spark.sql(site_top_type_qry).show(truncate=False)

In [None]:
# Narrow the dataset to the two columns that will be written to disk.
uniq_human_visitors_df = wiki_uniq_df.select('domain', 'uniq_human_visitors')

# Display the projected DataFrame.
uniq_human_visitors_df.show()

In [None]:
# Save the DataFrame as CSV under ./results/csv/uniq_human_visitors/,
# replacing whatever an earlier run left at that path.
uniq_human_visitors_df.write.mode("overwrite").csv('./results/csv/uniq_human_visitors/')

In [None]:
# Save the DataFrame as Parquet under ./results/pq/uniq_human_visitors/,
# replacing whatever an earlier run left at that path.
uniq_human_visitors_df.write.mode("overwrite").parquet('./results/pq/uniq_human_visitors/')

In [None]:
# Shut down the Spark session now that all reads, queries and writes are done.
spark.stop()
