In [None]:
#Author: Shlomi Kiko
#Topic: Basic ETL example using the Python library PySpark
#Linkedin: https://www.linkedin.com/in/shlomikiko/

In [29]:
#Import libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
import gc
import os
import sys

In [30]:
#Set PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON to the interpreter running this
#notebook so Spark launches Python with the same executable
for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_name] = sys.executable

In [31]:
#Start the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('ETL_Spark')
    .getOrCreate()
)

In [32]:
#Database, schema and fully qualified table used by the read and write cells
database = 'BikeStores'
schema = 'dbo'
table1 = f'{schema}.Test'

#JDBC driver class and connection URL (integrated security, so no credentials in the URL)
driver = 'com.microsoft.sqlserver.jdbc.SQLServerDriver'
connection = ';'.join([
    'jdbc:sqlserver://localhost:1433',
    f'databaseName={database}',
    'integratedSecurity=true',
    'trustServerCertificate=true',
])
#For SQL Server authentication the URL takes user/password properties instead of integratedSecurity, e.g.
#  jdbc:sqlserver://<host>:<port>;databaseName=<db>;user=<user>;password=<password>;encrypt=true;trustServerCertificate=true

In [33]:
#Load the source table into a DataFrame over JDBC
jdbc_read_options = {
    'driver': driver,
    'url': connection,
    'dbtable': table1,
}
df_test = spark.read.format('jdbc').options(**jdbc_read_options).load()

In [34]:
#Print the column names, data types and nullability Spark derived for the source table
df_test.printSchema()

root
 |-- brand_id: integer (nullable = true)
 |-- brand_name: string (nullable = true)



In [35]:
#Preview the source rows (first 20 rows, long values truncated - the show() defaults)
df_test.show(n=20, truncate=True)

+--------+------------+
|brand_id|  brand_name|
+--------+------------+
|       1|     Electra|
|       2|        Haro|
|       3|      Heller|
|       4| Pure Cycles|
|       5|     Ritchey|
|       6|     Strider|
|       7|Sun Bicycles|
|       8|       Surly|
|       9|        Trek|
|      10|      Shlomi|
+--------+------------+



In [36]:
#Register the DataFrame as the temporary view 'TestView' so it can be queried with Spark SQL
df_test.createOrReplaceTempView('TestView')

In [37]:
#Query the temp view with Spark SQL, keeping only distinct rows, into a new DataFrame
distinct_rows_query = """SELECT DISTINCT * FROM TestView"""
testView = spark.sql(distinct_rows_query)

In [38]:
#Preview the de-duplicated rows (first 20 rows, long values truncated - the show() defaults)
testView.show(n=20, truncate=True)

+--------+------------+
|brand_id|  brand_name|
+--------+------------+
|       5|     Ritchey|
|       1|     Electra|
|       9|        Trek|
|       7|Sun Bicycles|
|       3|      Heller|
|      10|      Shlomi|
|       2|        Haro|
|       8|       Surly|
|       4| Pure Cycles|
|       6|     Strider|
+--------+------------+



In [43]:
#Write the de-duplicated rows to the destination table.
#The destination (table1) is the same table testView reads from, and Spark evaluates
#lazily: with mode('overwrite') + truncate the table is emptied BEFORE the source query
#runs, so writing testView directly would re-read an empty table and lose every row.
#localCheckpoint(eager=True) materialises the rows and cuts the lineage back to the JDBC
#source first, so the write no longer depends on the table's contents.
#NOTE: local checkpoints are held in executor storage and are not fault tolerant; for
#production loads, write to a staging table instead of overwriting the source in place.
#The driver option is passed explicitly to match the read cell.
testView.localCheckpoint(eager=True).write \
        .format('jdbc') \
        .mode('overwrite') \
        .option('truncate', 'true') \
        .option('driver', driver) \
        .option('url', connection) \
        .option('dbtable', table1) \
        .save()

In [44]:
#Drop the notebook's references to both DataFrames
del df_test
del testView

In [45]:
#Run a full garbage collection pass; the cell output is the value gc.collect() returns
#(the number of unreachable objects found)
gc.collect()

1992