

---

**This notebook connects you to your Google Drive.  Be sure to alter the path to match where you store the data.**


---



In [1]:
# Activate Spark in our Colab notebook.
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example: 'spark-3.2.2'
spark_version = 'spark-3.2.2'
# spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [90.7 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Get:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bi

In [2]:
# Import the Spark dependencies.
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Build (or reuse) the notebook's SparkSession, labelled "SparkSQL".
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [3]:
# Mount Google Drive at /content/drive so the data files can be read from it.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
# Full paths to the CSV files on the mounted Drive.
# Alter these to match where you store the data.

# Employee dataset.
employee_data_filePath = "/content/drive/MyDrive/Data/employee_data.csv"
# Lookup table for the JobRole column.
lookup_Jobrole_filePath = "/content/drive/MyDrive/Data/JobRole_lookup.csv"

In [6]:
# If your data has a header row, Spark can take the column names from it.
# We also save a step by printing the schema in the same cell.
employee_df = spark.read.csv(employee_data_filePath, header=True)

employee_df.printSchema()
# Better, but notice that every column is a string, so let's do one more thing.

root
 |-- Age: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: string (nullable = true)
 |-- EmployeeNumber: string (nullable = true)
 |-- EnvironmentSatisfaction: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: string (nullable = true)
 |-- MonthlyRate: string (nullable = true)
 |-- NumCompaniesWorked: string (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string (nullable = tr

In [7]:
# Read with headers and let Spark "infer" the schema from the data.
employee_df = spark.read.csv(
    employee_data_filePath,
    header=True,
    inferSchema=True,
)

employee_df.printSchema()
# This gives us a much better output.
# Note: this isn't always enough; there may be times when you have to define
# the schema yourself, or even convert the data types after reading.
# You will get much better load performance if you provide a schema.

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: integer (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string

In [8]:
# Schema of the Spark DataFrame we intend to build from the employee CSV.
# Each entry is (column name, Spark type); every column is declared nullable.
# NOTE(review): "YearsWithCurr7" looks like a truncated or mistyped column
# name -- confirm against the CSV header before renaming it.
empSchema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in [
        ("Age", IntegerType),
        ("Attrition", StringType),
        ("BusinessTravel", StringType),
        ("DailyRate", IntegerType),
        ("Department", StringType),
        ("DistanceFromHome", IntegerType),
        ("Education", IntegerType),
        ("EducationField", StringType),
        ("EmployeeCount", IntegerType),
        ("EmployeeNumber", IntegerType),
        ("EnvironmentSatisfaction", IntegerType),
        ("Gender", StringType),
        ("HourlyRate", IntegerType),
        ("JobInvolvement", IntegerType),
        ("JobLevel", IntegerType),
        ("JobRole", IntegerType),
        ("JobSatisfaction", IntegerType),
        ("MaritalStatus", StringType),
        ("MonthlyIncome", IntegerType),
        ("MonthlyRate", IntegerType),
        ("NumCompaniesWorked", IntegerType),
        ("Over18", StringType),
        ("OverTime", StringType),
        ("PercentSalaryHike", IntegerType),
        ("PerformanceRating", IntegerType),
        ("RelationshipSatisfaction", IntegerType),
        ("StandardHours", IntegerType),
        ("StockOptionLevel", IntegerType),
        ("TotalWorkingYears", IntegerType),
        ("TrainingTimesLastYear", IntegerType),
        ("WorkLifeBalance", IntegerType),
        ("YearsAtCompany", IntegerType),
        ("YearsInCurrentRole", IntegerType),
        ("YearsSinceLastPromotion", IntegerType),
        ("YearsWithCurr7", IntegerType),
    ]
])


In [9]:
# Read the file again, this time applying the schema defined in empSchema
# instead of asking Spark to infer one.
employee_df = spark.read.csv(
    employee_data_filePath,
    schema=empSchema,
    header=True,
)

employee_df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: integer (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string

In [10]:
# Let's look at an example of querying our data with PySpark.
# show() prints the first 20 rows by default; the row count is spelled out here.
employee_df.show(n=20)

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurrentR

In [11]:
# A simple PySpark filter: keep only the rows whose Education value is 1.
employee_df.where(employee_df["Education"] == 1).show()

+---+---------+-----------------+---------+--------------------+----------------+---------+----------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|  EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurr

In [12]:
# Let's sum up the DailyRate by JobLevel and make sure the output column has
# an easy-to-read name.
(
    employee_df
    .groupBy("JobLevel")
    .sum("DailyRate")
    .withColumnRenamed("sum(DailyRate)", "totalRate")
    .show()
)

+--------+---------+
|JobLevel|totalRate|
+--------+---------+
|       1|   436490|
|       3|   180888|
|       5|    56266|
|       4|    82092|
|       2|   423918|
+--------+---------+



While neither of these was terribly difficult, imagine you had very little Python experience but TONS of SQL experience.  We can help the SQL expert by registering the DataFrame as a view that can be queried with SQL.

In [13]:
# Tell Spark that we want to be able to query our DataFrame with SQL.
# We do this by calling .createOrReplaceTempView(<temptableName>) on the DataFrame.
employee_df.createOrReplaceTempView("employee")

In [14]:
# The temp view lets us query our DataFrame using standard ANSI SQL.
(
    spark
    .sql("Select * from employee")
    .show()
)

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurrentR

In [15]:
# Now let's do the same simple filter using SQL.
# The PySpark example from above,
#   employee_df.filter(employee_df.Education==1).show()
# is written as:
spark.sql(
"""
SELECT * FROM employee
WHERE Education =1
"""
).show()


+---+---------+-----------------+---------+--------------------+----------------+---------+----------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|  EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurr

In [16]:
# The PySpark aggregation from above,
#   (employee_df.groupby("JobLevel")
#             .sum("DailyRate")
#             .withColumnRenamed("sum(DailyRate)","totalRate")
#             .show()
#   )
# is written in SQL as follows ("group by 1" groups by the first selected column):

spark.sql(
"""
select JobLevel, sum(DailyRate) as totalRate
from employee
group by 1"""
).show()

+--------+---------+
|JobLevel|totalRate|
+--------+---------+
|       1|   436490|
|       3|   180888|
|       5|    56266|
|       4|    82092|
|       2|   423918|
+--------+---------+



In [17]:
# Read in the lookup table for the JobRole column, taking column names from
# the header row and letting Spark infer the column types.
lookupRole_df = spark.read.csv(
    lookup_Jobrole_filePath,
    header=True,
    inferSchema=True,
)

In [18]:
# Verify the schema looks okay: print the column names and the types Spark
# inferred for the lookup table.
lookupRole_df.printSchema()

root
 |-- role_id: integer (nullable = true)
 |-- roleName: string (nullable = true)



In [19]:
# Register the lookup DataFrame as the temp view "RoleLookup" so SQL can query it.
lookupRole_df.createOrReplaceTempView("RoleLookup")

In [20]:
# Query the RoleLookup view to confirm it really is our lookup table.
(
    spark
    .sql("select * from RoleLookup")
    .show()
)

+-------+--------------------+
|role_id|            roleName|
+-------+--------------------+
|      1|     Sales Executive|
|      2|Manufacturing Dir...|
|      3|Laboratory Techni...|
|      4|Sales Representative|
|      5|Healthcare Repres...|
|      6|  Research Scientist|
|      7|             Manager|
|      8|   Research Director|
|      9|     Human Resources|
+-------+--------------------+



In [21]:
# Then, we join the employee_df and lookupRole_df DataFrames, matching each
# employee's JobRole to the lookup table's role_id (inner join).
joined_df = employee_df.join(
    lookupRole_df,
    on=employee_df.JobRole == lookupRole_df.role_id,
    how="inner",
)
joined_df.show()

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+-------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|Y

In [22]:
# And, finally we can create our final DataFrame using SQL.
# The query reads from the 'employee' and 'RoleLookup' temp views registered
# earlier in this notebook and adds the readable roleName to every employee row.
# .show() is called on its own line: it only prints and returns None, so
# chaining it onto the assignment would leave Sql_joined_df empty.
Sql_joined_df=spark.sql("""
select a.*, roleName
from employee a
  Inner join
    RoleLookup b
      on a.JobRole=b.role_id""")
Sql_joined_df.show()

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCo