In [None]:
!wget https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz

In [None]:
!tar -xvf spark-3.1.2-bin-hadoop2.7.tgz

In [None]:
import os

# Environment variables naming the Java install and the Spark directory.
# The Spark path matches the tarball name extracted earlier, assuming the
# notebook's working directory is /content.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop2.7",
})

In [None]:
!pip install findspark

In [None]:
import findspark

# pyspark is not pip-installed in this notebook, so findspark.init() has to run
# before the pyspark import below. Presumably it locates Spark through the
# SPARK_HOME variable set in an earlier cell — confirm against findspark docs.
findspark.init()

from pyspark.sql import SparkSession

# Build the session used by every later cell (or pick up an existing one).
spark = (
    SparkSession.builder
    .appName("My DF")
    .getOrCreate()
)


# **Project Quiz**

In [None]:
from pyspark.sql.functions import sum, max, min, avg, count, col, lit, udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [None]:
# Download the OfficeDataProject.csv dataset that the cells below read
# from /content/OfficeDataProject.csv.
!wget "https://raw.githubusercontent.com/AISCIENCES/course-master-big-data-with-pyspark-and-aws/main/Code/03-Spark%20DFs/OfficeDataProject.csv"

In [None]:
# First pass: load the CSV with only the header option set and inspect it.
# As the printed schema shows, every column is read as a string at this point.
header_reader = spark.read.options(header="True")
df = header_reader.csv("/content/OfficeDataProject.csv")

df.show(5)
df.printSchema()

+-----------+-----------------+----------+-----+------+---+-----+
|employee_id|    employee_name|department|state|salary|age|bonus|
+-----------+-----------------+----------+-----+------+---+-----+
|       1000|        Nitz Leif| Marketing|   CA|  6131| 26|  543|
|       1001|  Melissia Dedman|   Finance|   AK|  4027| 43| 1290|
|       1002|Rudolph Barringer|        HR|   LA|  3122| 43| 1445|
|       1003|      Tamra Amber|  Accounts|   AK|  5717| 47| 1291|
|       1004|      Mullan Nitz|Purchasing|   CA|  5685| 34| 1394|
+-----------+-----------------+----------+-----+------+---+-----+
only showing top 5 rows

root
 |-- employee_id: string (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- age: string (nullable = true)
 |-- bonus: string (nullable = true)



In [None]:
# Column layout of OfficeDataProject.csv as (name, type, nullable) triples.
# Note: the schema printed after loading still reports nullable = true for
# every column, so the nullable flags here are not reflected in the loaded frame.
_FIELD_SPECS = [
    ("employee_id", StringType(), False),
    ("employee_name", StringType(), True),
    ("department", StringType(), False),
    ("state", StringType(), False),
    ("salary", IntegerType(), False),
    ("age", IntegerType(), False),
    ("bonus", IntegerType(), False),
]

schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in _FIELD_SPECS
])

In [None]:
# Second pass: re-read the same file with the explicit schema so that
# salary, age and bonus are loaded as integers instead of strings.
typed_reader = spark.read.schema(schema).options(header="True")
df = typed_reader.csv("/content/OfficeDataProject.csv")

df.show(5)
df.printSchema()

+-----------+-----------------+----------+-----+------+---+-----+
|employee_id|    employee_name|department|state|salary|age|bonus|
+-----------+-----------------+----------+-----+------+---+-----+
|       1000|        Nitz Leif| Marketing|   CA|  6131| 26|  543|
|       1001|  Melissia Dedman|   Finance|   AK|  4027| 43| 1290|
|       1002|Rudolph Barringer|        HR|   LA|  3122| 43| 1445|
|       1003|      Tamra Amber|  Accounts|   AK|  5717| 47| 1291|
|       1004|      Mullan Nitz|Purchasing|   CA|  5685| 34| 1394|
+-----------+-----------------+----------+-----+------+---+-----+
only showing top 5 rows

root
 |-- employee_id: string (nullable = true)
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- bonus: integer (nullable = true)



Print the total number of employees in the company

In [None]:
# Row count of the loaded DataFrame; the dataset has one row per employee_id in
# the preview, so this is taken as the number of employees.
df.count()

1000

Print the total number of departments in the company

In [None]:
# Reduce to the unique department values, then count them.
unique_departments = df.select("department").distinct()
unique_departments.count()

6

Print all the departments that are present in the company

In [None]:
# Display each department name once.
(
    df.select("department")
    .distinct()
    .show()
)

+----------+
|department|
+----------+
|     Sales|
|        HR|
|   Finance|
|Purchasing|
| Marketing|
|  Accounts|
+----------+



Print the total number of employees in each department

In [None]:
# Head-count per department.
per_department = df.groupBy("department")
per_department.agg(count("*").alias("total_num_emp")).show()

+----------+-------------+
|department|total_num_emp|
+----------+-------------+
|     Sales|          169|
|        HR|          171|
|   Finance|          162|
|Purchasing|          166|
| Marketing|          170|
|  Accounts|          162|
+----------+-------------+



Print the total number of employees in each state

In [None]:
# Head-count per state.
per_state = df.groupBy("state")
per_state.agg(count("*").alias("emp_InEach_state")).show()

+-----+----------------+
|state|emp_InEach_state|
+-----+----------------+
|   LA|             205|
|   CA|             205|
|   WA|             208|
|   NY|             173|
|   AK|             209|
+-----+----------------+



Print the total number of employees in each state in each department

In [None]:
# Head-count for every (state, department) pair, ordered by state and then
# department so the table reads alphabetically.
(
    df.groupBy("state", "department")
    .agg(count("*").alias("num_emp"))
    .sort("state", "department")
    .show()
)

+-----+----------+-------+
|state|department|num_emp|
+-----+----------+-------+
|   AK|  Accounts|     37|
|   AK|   Finance|     37|
|   AK|        HR|     25|
|   AK| Marketing|     42|
|   AK|Purchasing|     30|
|   AK|     Sales|     38|
|   CA|  Accounts|     35|
|   CA|   Finance|     35|
|   CA|        HR|     28|
|   CA| Marketing|     33|
|   CA|Purchasing|     32|
|   CA|     Sales|     42|
|   LA|  Accounts|     29|
|   LA|   Finance|     29|
|   LA|        HR|     41|
|   LA| Marketing|     26|
|   LA|Purchasing|     45|
|   LA|     Sales|     35|
|   NY|  Accounts|     34|
|   NY|   Finance|     31|
+-----+----------+-------+
only showing top 20 rows



Print the minimum and maximum salaries in each department, sorted by salary in ascending order

In [None]:
# Lowest salary in each department, listed from the smallest minimum upwards.
# `min` here is the Spark aggregate imported from pyspark.sql.functions.
dept_min_salary = df.groupBy("department").agg(min("salary").alias("min_sal"))
dept_min_salary.sort("min_sal").show()

+----------+-------+
|department|min_sal|
+----------+-------+
|   Finance|   1006|
|  Accounts|   1007|
|        HR|   1013|
| Marketing|   1031|
|     Sales|   1103|
|Purchasing|   1105|
+----------+-------+



In [None]:
# Highest salary in each department, listed from the smallest maximum upwards.
# `max` here is the Spark aggregate imported from pyspark.sql.functions.
dept_max_salary = df.groupBy("department").agg(max("salary").alias("max_sal"))
dept_max_salary.sort("max_sal").show()

+----------+-------+
|department|max_sal|
+----------+-------+
|  Accounts|   9890|
|   Finance|   9899|
| Marketing|   9974|
|     Sales|   9982|
|        HR|   9982|
|Purchasing|   9985|
+----------+-------+



Print the names of the employees in NY state in the Finance department whose bonus is greater than the average bonus of all employees in NY state

In [None]:
# Average bonus of the employees in NY only, which is what the exercise asks to
# compare against; the earlier version averaged over every state.
# collect()[0][0] pulls the single aggregate value back to the driver; it is
# expected to be None if no NY row has a bonus.
# (The module-level `global` statement had no effect and was removed.)
avgBonus = (
    df.filter(col("state") == "NY")
    .agg({"bonus": "avg"})
    .collect()[0][0]
)

In [None]:
def filterEmp(employee_name, state, department, bonus, threshold=None):
    """Return the employee's name if the row is an NY Finance employee above the bonus threshold.

    Args:
        employee_name: Name returned when the row qualifies.
        state: State code of the row; only "NY" qualifies.
        department: Department of the row; only "Finance" qualifies.
        bonus: Bonus of the row; may be None when the cell is null.
        threshold: Value the bonus must exceed. When omitted, the notebook-level
            ``avgBonus`` is used, which keeps existing four-argument calls working.

    Returns:
        ``employee_name`` when all conditions hold, otherwise None.
    """
    if threshold is None:
        threshold = avgBonus
    # A null bonus (or a missing threshold) can never be "greater than";
    # comparing None with a number would raise TypeError in Python 3.
    if bonus is None or threshold is None:
        return None
    # Short-circuiting `and` replaces the bitwise `&` on booleans.
    if state == "NY" and department == "Finance" and bonus > threshold:
        return employee_name
    return None

# Register the row-level check as a Spark UDF producing a string (or null).
filterEmpUDF = udf(filterEmp, StringType())

# Overwrite employee_name with the UDF result, then keep only the rows where a
# name came back, i.e. the rows that passed the check.
qualifying_name = filterEmpUDF(df.employee_name, df.state, df.department, df.bonus)
df1 = (
    df.withColumn("employee_name", qualifying_name)
    .filter(col("employee_name").isNotNull())
)
df1.show()




+-----------+--------------------+----------+-----+------+---+-----+
|employee_id|       employee_name|department|state|salary|age|bonus|
+-----------+--------------------+----------+-----+------+---+-----+
|       1035|       Vivan Sifford|   Finance|   NY|  1129| 35| 1261|
|       1073|      Herder Gallman|   Finance|   NY|  1988| 31| 1402|
|       1082|          Nena Rocha|   Finance|   NY|  3417| 25| 1647|
|       1087|       Leif Lemaster|   Finance|   NY|  8642| 45| 1782|
|       1100|Ellingsworth Meli...|   Finance|   NY|  7845| 32| 1358|
|       1127|        Escoto Gilma|   Finance|   NY|  3426| 41| 1285|
|       1161|     Georgeanna Laub|   Finance|   NY|  2469| 26| 1679|
|       1175|     Durio Tenenbaum|   Finance|   NY|  2253| 42| 1684|
|       1180|       Juliana Grigg|   Finance|   NY|  8178| 42| 1617|
|       1215|        Tiffani Benz|   Finance|   NY|  1665| 41| 1969|
|       1220|          Nitz Ilana|   Finance|   NY|  2443| 50| 1342|
|       1342|   Phylicia Antonina|

Raise the salary by 500 for all employees whose age is greater than 45

In [None]:
def raiseSalary(age, salary):
    """Return the salary raised by 500 when the employee is older than 45.

    Args:
        age: Employee age; may be None when the cell is null.
        salary: Current salary; may be None when the cell is null.

    Returns:
        ``salary + 500`` if ``age`` is greater than 45, otherwise ``salary``
        unchanged (including None when the salary is null).
    """
    # Null inputs cannot be compared or added to; pass the salary through
    # instead of raising TypeError inside the UDF.
    if age is None or salary is None:
        return salary
    if age > 45:
        return salary + 500
    return salary

# Register raiseSalary as a Spark UDF whose result is an integer column.
raiseSalaryUDF = udf(raiseSalary, IntegerType())

# Replace the salary column with the adjusted value and preview the result.
adjusted_salary = raiseSalaryUDF(df.age, df.salary)
df2 = df.withColumn("salary", adjusted_salary)
df2.show()

+-----------+-------------------+----------+-----+------+---+-----+
|employee_id|      employee_name|department|state|salary|age|bonus|
+-----------+-------------------+----------+-----+------+---+-----+
|       1000|          Nitz Leif| Marketing|   CA|  6131| 26|  543|
|       1001|    Melissia Dedman|   Finance|   AK|  4027| 43| 1290|
|       1002|  Rudolph Barringer|        HR|   LA|  3122| 43| 1445|
|       1003|        Tamra Amber|  Accounts|   AK|  6217| 47| 1291|
|       1004|        Mullan Nitz|Purchasing|   CA|  5685| 34| 1394|
|       1005|      Zollner Karie|  Accounts|   CA|  2843| 27| 1078|
|       1006|Kaczorowski Zollner|     Sales|   CA|  7201| 21| 1834|
|       1007|      Nakano Locust| Marketing|   LA|  3444| 23| 1823|
|       1008|  Recalde Kensinger|  Accounts|   LA|  4204| 48| 1330|
|       1009|        Imai Hallie|  Accounts|   AK|  5061| 38| 1557|
|       1010|    Debroah Gallman|  Accounts|   NY|  9308| 35|  817|
|       1011|   Barringer Escoto|Purchasing|   W

Create a DataFrame of all employees whose age is greater than 45 and save it to a file

In [None]:
# Keep only the employees older than 45 ...
df3 = df.filter(df.age > 45)

# ... and write them as CSV with a header row, replacing any previous output
# at the same location.
over_45_writer = df3.write.mode("overwrite").options(header="True")
over_45_writer.csv("/content/OfficeDataProject/output")

In [None]:
# Read the saved CSV output back (header only, no schema) to check what was written.
saved_output_dir = "/content/OfficeDataProject/output"
df4 = spark.read.options(header="True").csv(saved_output_dir)

df4.show(5)

+-----------+------------------+----------+-----+------+---+-----+
|employee_id|     employee_name|department|state|salary|age|bonus|
+-----------+------------------+----------+-----+------+---+-----+
|       1003|       Tamra Amber|  Accounts|   AK|  5717| 47| 1291|
|       1008| Recalde Kensinger|  Accounts|   LA|  3704| 48| 1330|
|       1011|  Barringer Escoto|Purchasing|   WA|  1685| 49| 1706|
|       1018|Vankirk Jacquelyne|Purchasing|   NY|  8636| 47| 1192|
|       1025|   Dionne Lemaster|     Sales|   AK|  5134| 48| 1356|
+-----------+------------------+----------+-----+------+---+-----+
only showing top 5 rows

