### Finding locally installed Spark and importing PySpark libraries

In [1]:
import findspark
findspark.init("/usr/local/spark/")
from pyspark.sql import SparkSession, functions as func

### Creating Spark Session

In [2]:
spark = SparkSession.builder.appName('task').getOrCreate()

### Reading 5 million data CSV with Spark by loading the file

In [3]:
df = spark.read.format("csv").option("header","true").load("/home/ubuntu/Hr5m.csv").fillna(0)["Emp ID", "First Name", "Last Name", "Gender", "E Mail", "Month Name of Joining", "SSN", "County", "State", "Region", "City", "Zip", "Salary", func.regexp_replace(func.col("Last % Hike"), "%", "").alias("Salary Hike in %")]
df.show(5)

+------+----------+---------+------+--------------------+---------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|Emp ID|First Name|Last Name|Gender|              E Mail|Month Name of Joining|        SSN|              County|State| Region|      City|  Zip|Salary|Salary Hike in %|
+------+----------+---------+------+--------------------+---------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|742048|    Lizeth|   Mccoll|     F|lizeth.mccoll@ibm...|              January|171-86-6830|               Stark|   OH|Midwest|  Alliance|44601|147446|              14|
|671135| Argentina|     Hern|     F|argentina.hern@nt...|                April|083-02-3078|District of Columbia|   DC|  South|Washington|20411|129174|               8|
|965851|    Damian|  Patillo|     M|damian.patillo@ou...|             December|326-11-9852|              Fresno|   CA|   West|    Burrel|93607|158746|          

### Checking the total records in the PySpark Dataframe

In [6]:
print(f"Total records present: {df.count()}")

Total records present: 5000000


### Checking the data types

In [4]:
df.dtypes

[('Emp ID', 'string'),
 ('First Name', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('Month Name of Joining', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'string'),
 ('Salary', 'string'),
 ('Salary Hike in %', 'string')]

### Casting 'string' value to 'int'

In [5]:
df = df.withColumn("Emp ID", df["Emp ID"].cast("int"))
df = df.withColumn("Zip", df["Zip"].cast("int"))
df = df.withColumn("Salary", df["Salary"].cast("int"))
df = df.withColumn("Salary Hike in %", df["Salary Hike in %"].cast("int"))
df.dtypes

[('Emp ID', 'int'),
 ('First Name', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('Month Name of Joining', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike in %', 'int')]

### Checking if the changes have been made successfully

In [10]:
df.show(5)

+------+----------+---------+------+--------------------+---------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|Emp ID|First Name|Last Name|Gender|              E Mail|Month Name of Joining|        SSN|              County|State| Region|      City|  Zip|Salary|Salary Hike in %|
+------+----------+---------+------+--------------------+---------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|742048|    Lizeth|   Mccoll|     F|lizeth.mccoll@ibm...|              January|171-86-6830|               Stark|   OH|Midwest|  Alliance|44601|147446|              14|
|671135| Argentina|     Hern|     F|argentina.hern@nt...|                April|083-02-3078|District of Columbia|   DC|  South|Washington|20411|129174|               8|
|965851|    Damian|  Patillo|     M|damian.patillo@ou...|             December|326-11-9852|              Fresno|   CA|   West|    Burrel|93607|158746|          

### Checking the Schema

In [11]:
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E Mail: string (nullable = true)
 |-- Month Name of Joining: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Salary Hike in %: integer (nullable = true)



### Task 1: Count the number of employees in each County, Region and City

In [70]:
task11c = df.agg(func.countDistinct("County").alias("No. of Counties"))
task11c.show()

+---------------+
|No. of Counties|
+---------------+
|           2752|
+---------------+



In [29]:
task11 = df.groupBy("County").count().sort("County")
task11.show(5)

+---------+-----+
|   County|count|
+---------+-----+
|    Aaron|   90|
|Abbeville|  509|
|   Acadia| 1373|
| Accomack| 4611|
|      Ada| 5104|
+---------+-----+
only showing top 5 rows



In [14]:
task12c = df.agg(func.countDistinct("Region").alias("No. of Regions"))
task12c.show()

+--------------+
|No. of Regions|
+--------------+
|             4|
+--------------+



In [15]:
task12 = df.groupBy("Region").count().sort("Region")
task12.show()

+---------+-------+
|   Region|  count|
+---------+-------+
|  Midwest|1372439|
|Northeast| 886984|
|    South|1855656|
|     West| 884921|
+---------+-------+



In [17]:
task131 = df.agg(func.countDistinct("City").alias("No. of Cities"))
task131.show()

+-------------+
|No. of Cities|
+-------------+
|        19206|
+-------------+



In [16]:
task13 = df.groupBy("City").count().sort("City")
task13.show(5)

+----------+-----+
|      City|count|
+----------+-----+
|     Aaron|   90|
|Aaronsburg|   96|
| Abbeville|  681|
|     Abbot|  139|
|Abbotsford|  118|
+----------+-----+
only showing top 5 rows



### Task 2: Generate employee summary

In [18]:
task2 = df.sort("Emp ID")
task2.show(5)

+------+----------+---------+------+--------------------+---------------------+-----------+------+-----+-------+-----------+-----+------+----------------+
|Emp ID|First Name|Last Name|Gender|              E Mail|Month Name of Joining|        SSN|County|State| Region|       City|  Zip|Salary|Salary Hike in %|
+------+----------+---------+------+--------------------+---------------------+-----------+------+-----+-------+-----------+-----+------+----------------+
|111111|     Riley|   Isbell|     M|riley.isbell@gmai...|                  May|066-02-8823|Sabine|   LA|  South|    Florien|71429|115049|              26|
|111111|    Danuta|Witkowski|     F|danuta.witkowski@...|                  May|506-57-4157|   Bay|   FL|  South|Panama City|32403|142276|              11|
|111111|    Ursula|  Ribeiro|     F|ursula.ribeiro@ao...|             February|504-45-4194|Ottawa|   MI|Midwest| Ferrysburg|49409| 68056|              11|
|111111|    Delois| Lamberth|     F|delois.lamberth@h...|             

### Task 3: Generate employee summary and ordering by Gender and Salary

In [19]:
task3 = df.orderBy(["Gender", "Salary"])
task3.show(5)

+------+----------+-----------+------+--------------------+---------------------+-----------+----------+-----+-------+------------+-----+------+----------------+
|Emp ID|First Name|  Last Name|Gender|              E Mail|Month Name of Joining|        SSN|    County|State| Region|        City|  Zip|Salary|Salary Hike in %|
+------+----------+-----------+------+--------------------+---------------------+-----------+----------+-----+-------+------------+-----+------+----------------+
|571129|  Lakiesha|    Nolting|     F|lakiesha.nolting@...|                March|141-23-4908|   Larimer|   CO|   West|  Glen Haven|80532| 40000|              21|
|828071| Madelaine|   Barrette|     F|madelaine.barrett...|             February|545-99-1053|     Aiken|   SC|  South|       Aiken|29801| 40000|              26|
|631239|   Tijuana|     Ellman|     F|tijuana.ellman@ho...|                  May|730-28-6351|      Hoke|   NC|  South|     Raeford|28376| 40000|              19|
|506893|    Ariana|Cumberbat

### Task 4: Summerize the number of employee joined and hikes granted based on month

In [20]:
task41 = df.groupBy("Month Name of Joining").count()
task41.show()

+---------------------+------+
|Month Name of Joining| count|
+---------------------+------+
|                 July|450981|
|             November|392382|
|             February|382263|
|              January|413247|
|                March|422987|
|              October|400048|
|                  May|434522|
|               August|468295|
|                April|414484|
|                 June|427424|
|             December|408721|
|            September|384646|
+---------------------+------+



In [22]:
task131 = df.agg(func.countDistinct("Month Name of Joining","Salary Hike in %").alias("No. of Hikes"))
task131.show()

+------------+
|No. of Hikes|
+------------+
|         372|
+------------+



In [21]:
task42 = df.groupBy("Month Name of Joining", "Salary Hike in %").count().sort("Month Name of Joining")
task42.show(5)

+---------------------+----------------+-----+
|Month Name of Joining|Salary Hike in %|count|
+---------------------+----------------+-----+
|                April|              13|13471|
|                April|              24|13406|
|                April|               5|13213|
|                April|              23|13585|
|                April|              18|13322|
+---------------------+----------------+-----+
only showing top 5 rows



### Task 5: Generate employee summary and ordering by Salary

In [23]:
task5 = df.orderBy(["Salary"])
task5.show(5)

+------+----------+-----------+------+--------------------+---------------------+-----------+----------+-----+---------+------------+-----+------+----------------+
|Emp ID|First Name|  Last Name|Gender|              E Mail|Month Name of Joining|        SSN|    County|State|   Region|        City|  Zip|Salary|Salary Hike in %|
+------+----------+-----------+------+--------------------+---------------------+-----------+----------+-----+---------+------------+-----+------+----------------+
|828071| Madelaine|   Barrette|     F|madelaine.barrett...|             February|545-99-1053|     Aiken|   SC|    South|       Aiken|29801| 40000|              26|
|336973|      Kirk|       Deem|     M| kirk.deem@gmail.com|                April|623-85-7102|  Ouachita|   AR|    South|      Louann|71751| 40000|              28|
|832197|     Vikki|     Grosso|     F|vikki.grosso@hotm...|              October|131-98-5564|   McHenry|   ND|  Midwest|      Bantry|58713| 40000|              14|
|680938|   Rosar

In [24]:
spark.stop()
print("SparkSession Stopped Successfully!")

SparkSession Stopped Successfully!
