In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Finding locally installed Spark and importing PySpark libraries

In [4]:
import findspark
findspark.init("/usr/local/spark/")
from pyspark.sql import SparkSession, functions as func

### Creating Spark Session

In [5]:
spark = SparkSession.builder.appName('task').getOrCreate()

### Reading 5 million data CSV with Spark by loading the file

In [6]:
df = spark.read.format("csv").option("header","true").load("/home/ubuntu/Hr5m.csv").fillna(0)["Emp ID", "Month Name of Joining", "Last Name", "Gender", "E Mail", "SSN", "County", "State", "Region", "City", "Zip", "Salary", func.regexp_replace(func.col("Last % Hike"), "%", "").alias("Salary Hike")]
df.show(5)

+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+-------+----------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|              County|State| Region|      City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+-------+----------+-----+------+-----------+
|742048|              January|   Mccoll|     F|lizeth.mccoll@ibm...|171-86-6830|               Stark|   OH|Midwest|  Alliance|44601|147446|         14|
|671135|                April|     Hern|     F|argentina.hern@nt...|083-02-3078|District of Columbia|   DC|  South|Washington|20411|129174|          8|
|965851|             December|  Patillo|     M|damian.patillo@ou...|326-11-9852|              Fresno|   CA|   West|    Burrel|93607|158746|          8|
|224660|                 June| Hagopian|     F|imogene.hagopian@...|656-36-0772|        

### Checking the total records in the PySpark Dataframe

In [7]:
print(f"Total records present: {df.count()}")

Total records present: 5000000


### Checking the data types

In [8]:
df.dtypes

[('Emp ID', 'string'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'string'),
 ('Salary', 'string'),
 ('Salary Hike', 'string')]

### Casting 'string' value to 'int'

In [9]:
df = df.withColumn("Emp ID", df["Emp ID"].cast("int"))
df = df.withColumn("Zip", df["Zip"].cast("int"))
df = df.withColumn("Salary", df["Salary"].cast("int"))
df = df.withColumn("Salary Hike", df["Salary Hike"].cast("int"))
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike', 'int')]

### Checking if the changes have been made successfully

In [10]:
df.show(20)

+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+---------+-------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|              County|State|   Region|         City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+---------+-------------+-----+------+-----------+
|742048|              January|   Mccoll|     F|lizeth.mccoll@ibm...|171-86-6830|               Stark|   OH|  Midwest|     Alliance|44601|147446|         14|
|671135|                April|     Hern|     F|argentina.hern@nt...|083-02-3078|District of Columbia|   DC|    South|   Washington|20411|129174|          8|
|965851|             December|  Patillo|     M|damian.patillo@ou...|326-11-9852|              Fresno|   CA|     West|       Burrel|93607|158746|          8|
|224660|                 June| Hagopian|     F|imogene.hag

### Checking the Schema

In [11]:
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Month Name of Joining: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E Mail: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Salary Hike: integer (nullable = true)



### Task 1: Count the number of employees in each County, Region and City

In [12]:
task11 = df.groupBy("County").agg(func.countDistinct("Emp ID").alias('No. of Employees Per County'))
task11.show(5)

+---------+---------------------------+
|   County|No. of Employees Per County|
+---------+---------------------------+
|Worcester|                      12921|
| Thurston|                       2868|
|    Tyler|                       1738|
|    Bucks|                       7659|
|  Palermo|                        155|
+---------+---------------------------+
only showing top 5 rows



In [13]:
task12 = df.groupBy("Region").agg(func.countDistinct("Emp ID").alias('No. of Employees Per Region'))
task12.show(5)

+---------+---------------------------+
|   Region|No. of Employees Per Region|
+---------+---------------------------+
|  Midwest|                     691814|
|    South|                     770939|
|     West|                     555368|
|Northeast|                     557073|
+---------+---------------------------+



In [14]:
task13 = df.groupBy("City").agg(func.countDistinct("Emp ID").alias('No. of Employees Per City'))
task13.show(5)

+-----------+-------------------------+
|       City|No. of Employees Per City|
+-----------+-------------------------+
|    Hanover|                     2311|
|      Tyler|                     1999|
|Piney Creek|                      110|
|   Bluffton|                      820|
| Prattville|                      487|
+-----------+-------------------------+
only showing top 5 rows



### Task 2: Generate employee summary

In [15]:
task2 = df.sort("Emp ID")
task2.show(5)

+------+---------------------+---------+------+--------------------+-----------+------+-----+-------+-----------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|County|State| Region|       City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+------+-----+-------+-----------+-----+------+-----------+
|111111|                  May|   Isbell|     M|riley.isbell@gmai...|066-02-8823|Sabine|   LA|  South|    Florien|71429|115049|         26|
|111111|                  May|Witkowski|     F|danuta.witkowski@...|506-57-4157|   Bay|   FL|  South|Panama City|32403|142276|         11|
|111111|             February|  Ribeiro|     F|ursula.ribeiro@ao...|504-45-4194|Ottawa|   MI|Midwest| Ferrysburg|49409| 68056|         11|
|111111|                March|     Hock|     F|karolyn.hock@hotm...|295-15-1492|  Wise|   TX|  South|    Slidell|76267|146377|         25|
|111111|              Octob

### Task 3: Generate employee summary and ordering by Gender and Salary

In [16]:
task3 = df.orderBy(["Gender", "Salary"])
task3.show(5)

+------+---------------------+-----------+------+--------------------+-----------+----------+-----+-------+------------+-----+------+-----------+
|Emp ID|Month Name of Joining|  Last Name|Gender|              E Mail|        SSN|    County|State| Region|        City|  Zip|Salary|Salary Hike|
+------+---------------------+-----------+------+--------------------+-----------+----------+-----+-------+------------+-----+------+-----------+
|571129|                March|    Nolting|     F|lakiesha.nolting@...|141-23-4908|   Larimer|   CO|   West|  Glen Haven|80532| 40000|         21|
|506893|                 June|Cumberbatch|     F|ariana.cumberbatc...|057-02-5798|Evangeline|   LA|  South|Saint Landry|71367| 40000|          4|
|631239|                  May|     Ellman|     F|tijuana.ellman@ho...|730-28-6351|      Hoke|   NC|  South|     Raeford|28376| 40000|         19|
|832197|              October|     Grosso|     F|vikki.grosso@hotm...|131-98-5564|   McHenry|   ND|Midwest|      Bantry|5871

### Task 4: Summerize the number of employee joined and hikes granted based on month

In [17]:
task41 = df.groupBy("Month Name of Joining").agg(func.countDistinct("Emp ID").alias('No. of Employees Joined in Particular Month'))
task41.show()

+---------------------+-------------------------------------------+
|Month Name of Joining|No. of Employees Joined in Particular Month|
+---------------------+-------------------------------------------+
|                 July|                                     351634|
|             November|                                     315549|
|             February|                                     309233|
|              January|                                     328953|
|                March|                                     335261|
|              October|                                     320530|
|                  May|                                     342019|
|               August|                                     361916|
|                April|                                     329291|
|                 June|                                     337582|
|             December|                                     325905|
|            September|                         

In [19]:
task42 = df.groupBy("Month Name of Joining").agg(func.countDistinct("Salary Hike").alias('No. of Hikes granted in Particular Month'))
task42.show()

+---------------------+----------------------------------------+
|Month Name of Joining|No. of Hikes granted in Particular Month|
+---------------------+----------------------------------------+
|                 July|                                      31|
|             November|                                      31|
|             February|                                      31|
|              January|                                      31|
|                March|                                      31|
|              October|                                      31|
|                  May|                                      31|
|               August|                                      31|
|                April|                                      31|
|                 June|                                      31|
|             December|                                      31|
|            September|                                      31|
+---------------------+--

### Task 5: Generate employee summary and ordering by Salary

In [20]:
task5 = df.orderBy(["Salary"])
task5.show(5)

+------+---------------------+-----------+------+--------------------+-----------+----------+-----+---------+------------+-----+------+-----------+
|Emp ID|Month Name of Joining|  Last Name|Gender|              E Mail|        SSN|    County|State|   Region|        City|  Zip|Salary|Salary Hike|
+------+---------------------+-----------+------+--------------------+-----------+----------+-----+---------+------------+-----+------+-----------+
|828071|             February|   Barrette|     F|madelaine.barrett...|545-99-1053|     Aiken|   SC|    South|       Aiken|29801| 40000|         26|
|336973|                April|       Deem|     M| kirk.deem@gmail.com|623-85-7102|  Ouachita|   AR|    South|      Louann|71751| 40000|         28|
|832197|              October|     Grosso|     F|vikki.grosso@hotm...|131-98-5564|   McHenry|   ND|  Midwest|      Bantry|58713| 40000|         14|
|680938|             November|   Hillyard|     M|rosario.hillyard@...|307-37-6816|  Somerset|   PA|Northeast|   

In [21]:
spark.stop()
print("SparkSession Stopped Successfully!")

SparkSession Stopped Successfully!
