In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Finding locally installed Spark and importing PySpark libraries

In [1]:
import findspark
findspark.init("/usr/local/spark/")
from pyspark.sql import SparkSession, functions as func

### Creating Spark Session

In [2]:
spark = SparkSession.builder.appName('task').getOrCreate()

### Reading 5 million data CSV with Spark by loading the file

In [3]:
df = spark.read.format("csv").option("header","true").load("/home/ubuntu/Hr5m.csv").fillna(0)["Emp ID", "Month Name of Joining", "Last Name", "Gender", "E Mail", "SSN", "County", "State", "Region", "City", "Zip", "Salary", func.regexp_replace(func.col("Last % Hike"), "%", "").alias("Salary Hike in %")]
df.show(5)

+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|              County|State| Region|      City|  Zip|Salary|Salary Hike in %|
+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+-------+----------+-----+------+----------------+
|742048|              January|   Mccoll|     F|lizeth.mccoll@ibm...|171-86-6830|               Stark|   OH|Midwest|  Alliance|44601|147446|              14|
|671135|                April|     Hern|     F|argentina.hern@nt...|083-02-3078|District of Columbia|   DC|  South|Washington|20411|129174|               8|
|965851|             December|  Patillo|     M|damian.patillo@ou...|326-11-9852|              Fresno|   CA|   West|    Burrel|93607|158746|               8|
|224660|                 June| Hagopian|     F|imogene.hag

### Checking the total records in the PySpark Dataframe

In [4]:
print(f"Total records present: {df.count()}")

Total records present: 5000000


### Checking the data types

In [5]:
df.dtypes

[('Emp ID', 'string'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'string'),
 ('Salary', 'string'),
 ('Salary Hike in %', 'string')]

### Casting 'string' value to 'int'

In [6]:
df = df.withColumn("Emp ID", df["Emp ID"].cast("int"))
df = df.withColumn("Zip", df["Zip"].cast("int"))
df = df.withColumn("Salary", df["Salary"].cast("int"))
df = df.withColumn("Salary Hike in %", df["Salary Hike in %"].cast("int"))
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike in %', 'int')]

### Checking if the changes have been made successfully

In [7]:
df.show(20)

+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+---------+-------------+-----+------+----------------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|              County|State|   Region|         City|  Zip|Salary|Salary Hike in %|
+------+---------------------+---------+------+--------------------+-----------+--------------------+-----+---------+-------------+-----+------+----------------+
|742048|              January|   Mccoll|     F|lizeth.mccoll@ibm...|171-86-6830|               Stark|   OH|  Midwest|     Alliance|44601|147446|              14|
|671135|                April|     Hern|     F|argentina.hern@nt...|083-02-3078|District of Columbia|   DC|    South|   Washington|20411|129174|               8|
|965851|             December|  Patillo|     M|damian.patillo@ou...|326-11-9852|              Fresno|   CA|     West|       Burrel|93607|158746|               8|
|224660|                 Jun

### Checking the Schema

In [8]:
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Month Name of Joining: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E Mail: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Salary Hike in %: integer (nullable = true)



### Task 1: Count the number of employees in each County, Region and City

In [13]:
task11 = df.groupBy("County").count().sort("County")
task11.show(2753)

+--------------------+-----+
|              County|count|
+--------------------+-----+
|               Aaron|   90|
|           Abbeville|  509|
|              Acadia| 1373|
|            Accomack| 4611|
|                 Ada| 5104|
|               Adair| 2673|
|               Adams|12997|
|             Addison| 2063|
|        Agoura Hills|   81|
|               Aiken| 2789|
|              Aitkin|  918|
|               Akron|  261|
|             Alachua| 3557|
|            Alamance| 1507|
|             Alameda| 8132|
|             Alamosa|  474|
|              Albany|10296|
|           Albemarle| 2019|
|              Alcona| 1054|
|              Alcorn|  540|
|            Alderson|  117|
|               Aldie|  118|
|      Aleutians East|  626|
| Aleutians West (CA)|  790|
|           Alexander| 1203|
|   Alexandria (city)| 1541|
|             Alfalfa|  924|
|              Alfred|  114|
|               Alger| 1145|
|               Alice|  151|
|           Allamakee|  822|
|             

In [14]:
task111 = df.agg(func.countDistinct("County").alias("No. of Counties"))
task111.show()

+---------------+
|No. of Counties|
+---------------+
|           2752|
+---------------+



In [18]:
task12 = df.groupBy("Region").count().sort("Region")
task12.show()

+---------+-------+
|   Region|  count|
+---------+-------+
|  Midwest|1372439|
|Northeast| 886984|
|    South|1855656|
|     West| 884921|
+---------+-------+



In [19]:
task121 = df.agg(func.countDistinct("Region").alias("No. of Regions"))
task121.show()

+--------------+
|No. of Regions|
+--------------+
|             4|
+--------------+



In [21]:
task13 = df.groupBy("City").count().sort("City")
task13.show(19206)

+--------------------+-----+
|                City|count|
+--------------------+-----+
|               Aaron|   90|
|          Aaronsburg|   96|
|           Abbeville|  681|
|               Abbot|  139|
|          Abbotsford|  118|
|              Abbott|   92|
|         Abbottstown|  114|
|           Abbyville|   95|
|               Abell|   96|
|         Abercrombie|  120|
|            Aberdeen|  990|
|Aberdeen Proving ...|  144|
|            Abernant|  128|
|           Abernathy|  107|
|                Abie|  133|
|             Abilene| 1307|
|            Abingdon|  597|
|            Abington|  391|
|             Abiquiu|  149|
|       Abita Springs|  120|
|              Abrams|  147|
|            Absaraka|  131|
|           Absarokee|   87|
|             Absecon|  255|
|              Acampo|  111|
|            Accident|  143|
|            Accokeek|  109|
|             Accomac|  106|
|              Accord|  242|
|           Accoville|  115|
|                 Ace|  112|
|             

In [20]:
task131 = df.agg(func.countDistinct("City").alias("No. of Cities"))
task131.show()

+-------------+
|No. of Cities|
+-------------+
|        19206|
+-------------+



### Task 2: Generate employee summary

In [22]:
task2 = df.sort("Emp ID")
task2.show(5)

+------+---------------------+---------+------+--------------------+-----------+------+-----+-------+-----------+-----+------+----------------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|County|State| Region|       City|  Zip|Salary|Salary Hike in %|
+------+---------------------+---------+------+--------------------+-----------+------+-----+-------+-----------+-----+------+----------------+
|111111|                  May|   Isbell|     M|riley.isbell@gmai...|066-02-8823|Sabine|   LA|  South|    Florien|71429|115049|              26|
|111111|                  May|Witkowski|     F|danuta.witkowski@...|506-57-4157|   Bay|   FL|  South|Panama City|32403|142276|              11|
|111111|             February|  Ribeiro|     F|ursula.ribeiro@ao...|504-45-4194|Ottawa|   MI|Midwest| Ferrysburg|49409| 68056|              11|
|111111|              October| Lamberth|     F|delois.lamberth@h...|193-86-6147|Martin|   NC|  South|Williamston|27892|146377|          

### Task 3: Generate employee summary and ordering by Gender and Salary

In [23]:
task3 = df.orderBy(["Gender", "Salary"])
task3.show(5)

+------+---------------------+-----------+------+--------------------+-----------+----------+-----+-------+------------+-----+------+----------------+
|Emp ID|Month Name of Joining|  Last Name|Gender|              E Mail|        SSN|    County|State| Region|        City|  Zip|Salary|Salary Hike in %|
+------+---------------------+-----------+------+--------------------+-----------+----------+-----+-------+------------+-----+------+----------------+
|571129|                March|    Nolting|     F|lakiesha.nolting@...|141-23-4908|   Larimer|   CO|   West|  Glen Haven|80532| 40000|              21|
|828071|             February|   Barrette|     F|madelaine.barrett...|545-99-1053|     Aiken|   SC|  South|       Aiken|29801| 40000|              26|
|631239|                  May|     Ellman|     F|tijuana.ellman@ho...|730-28-6351|      Hoke|   NC|  South|     Raeford|28376| 40000|              19|
|506893|                 June|Cumberbatch|     F|ariana.cumberbatc...|057-02-5798|Evangeline| 

### Task 4: Summerize the number of employee joined and hikes granted based on month

In [24]:
task41 = df.groupBy("Month Name of Joining").count()
task41.show()

+---------------------+------+
|Month Name of Joining| count|
+---------------------+------+
|                 July|450981|
|             November|392382|
|             February|382263|
|              January|413247|
|                March|422987|
|              October|400048|
|                  May|434522|
|               August|468295|
|                April|414484|
|                 June|427424|
|             December|408721|
|            September|384646|
+---------------------+------+



In [25]:
task42 = df.groupBy("Month Name of Joining", "Salary Hike in %").count().sort("Month Name of Joining")
task42.show(372)

+---------------------+----------------+-----+
|Month Name of Joining|Salary Hike in %|count|
+---------------------+----------------+-----+
|                April|               1|13439|
|                April|              10|13291|
|                April|              27|13303|
|                April|              16|13210|
|                April|              13|13471|
|                April|              11|13292|
|                April|               5|13213|
|                April|              23|13585|
|                April|              20|13357|
|                April|               0|13417|
|                April|              30|13367|
|                April|               3|13467|
|                April|              14|13374|
|                April|               4|13286|
|                April|               2|13326|
|                April|               7|13357|
|                April|              28|13531|
|                April|              25|13485|
|            

In [29]:
task131 = df.agg(func.countDistinct("Month Name of Joining","Salary Hike in %").alias("No. of Hikes"))
task131.show()

+------------+
|No. of Hikes|
+------------+
|         372|
+------------+



### Task 5: Generate employee summary and ordering by Salary

In [30]:
task5 = df.orderBy(["Salary"])
task5.show(5)

+------+---------------------+-----------+------+--------------------+-----------+----------+-----+---------+------------+-----+------+----------------+
|Emp ID|Month Name of Joining|  Last Name|Gender|              E Mail|        SSN|    County|State|   Region|        City|  Zip|Salary|Salary Hike in %|
+------+---------------------+-----------+------+--------------------+-----------+----------+-----+---------+------------+-----+------+----------------+
|336973|                April|       Deem|     M| kirk.deem@gmail.com|623-85-7102|  Ouachita|   AR|    South|      Louann|71751| 40000|              28|
|680938|             November|   Hillyard|     M|rosario.hillyard@...|307-37-6816|  Somerset|   PA|Northeast|   Tire Hill|15959| 40000|              30|
|506893|                 June|Cumberbatch|     F|ariana.cumberbatc...|057-02-5798|Evangeline|   LA|    South|Saint Landry|71367| 40000|               4|
|832197|              October|     Grosso|     F|vikki.grosso@hotm...|131-98-5564|

In [31]:
spark.stop()
print("SparkSession Stopped Successfully!")

SparkSession Stopped Successfully!
