In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Finding locally installed Spark and importing PySpark libraries

In [3]:
import findspark
findspark.init("/usr/local/spark/")
from pyspark.sql import SparkSession, functions as func

### Creating Spark Session

In [4]:
spark = SparkSession.builder.appName('task').getOrCreate()

### Reading CSV with Spark

In [5]:
df = spark.read.csv('/home/ubuntu/10000 Records.csv', header=True, inferSchema = True).fillna(0)["Emp ID", "Month Name of Joining", "Last Name", "Gender", "E Mail", "SSN", "County", "State", "Region", "City", "Zip", "Salary", func.regexp_replace(func.col("Last % Hike"), "%", "").alias("Salary Hike")]
df.show(5)

+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+-------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|      County|State|   Region|         City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+-------------+-----+------+-----------+
|198429|             February|Bumgarner|     F|serafina.bumgarne...|063-02-3609|  Chautauqua|   NY|Northeast|       Clymer|14724| 69294|         14|
|178566|                 June|     Rojo|     F|juliette.rojo@yah...|671-48-9915|  Montgomery|   PA|Northeast|     Glenside|19038|193912|         27|
|647173|              January| Krawczyk|     M|milan.krawczyk@ho...|527-99-6328|Anne Arundel|   MD|    South|Gibson Island|21056|123681|         11|
|847634|                  May|    Jason|     M|elmer.jason@yahoo...|063-02-5994|  Washington|   VA|    Sou

### Checking the total records in the PySpark Dataframe

In [6]:
print(f"Total records present: {df.count()}")

Total records present: 10000


### Checking the data types

In [7]:
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike', 'string')]

### Casting 'string' value to 'int'

In [8]:
df = (df.withColumn("Salary Hike", df["Salary Hike"].cast("int")))
df.dtypes

[('Emp ID', 'int'),
 ('Month Name of Joining', 'string'),
 ('Last Name', 'string'),
 ('Gender', 'string'),
 ('E Mail', 'string'),
 ('SSN', 'string'),
 ('County', 'string'),
 ('State', 'string'),
 ('Region', 'string'),
 ('City', 'string'),
 ('Zip', 'int'),
 ('Salary', 'int'),
 ('Salary Hike', 'int')]

### Checking if the changes have been made successfully

In [9]:
df.show()

+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+---------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|      County|State|   Region|           City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+------------+-----+---------+---------------+-----+------+-----------+
|198429|             February|Bumgarner|     F|serafina.bumgarne...|063-02-3609|  Chautauqua|   NY|Northeast|         Clymer|14724| 69294|         14|
|178566|                 June|     Rojo|     F|juliette.rojo@yah...|671-48-9915|  Montgomery|   PA|Northeast|       Glenside|19038|193912|         27|
|647173|              January| Krawczyk|     M|milan.krawczyk@ho...|527-99-6328|Anne Arundel|   MD|    South|  Gibson Island|21056|123681|         11|
|847634|                  May|    Jason|     M|elmer.jason@yahoo...|063-02-5994|  Washington| 

### Checking the Schema

In [11]:
df.printSchema()

root
 |-- Emp ID: integer (nullable = true)
 |-- Month Name of Joining: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- E Mail: string (nullable = true)
 |-- SSN: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip: integer (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Salary Hike: integer (nullable = true)



### Task 1: Count the number of employees in each County, Region and City

In [12]:
task11 = df.groupBy("County").agg(func.countDistinct("Emp ID").alias('No. of Employees Per County'))
task11.show(5)

+--------------------+---------------------------+
|              County|No. of Employees Per County|
+--------------------+---------------------------+
|               Bucks|                         14|
|           Worcester|                         21|
|            Thurston|                          3|
|              Grimes|                          2|
|Skagway-Hoonah-An...|                          2|
+--------------------+---------------------------+
only showing top 5 rows



In [13]:
task12 = df.groupBy("Region").agg(func.countDistinct("Emp ID").alias('No. of Employees Per Region'))
task12.show(5)

+---------+---------------------------+
|   Region|No. of Employees Per Region|
+---------+---------------------------+
|  Midwest|                       2819|
|    South|                       3647|
|     West|                       1747|
|Northeast|                       1773|
+---------+---------------------------+



In [14]:
task13 = df.groupBy("City").agg(func.countDistinct("Emp ID").alias('No. of Employees Per City'))
task13.show(5)

+---------+-------------------------+
|     City|No. of Employees Per City|
+---------+-------------------------+
|   Scuddy|                        1|
|  Maxbass|                        2|
| Fredonia|                        3|
|    Tyler|                        1|
|Fairbanks|                        5|
+---------+-------------------------+
only showing top 5 rows



### Task 2: Generate employee summary

In [15]:
task2 = df.sort("Emp ID")
task2.show(5)

+------+---------------------+---------+------+--------------------+-----------+----------+-----+---------+--------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|    County|State|   Region|          City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+----------+-----+---------+--------------+-----+------+-----------+
|111282|             February|  Boykins|     M|claude.boykins@ho...|316-35-9848| Dinwiddie|   VA|    South|     Mc Kenney|23872|118158|         26|
|111415|                March|Sebastian|     M|gene.sebastian@ya...|294-15-0140|   Houston|   GA|    South| Warner Robins|31099|139501|         20|
|111498|                  May|  Dickman|     F|wan.dickman@walma...|704-18-5219|    Benton|   IA|  Midwest|  Mount Auburn|52313|173022|         21|
|111730|             December|     Buie|     F|nickie.buie@gmail...|607-87-9903|   Carroll|   NH|Northeast|East 

### Task 3: Generate employee summary and ordering by Gender and Salary

In [16]:
task3 = df.orderBy(["Gender", "Salary"])
task3.show(5)

+------+---------------------+----------+------+--------------------+-----------+-----------+-----+-------+-----------+-----+------+-----------+
|Emp ID|Month Name of Joining| Last Name|Gender|              E Mail|        SSN|     County|State| Region|       City|  Zip|Salary|Salary Hike|
+------+---------------------+----------+------+--------------------+-----------+-----------+-----+-------+-----------+-----+------+-----------+
|338798|                  May|    Conley|     F|jodi.conley@chart...|084-02-6421|Los Angeles|   CA|   West|   Glendale|91210| 40009|          1|
|644604|               August|   Brazell|     F|lorina.brazell@nt...|308-37-1932|      Lucas|   OH|Midwest|     Toledo|43635| 40015|         12|
|946170|             November|Lafontaine|     F|anja.lafontaine@y...|116-98-7081|    Henrico|   VA|  South|   Sandston|23150| 40049|         21|
|893122|                 June|     Raber|     F|sharan.raber@gmai...|480-41-0592|  Baltimore|   MD|  South|Hunt Valley|21031| 4005

### Task 4: Summerize the number of employee joined and hikes granted based on month

In [17]:
task41 = df.groupBy("Month Name of Joining").agg(func.countDistinct("Emp ID").alias('No. of Employees Joined in Particular Month'))
task41.show()

+---------------------+-------------------------------------------+
|Month Name of Joining|No. of Employees Joined in Particular Month|
+---------------------+-------------------------------------------+
|                 July|                                        901|
|             November|                                        787|
|             February|                                        782|
|              January|                                        836|
|                March|                                        867|
|              October|                                        811|
|                  May|                                        846|
|               August|                                        808|
|                April|                                        851|
|                 June|                                        894|
|             December|                                        826|
|            September|                         

In [18]:
task42 = df.groupBy("Month Name of Joining").agg(func.countDistinct("Salary Hike").alias('No. of Hikes granted in Particular Month'))
task42.show()

+---------------------+----------------------------------------+
|Month Name of Joining|No. of Hikes granted in Particular Month|
+---------------------+----------------------------------------+
|                 July|                                      31|
|             November|                                      31|
|             February|                                      31|
|              January|                                      31|
|                March|                                      31|
|              October|                                      31|
|                  May|                                      31|
|               August|                                      31|
|                April|                                      31|
|                 June|                                      31|
|             December|                                      31|
|            September|                                      31|
+---------------------+--

### Task 5: Generate employee summary and ordering by Salary

In [19]:
task5 = df.orderBy(["Salary"])
task5.show(5)

+------+---------------------+---------+------+--------------------+-----------+-------------+-----+---------+-------------+-----+------+-----------+
|Emp ID|Month Name of Joining|Last Name|Gender|              E Mail|        SSN|       County|State|   Region|         City|  Zip|Salary|Salary Hike|
+------+---------------------+---------+------+--------------------+-----------+-------------+-----+---------+-------------+-----+------+-----------+
|373347|             December|   Shaner|     M|royce.shaner@hotm...|056-02-4379|East Freetown|   NY|Northeast|East Freetown|13055| 40007|         24|
|449798|             February|  Cleaves|     M|ben.cleaves@bells...|369-39-2187|     Montrose|   CO|     West|        Nucla|81424| 40009|         19|
|338798|                  May|   Conley|     F|jodi.conley@chart...|084-02-6421|  Los Angeles|   CA|     West|     Glendale|91210| 40009|          1|
|684725|            September|  Valerio|     M|peter.valerio@aol...|052-02-4357|        Burke|   GA|

In [20]:
spark.stop()
print("SparkSession Stopped Successfully!")

SparkSession Stopped Successfully!
