In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import col, lit


In [3]:
# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkExamples.com")
    .getOrCreate()
)

In [8]:
# Prerequisite: create the dataset in Azure, start your ADF directory, and save
# the data locally at the path used below.
#
# inferSchema lets Spark type the numeric columns (age, salary, ...) instead of
# reading every column as a string, so later ORDER BY / MAX / RANK on salary
# compare numerically rather than lexicographically.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("file:///D:/data1/personal_data.csv")
)

In [9]:
# Preview the loaded data: first 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
|  1|    James|        Kc|   Smith| 51|        10|      .Net|   Newyork|           1|1991-04-01 00:00:...|     M|  3000|
|  2|  Michael|      Rose|    None| 45|        11|      Ruby|   Newyork|           2|2000-05-19 00:00:...|     M|  4000|
|  3|   Robert|      None|Williams| 32|        10|     Scala|California|           1|1978-09-05 00:00:...|     M|  4000|
|  4|    Maria|      Anne|   Jones| 36|         9|      Java| Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|
|  5|      Jen|      Mary|   Brown| 39|         5|     Scala|    Nagpur|           5|1980-02-17 00:00:...|     F|     0|
|  6|Prabhakar|         B|      

### Register the DataFrame as a temporary SQL view

In [10]:
# Expose the DataFrame to Spark SQL under the view name used by every query below.
df.createOrReplaceTempView(name="personal_data")

In [11]:
# Read every row back out of the temporary view through Spark SQL.
select_df = spark.sql(sqlQuery="select * from personal_data")

In [12]:
# Print the SQL result: first 20 rows, long cell values truncated.
select_df.show(n=20, truncate=True)

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
|  1|    James|        Kc|   Smith| 51|        10|      .Net|   Newyork|           1|1991-04-01 00:00:...|     M|  3000|
|  2|  Michael|      Rose|    None| 45|        11|      Ruby|   Newyork|           2|2000-05-19 00:00:...|     M|  4000|
|  3|   Robert|      None|Williams| 32|        10|     Scala|California|           1|1978-09-05 00:00:...|     M|  4000|
|  4|    Maria|      Anne|   Jones| 36|         9|      Java| Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|
|  5|      Jen|      Mary|   Brown| 39|         5|     Scala|    Nagpur|           5|1980-02-17 00:00:...|     F|     0|
|  6|Prabhakar|         B|      

### Using group by on departmentID, city and experience


In [16]:
# Count rows per (departmentID, city, experiance) combination, with cities in
# descending alphabetical order.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
groupBY_df = spark.sql(
    "select departmentID, city, experiance, count(*) from personal_data group by departmentID, city,experiance order by city desc"
)
groupBY_df.show()

+------------+----------+----------+--------+
|departmentID|      city|experiance|count(1)|
+------------+----------+----------+--------+
|           2|      Pune|        11|       1|
|           1|      Pune|         9|       1|
|           5|      Pune|        10|       1|
|           6|      None|        12|       1|
|           1|   Newyork|        10|       1|
|           2|   Newyork|        11|       1|
|           4|   Nellore|         2|       1|
|           5|    Nagpur|         5|       1|
|           3| Hyderabad|        13|       1|
|           4| Hyderabad|         9|       1|
|           6| Hyderabad|        14|       1|
|           1|California|        10|       1|
+------------+----------+----------+--------+



### Using group by on city column


In [20]:
# Count rows per city, most frequent city first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
city_groupBY_df = spark.sql(
    "select city, count(*) from personal_data group by city order by count(1) desc"
)
city_groupBY_df.show()

+----------+--------+
|      city|count(1)|
+----------+--------+
|      Pune|       3|
| Hyderabad|       3|
|   Newyork|       2|
|      None|       1|
|   Nellore|       1|
|    Nagpur|       1|
|California|       1|
+----------+--------+



### Grouping the data on the basis of skills

In [22]:
# Count rows per skill, least common skill first.
# NOTE(review): the variable is called city_df although the query groups by
# skill; the name is kept so any later reference keeps working.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
city_df = spark.sql(
    "select skill, count(*) from personal_data group by skill order by count(1) asc"
)
city_df.show()

+----------+--------+
|     skill|count(1)|
+----------+--------+
|JavaScript|       1|
|    Python|       1|
|         R|       1|
|      .Net|       1|
|      None|       1|
|      Ruby|       2|
|      Java|       2|
|     Scala|       3|
+----------+--------+



### Using wildcard in SparkSQL


In [27]:
# Select rows whose firstname contains a lowercase "b" anywhere
# (LIKE matching is case-sensitive).
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
wildcard_p_df = spark.sql(
    "select * from personal_data where firstname LIKE '%b%'"
)
wildcard_p_df.show()

+---+---------+----------+--------+---+----------+------+----------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance| skill|      city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+------+----------+------------+--------------------+------+------+
|  3|   Robert|      None|Williams| 32|        10| Scala|California|           1|1978-09-05 00:00:...|     M|  4000|
|  6|Prabhakar|         B|       G| 33|        11|Python|      Pune|           2|1967-12-01 00:00:...|     M|  5000|
| 11|      Bob|         D|    None| 43|        14|     R| Hyderabad|           6|1967-12-01 00:00:...|     M|  2500|
+---+---------+----------+--------+---+----------+------+----------+------------+--------------------+------+------+



### Using the filter condition 

In [30]:
# Keep only rows with gender 'M', highest salary first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
filter_df = spark.sql(
    "select * from personal_data where gender == 'M' order by salary desc"
)
filter_df.show()

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
|  7|  Praveen|         B|       G| 21|        13|      Java| Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|
|  8|   Rajesh|         B|       G| 25|         2|     Scala|   Nellore|           4|1967-12-01 00:00:...|     M|  5100|
|  6|Prabhakar|         B|       G| 33|        11|    Python|      Pune|           2|1967-12-01 00:00:...|     M|  5000|
|  9|  Pramodh|         B|       G| 49|         9|      Ruby|      Pune|           1|1967-12-01 00:00:...|     M|  5000|
|  2|  Michael|      Rose|    None| 45|        11|      Ruby|   Newyork|           2|2000-05-19 00:00:...|     M|  4000|
|  3|   Robert|      None|Willia

In [31]:
# Keep every row whose gender is not 'M', highest salary first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
filter1_df = spark.sql(
    "select * from personal_data where gender != 'M' order by salary desc"
)
filter1_df.show()

+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|skill|     city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+
|  4|    Maria|      Anne|   Jones| 36|         9| Java|Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|
|  5|      Jen|      Mary|   Brown| 39|         5|Scala|   Nagpur|           5|1980-02-17 00:00:...|     F|     0|
+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+



In [32]:
# Keep every row whose gender is not 'F', highest salary first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
filter1_df = spark.sql(
    "select * from personal_data where gender != 'F' order by salary desc"
)
filter1_df.show()

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+
|  7|  Praveen|         B|       G| 21|        13|      Java| Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|
|  8|   Rajesh|         B|       G| 25|         2|     Scala|   Nellore|           4|1967-12-01 00:00:...|     M|  5100|
|  6|Prabhakar|         B|       G| 33|        11|    Python|      Pune|           2|1967-12-01 00:00:...|     M|  5000|
|  9|  Pramodh|         B|       G| 49|         9|      Ruby|      Pune|           1|1967-12-01 00:00:...|     M|  5000|
|  2|  Michael|      Rose|    None| 45|        11|      Ruby|   Newyork|           2|2000-05-19 00:00:...|     M|  4000|
|  3|   Robert|      None|Willia

In [33]:
# Keep only rows with gender 'F', highest salary first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
filter1_df = spark.sql(
    "select * from personal_data where gender == 'F' order by salary desc"
)
filter1_df.show()

+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance|skill|     city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+
|  4|    Maria|      Anne|   Jones| 36|         9| Java|Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|
|  5|      Jen|      Mary|   Brown| 39|         5|Scala|   Nagpur|           5|1980-02-17 00:00:...|     F|     0|
+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+



In [35]:
# Keep only rows whose salary is strictly greater than 4000.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
filter2_df = spark.sql("select * from personal_data where salary > 4000")
filter2_df.show()

+---+---------+----------+--------+---+----------+------+---------+------------+--------------------+------+------+
| SN|firstname|middlename|lastname|age|experiance| skill|     city|departmentID|                 DOB|gender|salary|
+---+---------+----------+--------+---+----------+------+---------+------------+--------------------+------+------+
|  4|    Maria|      Anne|   Jones| 36|         9|  Java|Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|
|  6|Prabhakar|         B|       G| 33|        11|Python|     Pune|           2|1967-12-01 00:00:...|     M|  5000|
|  7|  Praveen|         B|       G| 21|        13|  Java|Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|
|  8|   Rajesh|         B|       G| 25|         2| Scala|  Nellore|           4|1967-12-01 00:00:...|     M|  5100|
|  9|  Pramodh|         B|       G| 49|         9|  Ruby|     Pune|           1|1967-12-01 00:00:...|     M|  5000|
+---+---------+----------+--------+---+----------+------+---------+-----

### Using rank function 


In [61]:
# Rank the rows of each department by salary (highest first) in SQL, then keep
# ranks 1 and 2 on the DataFrame side. RANK() gives tied salaries the same
# rank, so a department may contribute more than two rows.
rank_df = spark.sql("""
    SELECT *,
           RANK() OVER (PARTITION BY departmentID ORDER BY salary DESC) AS rnk
    FROM personal_data
""").where("rnk < 3")
rank_df.show()

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+---+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|rnk|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+---+
|  7|  Praveen|         B|       G| 21|        13|      Java| Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|  1|
| 10|     Ajay|      None|    None| 50|        10|      None|      Pune|           5|1967-12-01 00:00:...|     M|  2500|  1|
|  5|      Jen|      Mary|   Brown| 39|         5|     Scala|    Nagpur|           5|1980-02-17 00:00:...|     F|     0|  2|
| 11|      Bob|         D|    None| 43|        14|         R| Hyderabad|           6|1967-12-01 00:00:...|     M|  2500|  1|
| 12|    Chris|         B|   Smith| 47|        12|JavaScript|      None|           6|1967-12-01 00:00:...|     M|  2500|  1|


In [67]:
# Same selection of ranks 1 and 2 per department, done entirely in SQL: the
# window function runs in a subquery so the outer WHERE can filter on rnk.
top_ranked_sql = """
    SELECT *
    FROM (
        SELECT *,
               RANK() OVER (PARTITION BY departmentID ORDER BY salary DESC) AS rnk
        FROM personal_data
    ) x
    WHERE x.rnk < 3
"""
rank_df = spark.sql(top_ranked_sql)
rank_df.show()

+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+---+
| SN|firstname|middlename|lastname|age|experiance|     skill|      city|departmentID|                 DOB|gender|salary|rnk|
+---+---------+----------+--------+---+----------+----------+----------+------------+--------------------+------+------+---+
|  7|  Praveen|         B|       G| 21|        13|      Java| Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|  1|
| 10|     Ajay|      None|    None| 50|        10|      None|      Pune|           5|1967-12-01 00:00:...|     M|  2500|  1|
|  5|      Jen|      Mary|   Brown| 39|         5|     Scala|    Nagpur|           5|1980-02-17 00:00:...|     F|     0|  2|
| 11|      Bob|         D|    None| 43|        14|         R| Hyderabad|           6|1967-12-01 00:00:...|     M|  2500|  1|
| 12|    Chris|         B|   Smith| 47|        12|JavaScript|      None|           6|1967-12-01 00:00:...|     M|  2500|  1|


### Using Dense Rank

In [69]:
# Keep the top-salary row(s) of each department: DENSE_RANK() = 1 within the
# department when ordered by salary descending (tied rows are all kept).
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
dense_rnk_df = spark.sql("""
    SELECT * FROM (
        select *,
        DENSE_RANK() over(partition by departmentID order by salary desc) as dnk
        from personal_data
        )x
        where x.dnk =1
""")
dense_rnk_df.show()

+---+---------+----------+--------+---+----------+----------+---------+------------+--------------------+------+------+---+
| SN|firstname|middlename|lastname|age|experiance|     skill|     city|departmentID|                 DOB|gender|salary|dnk|
+---+---------+----------+--------+---+----------+----------+---------+------------+--------------------+------+------+---+
|  7|  Praveen|         B|       G| 21|        13|      Java|Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|  1|
| 10|     Ajay|      None|    None| 50|        10|      None|     Pune|           5|1967-12-01 00:00:...|     M|  2500|  1|
| 11|      Bob|         D|    None| 43|        14|         R|Hyderabad|           6|1967-12-01 00:00:...|     M|  2500|  1|
| 12|    Chris|         B|   Smith| 47|        12|JavaScript|     None|           6|1967-12-01 00:00:...|     M|  2500|  1|
|  9|  Pramodh|         B|       G| 49|         9|      Ruby|     Pune|           1|1967-12-01 00:00:...|     M|  5000|  1|
|  8|   

### Using ROW_NUMBER()

In [72]:
# Keep exactly one row per department: the first row when ordered by salary
# ascending. ROW_NUMBER() never repeats a number inside a partition, so when
# salaries tie, which of the tied rows gets rn = 1 is not determined by the query.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
rownm_df = spark.sql("""
    SELECT * FROM (SELECT *,
        ROW_NUMBER() OVER(partition by departmentID order by salary asc) as rn
        FROM personal_data)x
    where x.rn =1
""")
rownm_df.show()

+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+---+
| SN|firstname|middlename|lastname|age|experiance|skill|     city|departmentID|                 DOB|gender|salary| rn|
+---+---------+----------+--------+---+----------+-----+---------+------------+--------------------+------+------+---+
|  7|  Praveen|         B|       G| 21|        13| Java|Hyderabad|           3|1967-12-01 00:00:...|     M|  6500|  1|
|  5|      Jen|      Mary|   Brown| 39|         5|Scala|   Nagpur|           5|1980-02-17 00:00:...|     F|     0|  1|
| 11|      Bob|         D|    None| 43|        14|    R|Hyderabad|           6|1967-12-01 00:00:...|     M|  2500|  1|
|  1|    James|        Kc|   Smith| 51|        10| .Net|  Newyork|           1|1991-04-01 00:00:...|     M|  3000|  1|
|  4|    Maria|      Anne|   Jones| 36|         9| Java|Hyderabad|           4|1967-12-01 00:00:...|     F|  5000|  1|
|  2|  Michael|      Rose|    None| 45|        1

### Using max function 


In [91]:
# Highest salary in each department, largest maximum first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
max_df = spark.sql("""
    SELECT departmentID, max(salary) from personal_data group by departmentID order by max(salary) desc
""")
max_df.show()


+------------+-----------+
|departmentID|max(salary)|
+------------+-----------+
|           3|       6500|
|           4|       5100|
|           1|       5000|
|           2|       5000|
|           5|       2500|
|           6|       2500|
+------------+-----------+



### Average salary per department

In [95]:
# Average salary in each department, highest average first.
# show() only prints and returns None, so it is called separately to keep the
# DataFrame (not None) bound to the variable.
agg_df = spark.sql("""
    SELECT departmentID, AVG(salary) AS avg_salary
    FROM personal_data
    GROUP BY departmentID
    ORDER BY avg_salary desc
""")
agg_df.show()


+------------+----------+
|departmentID|avg_salary|
+------------+----------+
|           3|    6500.0|
|           4|    5050.0|
|           2|    4500.0|
|           1|    4000.0|
|           6|    2500.0|
|           5|    1250.0|
+------------+----------+

