In [1]:
# Display `sc`. It is not defined in this notebook; presumably it is the
# SparkContext pre-created by the PySpark kernel — TODO confirm.
sc

In [2]:
# Display `spark`, the session object used for every read and SQL call below.
# It is not defined in this notebook; presumably the kernel provides it — TODO confirm.
spark

In [4]:
# Load People.json from the local filesystem without passing a schema,
# so the column types are whatever Spark infers from the data.
people_df = spark.read.json('file:///home/hadoop/Downloads/People.json')

In [5]:
# Preview the first 5 rows of the freshly loaded DataFrame.
people_df.show(5)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|   Iberia|       Peru|    Jarrid|  Male|  4| Dalziell|170398|
| La Ronge|     Canada| Reinaldos|  Male|  5|   Keeffe|440989|
+---------+-----------+----------+------+---+---------+------+
only showing top 5 rows



In [6]:
# Inspect the column names and types that were inferred from the JSON file.
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



#### 1. Create a user-defined schema for fields of DataFrame

In [7]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField

In [13]:
# Explicit schema for People.json, described as (column name, Spark type) pairs.
# The list order defines the column order of the resulting DataFrame.
people_fields = [
    ('id', IntegerType()),
    ('first_name', StringType()),
    ('last_name', StringType()),
    ('gender', StringType()),
    ('salary', FloatType()),
    ('city', StringType()),
    ('country', StringType()),
]

# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in people_fields
])

In [14]:
# Re-read the same file, this time applying the explicit `schema` instead of
# relying on inference. This rebinds `people_df`, replacing the earlier DataFrame.
people_df = spark.read.schema(schema).json('file:///home/hadoop/Downloads/People.json')

In [15]:
# Confirm the DataFrame now has the column order and types declared in `schema`.
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [16]:
# Preview the rows as parsed with the explicit schema.
people_df.show()

+---+----------+---------+------+--------+------------------+------------+
| id|first_name|last_name|gender|  salary|              city|     country|
+---+----------+---------+------+--------+------------------+------------+
|  1|     Valma|     Sans|Female|983107.0|         Mulyosari|   Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173.0|           Niihama|       Japan|
|  3|    Miltie| De Zuani|  Male|352898.0|         Dū Qal‘ah| Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398.0|            Iberia|        Peru|
|  5| Reinaldos|   Keeffe|  Male|440989.0|          La Ronge|      Canada|
|  6|        Eb|Schwanden|  Male|274126.0|      Kuala Lumpur|    Malaysia|
|  7|    Alleyn|   Paddon|  Male|681914.0|         Al Qurayn|Saudi Arabia|
|  8|   Baryram|     Yell|  Male|250748.0|           Jixiang|       China|
|  9|     Cammy|     Axel|Female|221750.0|Thị Trấn Phong Thổ|     Vietnam|
| 10|       Erl|  Caldera|  Male|680801.0|        Kotatengah|   Indonesia|
| 11|    Miguel|   Moules

In [19]:
# multiLine is set to True because the records in this JSON file are laid out
# with one key:value pair per line (similar to a record in MongoDB), which
# would otherwise create problems when reading the file.
bank_data = spark.read.json('file:///home/hadoop/Downloads/bank_edited.json', multiLine=True)
# Preview the loaded bank dataset.
bank_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [20]:
# Inspect the column names and types of the bank dataset (no schema was
# supplied on read, so these are the inferred types).
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



In [None]:
# Typecasting columns.
# withColumn is used for any column-wise transformation. It returns a NEW
# DataFrame rather than modifying the one it is called on, so the result must
# be captured; calling it without assigning the result has no lasting effect.
#
# The casts are chained and stored under a new name so that `bank_data` keeps
# its original types for the cells that follow.
bank_data_typed = (
    bank_data
    .withColumn('age', col('age').cast(IntegerType()))
    .withColumn('balance', col('balance').cast(FloatType()))
    # Cast 'campaign' from its own column (not from 'balance').
    .withColumn('campaign', col('campaign').cast(IntegerType()))
    # cast() requires a target type; 'contact' holds text, so StringType is used.
    .withColumn('contact', col('contact').cast(StringType()))
)

# Verify the new column types.
bank_data_typed.printSchema()

### 3. Creating a new column

In [23]:
# Second withColumn example: derive one new column from two existing columns.
from pyspark.sql.functions import concat

# Expression joining first and last name with a single space between them.
full_name_expr = concat(col('first_name'), lit(" "), col('last_name'))

# The result is only displayed here; to keep the new column, the returned
# DataFrame has to be assigned back (e.g. to people_df).
people_df.withColumn('Full_name', full_name_expr).show()

+---+----------+---------+------+--------+------------------+------------+-----------------+
| id|first_name|last_name|gender|  salary|              city|     country|        Full_name|
+---+----------+---------+------+--------+------------------+------------+-----------------+
|  1|     Valma|     Sans|Female|983107.0|         Mulyosari|   Indonesia|       Valma Sans|
|  2|     Paolo|   Kiddie|  Male|649173.0|           Niihama|       Japan|     Paolo Kiddie|
|  3|    Miltie| De Zuani|  Male|352898.0|         Dū Qal‘ah| Afghanistan|  Miltie De Zuani|
|  4|    Jarrid| Dalziell|  Male|170398.0|            Iberia|        Peru|  Jarrid Dalziell|
|  5| Reinaldos|   Keeffe|  Male|440989.0|          La Ronge|      Canada| Reinaldos Keeffe|
|  6|        Eb|Schwanden|  Male|274126.0|      Kuala Lumpur|    Malaysia|     Eb Schwanden|
|  7|    Alleyn|   Paddon|  Male|681914.0|         Al Qurayn|Saudi Arabia|    Alleyn Paddon|
|  8|   Baryram|     Yell|  Male|250748.0|           Jixiang|       Ch

### 4. renaming a column

In [25]:
# Rename the 'salary' column to 'income' and rebind people_df, so every later
# cell sees the column under its new name.
people_df = people_df.withColumnRenamed('salary', 'income')
# Bare expression as the last line: the cell output is the DataFrame's
# column-name/type summary, which confirms the rename.
people_df

DataFrame[id: int, first_name: string, last_name: string, gender: string, income: float, city: string, country: string]

### 5. limit()

In [27]:
# Restrict the DataFrame to 3 rows and display them.
people_df.limit(3).show()

+---+----------+---------+------+--------+---------+-----------+
| id|first_name|last_name|gender|  income|     city|    country|
+---+----------+---------+------+--------+---------+-----------+
|  1|     Valma|     Sans|Female|983107.0|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173.0|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898.0|Dū Qal‘ah|Afghanistan|
+---+----------+---------+------+--------+---------+-----------+



### 6. orderBy()
###### arrange data in required order

In [29]:
# Sort by income in ascending order (lowest income first) and display the result.
people_df.orderBy(['income'], ascending=True).show()

+---+----------+----------+------+-------+------------+--------------------+
| id|first_name| last_name|gender| income|        city|             country|
+---+----------+----------+------+-------+------------+--------------------+
| 93|      Cory|     Prigg|  Male|12876.0|     Gondang|           Indonesia|
|590|      Flem|  Tumielli|  Male|13347.0| Debre Zeyit|            Ethiopia|
|192|       Odo|   Conyers|  Male|15555.0|  Raffingora|            Zimbabwe|
|407|  Barbabas| Ballingal|  Male|18598.0|Beringinjaya|           Indonesia|
|297|     Daron|    Melato|Female|19881.0|      Phayao|            Thailand|
| 24|   Avigdor|   Goddman|  Male|20216.0|       Gujun|               China|
|315|    Alayne|    Foskin|Female|20390.0|     Siluman|           Indonesia|
|199|     Niles| Atcherley|  Male|22529.0|Nova Venécia|              Brazil|
|294|     Terri|    Holton|Female|23934.0|      Hitura|               Nepal|
|601|    Pattie|Bosomworth|Female|24967.0|   Virolahti|             Finland|

In [31]:
# Sort on two keys: country ascending, then income descending within each
# country (the `ascending` list lines up position-by-position with the columns).
people_df.orderBy(['country', 'income'], ascending=[True, False]).show()

+---+----------+------------+------+--------+------------------+--------------+
| id|first_name|   last_name|gender|  income|              city|       country|
+---+----------+------------+------+--------+------------------+--------------+
|490|  Cathlene|    Gatfield|Female|981605.0|           Mīrābād|   Afghanistan|
|448|      Yuri|     Duggary|  Male|414107.0|     Sang-e Māshah|   Afghanistan|
|  3|    Miltie|    De Zuani|  Male|352898.0|         Dū Qal‘ah|   Afghanistan|
|155|    Guntar|    Langmuir|  Male|290613.0|             Khōst|   Afghanistan|
|983|      Tiff|     Dreakin|Female|208548.0|             Āsmār|   Afghanistan|
|290|     Myles|      Britch|  Male|191508.0|         Dū Laīnah|   Afghanistan|
|419|   Ezekiel|   Fleetwood|  Male|163113.0|      Barakī Barak|   Afghanistan|
|701|    Gerrie|      Heigho|  Male|503327.0|             Föglö| Aland Islands|
|674|    Ludwig|    Bothwell|  Male|825171.0|         Martanesh|       Albania|
|421|    Hamnet|     Maruska|  Male|1296

### 7. temporary view
###### createOrReplaceTempView(_name_of_temp_view_)

In [32]:
# Troubleshooting: if this command raises an error, check whether multiple
# 'SparkSubmit' processes are running and kill them.

# Register bank_data under the view name 'bankData' so it can be queried
# with spark.sql(...) in the cells below.
bank_data.createOrReplaceTempView('bankData')

In [33]:
# Query the 'bankData' view with SQL and show the first 7 rows.
spark.sql('SELECT * FROM bankData').show(7)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [36]:
# Count the rows in the 'bankData' view; the result column is aliased as `count`.
spark.sql('SELECT COUNT(*) AS count FROM bankData').show()

+-----+
|count|
+-----+
|45211|
+-----+



In [48]:
# Q1: Show the top 10 youngest age group employees with maximum balance
#
# One row per age with that age's highest balance.
# NOTE(review): rows are ranked by the maximum balance (highest first); age is
# only a tie-breaker. Confirm this matches the "youngest" intent of the question.
q1_sql = """
        SELECT age, MAX(balance)
        FROM bankData
        GROUP BY age
        ORDER BY MAX(balance) DESC, age ASC 
    """

# Display only the first 10 rows of the ranked result.
spark.sql(q1_sql).show(10)

+---+------------+
|age|max(balance)|
+---+------------+
| 51|      102127|
| 59|       98417|
| 84|       81204|
| 60|       71188|
| 56|       66721|
| 52|       66653|
| 32|       59649|
| 44|       58544|
| 50|       57435|
| 43|       56831|
+---+------------+
only showing top 10 rows



In [49]:
# Q2: Show the worst 5 job types having minimum salary
#
# One row per job with that job's lowest balance, smallest value first.
# NOTE(review): the question says "salary", but the query uses the `balance`
# column; the bank dataset's printed schema has no salary column.
q2_sql = """
        SELECT job, MIN(balance)
        FROM bankData
        GROUP BY job
        ORDER BY MIN(balance)
    """

# Display only the first 5 rows of the ordered result.
spark.sql(q2_sql).show(5)

+-------------+------------+
|          job|min(balance)|
+-------------+------------+
|  blue-collar|       -8019|
|   management|       -6847|
|self-employed|       -3313|
|   technician|       -2827|
|     services|       -2122|
+-------------+------------+
only showing top 5 rows



In [None]:
# Q3: 