In [1]:
sc

In [2]:
spark

In [3]:
people_df = spark.read.json("file:///home/hadoop/Downloads/People.json")

In [5]:
people_df.show(5)

+---------+-----------+----------+------+---+---------+------+
|     city|    country|first_name|gender| id|last_name|salary|
+---------+-----------+----------+------+---+---------+------+
|Mulyosari|  Indonesia|     Valma|Female|  1|     Sans|983107|
|  Niihama|      Japan|     Paolo|  Male|  2|   Kiddie|649173|
|Dū Qal‘ah|Afghanistan|    Miltie|  Male|  3| De Zuani|352898|
|   Iberia|       Peru|    Jarrid|  Male|  4| Dalziell|170398|
| La Ronge|     Canada| Reinaldos|  Male|  5|   Keeffe|440989|
+---------+-----------+----------+------+---+---------+------+
only showing top 5 rows



In [6]:
people_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- salary: long (nullable = true)



####  1. Create a user defined schema for fields of DataFrame

In [7]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField
from pyspark.sql.types import *

In [8]:
schema = StructType(
    [
        StructField("id", IntegerType(), True),
        StructField("first_name", StringType(), True),
        StructField("last_name", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("salary", LongType(), True),
        StructField("city", StringType(), True),
        StructField("country", StringType(), True)
        
    ]
)

In [9]:
people_df = spark.read.schema(schema).json("file:///home/hadoop/Downloads/People.json")

In [10]:
people_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)



In [12]:
# if the data has dictionary with key- value pairs are seperated by newline the we use multiLine = True

bank_data = spark.read.json("file:///home/hadoop/Downloads/bank_edited.json", multiLine=True)
bank_data.show()

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan| marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+--------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no| married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no|  single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes| married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no| married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown| 

In [13]:
bank_data.printSchema()

root
 |-- age: long (nullable = true)
 |-- balance: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: long (nullable = true)
 |-- default: string (nullable = true)
 |-- duration: long (nullable = true)
 |-- education: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- job: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- month: string (nullable = true)
 |-- pdays: long (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- previous: long (nullable = true)
 |-- y: string (nullable = true)



In [14]:
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField
from pyspark.sql.types import *

#### 2. Typecasting any one column

In [15]:
# Typecasting age from long to int
# we use withColumn for any transformation of the table
bank_data.withColumn('age',col('age').cast(IntegerType()))

DataFrame[age: int, balance: bigint, campaign: bigint, contact: string, day: bigint, default: string, duration: bigint, education: string, housing: string, job: string, loan: string, marital: string, month: string, pdays: bigint, poutcome: string, previous: bigint, y: string]

#### 3. creating new column from two strings

In [18]:
# lit() used for adding literal values, here adding space between firstname and last name

from pyspark.sql.functions import concat

people_df.withColumn('Full Name',concat(col('first_name'),lit(" "),col('last_name'))).show(2)

+---+----------+---------+------+------+---------+---------+------------+
| id|first_name|last_name|gender|salary|     city|  country|   Full Name|
+---+----------+---------+------+------+---------+---------+------------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|Indonesia|  Valma Sans|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|    Japan|Paolo Kiddie|
+---+----------+---------+------+------+---------+---------+------------+
only showing top 2 rows



#### 4. Renaming existing column

In [19]:
people_df = people_df.withColumnRenamed("salary","income")

In [20]:
people_df

DataFrame[id: int, first_name: string, last_name: string, gender: string, income: bigint, city: string, country: string]

#### 5. Limit()

In [21]:
people_df.limit(5).show()

+---+----------+---------+------+------+---------+-----------+
| id|first_name|last_name|gender|income|     city|    country|
+---+----------+---------+------+------+---------+-----------+
|  1|     Valma|     Sans|Female|983107|Mulyosari|  Indonesia|
|  2|     Paolo|   Kiddie|  Male|649173|  Niihama|      Japan|
|  3|    Miltie| De Zuani|  Male|352898|Dū Qal‘ah|Afghanistan|
|  4|    Jarrid| Dalziell|  Male|170398|   Iberia|       Peru|
|  5| Reinaldos|   Keeffe|  Male|440989| La Ronge|     Canada|
+---+----------+---------+------+------+---------+-----------+



####  OrderBy()

    * Arrange data in ascending or descending order

In [22]:
people_df.orderBy(["income"],ascending=True).show(5)

+---+----------+---------+------+------+------------+---------+
| id|first_name|last_name|gender|income|        city|  country|
+---+----------+---------+------+------+------------+---------+
| 93|      Cory|    Prigg|  Male| 12876|     Gondang|Indonesia|
|590|      Flem| Tumielli|  Male| 13347| Debre Zeyit| Ethiopia|
|192|       Odo|  Conyers|  Male| 15555|  Raffingora| Zimbabwe|
|407|  Barbabas|Ballingal|  Male| 18598|Beringinjaya|Indonesia|
|297|     Daron|   Melato|Female| 19881|      Phayao| Thailand|
+---+----------+---------+------+------+------------+---------+
only showing top 5 rows



In [24]:
people_df.orderBy(["country","income"],ascending=[True,False]).show(10)

+---+----------+---------+------+------+-------------+-------------+
| id|first_name|last_name|gender|income|         city|      country|
+---+----------+---------+------+------+-------------+-------------+
|490|  Cathlene| Gatfield|Female|981605|      Mīrābād|  Afghanistan|
|448|      Yuri|  Duggary|  Male|414107|Sang-e Māshah|  Afghanistan|
|  3|    Miltie| De Zuani|  Male|352898|    Dū Qal‘ah|  Afghanistan|
|155|    Guntar| Langmuir|  Male|290613|        Khōst|  Afghanistan|
|983|      Tiff|  Dreakin|Female|208548|        Āsmār|  Afghanistan|
|290|     Myles|   Britch|  Male|191508|    Dū Laīnah|  Afghanistan|
|419|   Ezekiel|Fleetwood|  Male|163113| Barakī Barak|  Afghanistan|
|701|    Gerrie|   Heigho|  Male|503327|        Föglö|Aland Islands|
|674|    Ludwig| Bothwell|  Male|825171|    Martanesh|      Albania|
|421|    Hamnet|  Maruska|  Male|129628|      Hoçisht|      Albania|
+---+----------+---------+------+------+-------------+-------------+
only showing top 10 rows



#### Materialized view
    * createOrReplaceTempView()
    * we use this because we cannot make changes to original warehouse

In [25]:
bank_data.createOrReplaceTempView('bankdata')

In [30]:
spark.sql("select * from bankdata").show(5)

+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
|age|balance|campaign|contact|day|default|duration|education|housing|         job|loan|marital|month|pdays|poutcome|previous|  y|
+---+-------+--------+-------+---+-------+--------+---------+-------+------------+----+-------+-----+-----+--------+--------+---+
| 58|   2143|       1|unknown|  5|     no|     261| tertiary|    yes|  management|  no|married|  may|   -1| unknown|       0| no|
| 44|     29|       1|unknown|  5|     no|     151|secondary|    yes|  technician|  no| single|  may|   -1| unknown|       0| no|
| 33|      2|       1|unknown|  5|     no|      76|secondary|    yes|entrepreneur| yes|married|  may|   -1| unknown|       0| no|
| 47|   1506|       1|unknown|  5|     no|      92|  unknown|    yes| blue-collar|  no|married|  may|   -1| unknown|       0| no|
| 33|      1|       1|unknown|  5|     no|     198|  unknown|     no|     unknown|  no| si

In [32]:
spark.sql("select count(*) as count from bankdata").show()

+-----+
|count|
+-----+
|45211|
+-----+



*  Show the top 10 Youngest age group  with maximum balance

In [55]:
spark.sql("select age, max(balance) from bankdata group by age order by age asc limit(10)").show()

+---+------------+
|age|max(balance)|
+---+------------+
| 18|        1944|
| 19|        5368|
| 20|        8860|
| 21|        8278|
| 22|       10971|
| 23|       19690|
| 24|       23878|
| 25|       16874|
| 26|       24299|
| 27|       24025|
+---+------------+



* show the worst 5 job type having minimum salary

In [57]:
spark.sql("select job, min(balance) from bankdata group by job order by min(balance) asc").show()

+-------------+------------+
|          job|min(balance)|
+-------------+------------+
|  blue-collar|       -8019|
|   management|       -6847|
|self-employed|       -3313|
|   technician|       -2827|
|     services|       -2122|
| entrepreneur|       -2082|
|    housemaid|       -1941|
|       admin.|       -1601|
|      retired|       -1598|
|   unemployed|       -1270|
|      student|        -679|
|      unknown|        -295|
+-------------+------------+

