In [1]:
sc

In [2]:
spark

In [14]:
hr_employee = spark.read.csv("file:///home/hadoop/Downloads/HR_Employee.csv", header= True, inferSchema=True)

In [4]:
hr_employee.printSchema()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- Hourlyrate: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Salaryhike: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Workex: integer (nullable = true)
 |-- YearsSinceLastPromotion: integer (nullable = true)
 |-- EmpSatisfaction: string (nullable = true)
 |-- TrainingTimesLastYear: integer (nullable = true)
 |-- WorkLifeBalance: string (nullable = true)
 |-- Performance_Rating: string (nul

### Big Data file types
    * Parquet file forMat - Records are stored in columnaR format, this file format compress dataset of .csv of structured format into parquet format. Parquet format is good for query type of response.
    * There are other file formats also like:- AVro, ORC

In [5]:
hr_employee.rdd.getNumPartitions()

1

In [15]:
hr_employee.write.parquet('file:///home/hadoop/Downloads/HR_Parquet')

AnalysisException: 'path file:/home/hadoop/Downloads/HR_Parquet already exists.;'

In [7]:
hr_employee.write.orc("/HR_Orc")

In [8]:
spark.read.orc("/HR_Orc").show(5)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

#### Optimization Techniques
    * OptImisIng spark jobs can significantly improvE performance of spark running queries , spark jobs.
    
    2. Partitioning
        * Partitioning divides data into smaller chunks, which can be processed parallely

In [9]:
hr_employee.rdd.getNumPartitions()

1

In [10]:
partitioned_df = hr_employee.repartition(3)

In [11]:
partitioned_df.write.parquet("/HR_Parition")

### 3. Caching and persistanceManaging 
    * MAnaging different levels of storage

In [16]:
# In Memory Cache storage.
hr_employee.cache()

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [17]:
# persistance of Data frame with specific type of storage options likE - memory only , memory_ser, mEmory and disk

from pyspark import StorageLevel
hr_employee1 = hr_employee.persist(StorageLevel.MEMORY_AND_DISK)

In [18]:
hr_employee2 = hr_employee.persist(StorageLevel.MEMORY_ONLY_SER)

### 4. Serialization

* Efficient serialization reduces time to read/write data and transfer it over network. Kyro serialization is more popular method for better performance over default Java Serialization


#### a)  Java Serialization
    * It is default serialization method . It is easy to use but drawback is it will slow down the read, write process.

In [19]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [20]:
spark = SparkSession.builder.appName("Java Serialization").getOrCreate()

In [21]:
spark.stop()

In [23]:
spark = SparkSession.builder.appName("Pyspark Serialization")\
.config("spark.serializer","org.apache.spark.serializer.JavaSerializer").getOrCreate()

In [24]:
spark

b) Kyro Serialization: faster, more compact than Java Serialisation

In [25]:
spark = SparkSession.builder\
.config("spark.serializer","org.apache.spark.serializer.KyroSerializer")\
.config("spark.kyro.registrationRequired","true")\
.config("spark.kyro.classesToRegister","org.apache.spark.example.Person")\
.appName("Kyro Serialization").getOrCreate()

5. Broadcast Joins
* Broadcasting small datasets improves join performance

In [34]:
small_df = spark.read.csv("file:///home/hadoop/Downloads/airports.csv", inferSchema=True, header=True)
df = spark.read.csv("file:///home/hadoop/Downloads/raw_flight_data.csv", inferSchema=True, header=True)

In [35]:
from pyspark.sql.functions import broadcast
broadcast_df = broadcast(small_df)

In [36]:
broadcast_df = broadcast_df.cache()
df = df.cache() 

In [39]:
airport_df = df.join(broadcast_df, df.OriginAirportID == broadcast_df.airport_id)

In [40]:
airport_df.show(5)

+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|airport_id|          city|state|                name|
+----------+---------+-------+---------------+-------------+--------+--------+----------+--------------+-----+--------------------+
|        19|        5|     DL|          11433|        13303|      -3|       1|     11433|       Detroit|   MI|Detroit Metro Way...|
|        19|        5|     DL|          14869|        12478|       0|      -8|     14869|Salt Lake City|   UT|Salt Lake City In...|
|        19|        5|     DL|          14057|        14869|      -4|     -15|     14057|      Portland|   OR|Portland Internat...|
|        19|        5|     DL|          15016|        11433|      28|      24|     15016|     St. Louis|   MO|Lambert-St. Louis...|
|        19|        5|     DL|          11193|        12892|      -6|     -1

#### 6. Level of Parallelism

In [41]:
# Adjust level of parallelism based on your cluster size
spark.conf.set("spark.default.parallelism", 100)

#### 7. Avoid GroupByKey
* Use ReduceByKey or aggregateByKey() instead of GroupByKey() to reduce number of shuffling

In [45]:
rdd = spark.sparkContext.parallelize([('dosa',2),('Idli',3),('vada',5),('rice',1),
                                    ('coffee',5),('Idli',3),('vada',3)])
rdd.groupByKey().mapValues(sum).collect()

[('dosa', 2), ('Idli', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [46]:
rdd.reduceByKey(lambda x,y:x+y).collect()

[('dosa', 2), ('Idli', 6), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [49]:
from pyspark.sql.functions import sum

df = spark.createDataFrame([('dosa',2),('Idli',3),('vada',5),('rice',1),
                                    ('coffee',5),('Idli',3),('vada',3),('sweets',7)], ['Order','Value'])
df.groupBy("Order").agg(sum("Value").alias("Total_Value")).show()

+------+-----------+
| Order|Total_Value|
+------+-----------+
|  Idli|          6|
|sweets|          7|
|  vada|          8|
|  dosa|          2|
|  rice|          1|
|coffee|          5|
+------+-----------+



In [50]:
df.rdd.reduceByKey(lambda x,y:x+y).collect()

[('sweets', 7),
 ('dosa', 2),
 ('Idli', 6),
 ('vada', 8),
 ('rice', 1),
 ('coffee', 5)]

### 8. Reduce Shuffle
    * Reduce the  number of shuffles by optimising transformations
    * Use reduceBykey() over groupByKey()
    * Use map() and reduce() over groupBy()

### 9. Repartition() and Coalesce()
* repartition to increase no of partition and coalesce() to reduce the number vof partitions

### 10. Accumulators 
* use accumulators for optimising aggregate information like count(), sum() accross all executors parallely executing tasks in multiple worker nodes.
* Accumulator in spark is a variable that can be added through cumulative operations.

In [52]:
# Declare and initialize an accumulator
acc = spark.sparkContext.accumulator(0)

In [53]:
type(acc)

pyspark.accumulators.Accumulator

In [54]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [55]:
# python udf
def add(x):
    acc.add(x)

In [56]:
rdd.foreach(add)

In [57]:
print(acc.value)

45


In [58]:
def counter(x):
    global acc
    acc.add(1)
    return x

In [59]:
rdd.map(counter).count()

9

### 11. Bucketing
* Use Bucketing to create buckets of large datasets for efficient query and joins