In [1]:
sc

In [2]:
spark

In [3]:
hr_employee = spark.read.csv('file:///home/hadoop/Downloads/HR_Employee.csv', inferSchema=True, header=True)

In [4]:
hr_employee.printSchema()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- JobInvolvement: string (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobSatisfaction: string (nullable = true)
 |-- Hourlyrate: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Salaryhike: integer (nullable = true)
 |-- OverTime: string (nullable = true)
 |-- Workex: integer (nullable = true)
 |-- YearsSinceLastPromotion: integer (nullable = true)
 |-- EmpSatisfaction: string (nullable = true)
 |-- TrainingTimesLastYear: integer (nullable = true)
 |-- WorkLifeBalance: string (nullable = true)
 |-- Performance_Rating: string (nul

## 1. BigData file types
#### a. Parquet File Format - records are stored columnunar in this format, compresses dataset of .csv of structured format into parquet format. Parquet format is good for query type of response
###### [more about different file formats](https://www.clairvoyant.ai/blog/big-data-file-formats)

In [9]:
hr_employee.rdd.getNumPartitions()

1

In [11]:
hr_employee.write.parquet('file:///home/hadoop/Downloads/HR_Parquet')

#### b. ORC

In [12]:
hr_employee.write.orc('/HR_orc')

In [13]:
spark.read.orc('/HR_orc/').show(10)

+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|EmployeeID|          Department|             JobRole|Attrition|Gender|Age|MaritalStatus|    Education|EducationField|   BusinessTravel|JobInvolvement|JobLevel|JobSatisfaction|Hourlyrate|Income|Salaryhike|OverTime|Workex|YearsSinceLastPromotion|EmpSatisfaction|TrainingTimesLastYear|WorkLifeBalance|Performance_Rating|
+----------+--------------------+--------------------+---------+------+---+-------------+-------------+--------------+-----------------+--------------+--------+---------------+----------+------+----------+--------+------+-----------------------+---------------+---------------------+---------------+------------------+
|         1|               Sales|     Sales

## Optimization Techniques

#### 1. Optimizing spark jobs can significantly improve performance of Spark running queries

## 2. Partitioning
#### Partitioning divide data into smaller chunks, which can be processed parallely

In [15]:
hr_employee.rdd.getNumPartitions()

1

In [16]:
partitioned_df = hr_employee.repartition(3)

In [17]:
partitioned_df.write.parquet('/HR_Partition')

## 3. Caching and Persistence
#### Managing different levels of storage

In [19]:
# in-memory cache storage
hr_employee.cache()

DataFrame[EmployeeID: int, Department: string, JobRole: string, Attrition: string, Gender: string, Age: int, MaritalStatus: string, Education: string, EducationField: string, BusinessTravel: string, JobInvolvement: string, JobLevel: int, JobSatisfaction: string, Hourlyrate: int, Income: int, Salaryhike: int, OverTime: string, Workex: int, YearsSinceLastPromotion: int, EmpSatisfaction: string, TrainingTimesLastYear: int, WorkLifeBalance: string, Performance_Rating: string]

In [21]:
# persistence of data storage with specific type of storage options like - memory only, memory_ser, or memeory_and_disk
from pyspark import StorageLevel
hr_employee1 = hr_employee.persist(StorageLevel.MEMORY_AND_DISK)

In [22]:
hr_employee2 = hr_employee.persist(StorageLevel.MEMORY_ONLY_SER)

## 4. Serialization
#### Efficient Serialization reduces time to read/write data and transfer it over network.
#### Kyro Serialization, Java Serialization  -- popular serialization method for betrer performance over defaulr jaa serialization

#### a. Java Serialization is the default serialization method. Easy to use, but drawback is that it will slow down the read-write process

In [35]:
from pyspark import SparkContext, SparkConf

In [5]:
spark = SparkSession.builder.appName('Java Serialization').getOrCreate()

In [6]:
spark.stop()

In [7]:
spark = SparkSession.builder.appName('PySpark serialization')\
.config('spark.serializer', 'org.apache.spark.serializer.JavaSerializer').getOrCreate()

In [8]:
spark

In [None]:
#### makes it faster and more compact than Java serialization

In [28]:
# dont run
#spark = SparkSession.builder\
#.config('spark.Serializer', ;arg.apache.spark.serializer.KyroSerializer)\
#.config('spark.kyro.registrationRequired', 'True')\
#.config('spark.kyro.classesToRegister', 'org.apache.spark.example.person')\
#
##.appName('Kyro Serialization').getOrCreate()

## 5. Broadcast Join
#### increase size of smaller one then join

In [9]:
small_df = spark.read.csv('file:///home/hadoop/Downloads/airports.csv')
df = spark.read.csv('file:///home/hadoop/Downloads/raw_flight_data.csv')

In [10]:
from pyspark.sql.functions import broadcast
broadcast_df = broadcast(small_df)

In [11]:
broadcast_df = broadcast_df.cache()
df = df.cache()

In [12]:
# broadcast join
airport_df = df.join(broadcast_df, df.OriginAirportID==broadcast_df.airport_id)

AttributeError: 'DataFrame' object has no attribute 'OriginAirportID'

In [None]:
airport_df.show()

## 6. Level of Parallelism

In [13]:
# adjust level of parallelism base don cluster size
spark.conf.set('spark.default.parallelism', 100)

## 7. Avoid GroupByKey()
#### use ReduceByKey() or aggregateByKey() instead of GroupByKey() to reduce number of shuffling

In [22]:
rdd = spark.sparkContext.parallelize([('dosa',2), ('idli',3), ('vada',3), ('rice',1), ('coffee',5), ('idli',4), ('vada',5)])

# option, this method is to be avoided
#rdd.groupByKey().mapValues(sum).collect()

rdd.reduceByKey(lambda x,y: x+y).collect()

[('dosa', 2), ('idli', 7), ('vada', 8), ('rice', 1), ('coffee', 5)]

In [28]:
from pyspark.sql.functions import sum

df = spark.createDataFrame([('dosa',2), ('idli',3), ('vada',3), ('rice',1), ('coffee',5), ('idli',4), 
                            ('vada',5)], schema=['order', 'value'])
df.groupBy('order').agg(sum('value').alias('total_value')).show()

+------+-----------+
| order|total_value|
+------+-----------+
|  vada|          8|
|  dosa|          2|
|  rice|          1|
|  idli|          7|
|coffee|          5|
+------+-----------+



In [29]:
df.rdd.reduceByKey(lambda x,y: x+y).collect()

[('dosa', 2), ('idli', 7), ('vada', 8), ('rice', 1), ('coffee', 5)]

## 8. Reduce Shuffle
#### Reduce number of shuffles by optimizing transformations
#### Use reduceByKey() over groupByKey()
#### use map() and reduce() over groupBy()

## 9. Repartition and Coalesce
    * Use accumulators for optimizing aggregate information like count(), sum() across all executors parallely executing tasks in multiple worker nodes.
    * accumulator in Spark are variables, that can be added through cumulative operations

In [56]:
# declare and initialize Accumulator
acc = spark.sparkContext.accumulator(0)

In [57]:
type(acc)

pyspark.accumulators.Accumulator

In [58]:
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

In [59]:
# python UDF
def add(x):
    acc.add(x)

In [60]:
rdd.foreach(add)

In [61]:
print(acc.value)

45


In [54]:
acc = spark.sparkContext.accumulator(0)
rdd = spark.sparkContext.parallelize([1,2,3,4,5,6,7,8,9])

def counter(x):
    # global acc  # this was used earlier when this block didnt reinitialize acc and used previous one from above
    acc.add(1)
    # return x

In [55]:
rdd.foreach(counter)
print(acc.value)

9


## 10. Bucketing
    * use bucketing to create large datasets for efficient queries as well as joins