# SparkSession

In [None]:
!pip install pyspark py4j

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('demo').getOrCreate()
spark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# reading csv file

In [None]:
# Load the employee CSV: the first line is the header, column types are inferred,
# and the literal text 'null' is read as a missing value.
df = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .option('nullValue', 'null')
    .load('/content/employee.csv')
)

df.show(5)        # preview the first five rows
df.printSchema()  # column names and inferred types
df.count()        # last expression, so the row count is shown as the cell result

+-----+------+--------+----+----------+----+----+------+------------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+--------+----+----------+----+----+------+------------+
| 7369| SMITH|   CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES| MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|
+-----+------+--------+----+----------+----+----+------+------------+
only showing top 5 rows

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: string (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true

33

# Change data type of column

In [None]:
# NOTE: the wildcard import also brings in Spark functions that share names with
# Python builtins; later cells rely on the names it provides (col, to_date, ...).
from pyspark.sql.functions import *

# Add string-typed copies of EMPNO and SAL as the new columns 'Emp' and 'salary';
# the original integer columns are kept.
ch_df = (
    df
    .withColumn('Emp', col('EMPNO').cast('string'))
    .withColumn('salary', col('SAL').cast('string'))
)

ch_df.show(10)
ch_df.printSchema()

+-----+------+---------+----+----------+----+----+------+------------+----+------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE| Emp|salary|
+-----+------+---------+----+----------+----+----+------+------------+----+------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|7369|   800|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|7499|  1600|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|7521|  1250|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|7566|  2975|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|7654|  1250|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|7698|  2850|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|7782|  2450|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|7788|  3000|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|7839|  5000|
| 78

# Concatenating two columns

In [None]:
from pyspark.sql.functions import *

# Only the 'header' option is set, so no schema inference is requested for this read.
con_df = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/sample_data/california_housing_test.csv')
)

# concat() joins the two values back to back, with no separator in between.
# The new column is named 'Location' (previously misspelled 'Lcoation').
con_df.withColumn('Location', concat(col('longitude'), col('latitude'))).show()


+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+--------------------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|            Lcoation|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+--------------------+
|-122.050000|37.370000|         27.000000|3885.000000|    661.000000|1537.000000| 606.000000|     6.608500|     344700.000000|-122.05000037.370000|
|-118.300000|34.260000|         43.000000|1510.000000|    310.000000| 809.000000| 277.000000|     3.599000|     176500.000000|-118.30000034.260000|
|-117.810000|33.780000|         27.000000|3589.000000|    507.000000|1484.000000| 495.000000|     5.793400|     270500.000000|-117.81000033.780000|
|-118.360000|33.820000|         28.000000|  67.000000|     15.000000|  49.000000|  11.000000|     6.135900|     

# create year,month,day columns

In [None]:
# Check the column types before deriving date parts: HIREDATE is still a string here.
df.printSchema()

root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: string (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)
 |-- UPDATED_DATE: string (nullable = true)



In [None]:
# Replace the HIREDATE text (dd-MM-yyyy) with a parsed date column of the same name.
df1 = df.withColumn(
    'HIREDATE',
    to_date('HIREDATE', 'dd-MM-yyyy'),
)

df1.show(10)
df1.printSchema()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|1981-02-04|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|1981-09-21|1250| 600|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|1981-01-05|2850| 500|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|1981-09-06|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|1200|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|1981-08-09|1500|   0|    30|  01-02-2022|
+-----+------+---------+----+----------+----+----+------+------------+
only s

In [None]:
# Derive 'year', 'month' and 'day' from HIREDATE. date_format() returns text,
# so the values keep their leading zeros (e.g. '02', '04').
df2 = (
    df1
    .withColumn('year', date_format('HIREDATE', 'yyyy'))
    .withColumn('month', date_format('HIREDATE', 'MM'))
    .withColumn('day', date_format('HIREDATE', 'dd'))
)

df2.show(10)
df2.printSchema()
df2.count()  # last expression, so the row count is shown as the cell result

+-----+------+---------+----+----------+----+----+------+------------+----+-----+----+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|year|month| day|
+-----+------+---------+----+----------+----+----+------+------------+----+-----+----+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|  01-01-2022|1980|   12|  17|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|  02-01-2022|1981|   02|  20|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|  03-01-2022|1981|   02|  22|
| 7566| JONES|  MANAGER|7839|1981-02-04|2975|null|    20|  04-01-2022|1981|   02|  04|
| 7654|MARTIN| SALESMAN|7698|1981-09-21|1250| 600|    30|  05-01-2022|1981|   09|  21|
| 7698|   SGR|  MANAGER|7839|1981-01-05|2850| 500|    30|  06-01-2022|1981|   01|  05|
| 7782|  RAVI|  MANAGER|7839|1981-09-06|2450|null|    10|  07-01-2022|1981|   09|  06|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|1200|    20|  08-01-2022|1987|   04|  19|
| 7839|  KING|PRESIDENT|null|      null|500

33

# Create a DataFrame partitioned by year, month and day, and save it to the Employee_Hire table

In [None]:
# Save df2 as the table 'Employee_Hire', partitioned by year/month/day.
# saveAsTable() returns None, so the result is not bound to a variable.
# No save mode is set: the default mode reports an error if the table already exists.
df2.write.partitionBy('year', 'month', 'day').saveAsTable('Employee_Hire')

In [None]:
# Row count of the saved table, computed through Spark SQL.
(
    spark
    .sql('select count(*) from Employee_Hire')
    .show()
)

+--------+
|count(1)|
+--------+
|      33|
+--------+



# Dataframe Write Modes

overwrite – replaces the existing data with the data being written.

append – adds the new rows to the existing data.

ignore – silently skips the write when the target already exists.

error – the default mode; the write fails with an error when the target already exists.

In [None]:
# append – add the rows of df2 to the existing 'Employee_Hire' table.
# saveAsTable() returns None, so the result is not bound to a variable.

df2.write.partitionBy('year', 'month', 'day').mode('append').saveAsTable('Employee_Hire')


In [None]:
# overwrite – replace the contents of the existing 'Employee_Hire' table with df2.
# saveAsTable() returns None, so the result is not bound to a variable.

df2.write.partitionBy('year', 'month', 'day').mode('overwrite').saveAsTable('Employee_Hire')

In [None]:
# ignore – skip the write when 'Employee_Hire' already exists.
# saveAsTable() returns None, so the result is not bound to a variable.
df2.write.partitionBy('year', 'month', 'day').mode('ignore').saveAsTable('Employee_Hire')

In [None]:
# Show the rows of the saved table. The original query ended in a dangling
# `where` with no predicate, which is not valid SQL.
spark.sql('select * from Employee_Hire').show()

+-----+------+--------+----+----------+----+----+------+------------+----+-----+---+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|year|month|day|
+-----+------+--------+----+----------+----+----+------+------------+----+-----+---+
| 7844|TURNER|SALESMAN|7698|1981-08-09|1500|   0|    30|  01-02-2022|1981|   08| 09|
| 7844|TURNER|SALESMAN|7698|1981-08-09|1500|   0|    30|  07-02-2021|1981|   08| 09|
| 7844|TURNER|SALESMAN|7698|1981-08-09|1500|   0|    30|  01-02-2022|1981|   08| 09|
| 7844|TURNER|SALESMAN|7698|1981-08-09|1500|   0|    30|  07-02-2021|1981|   08| 09|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|  03-01-2022|1981|   02| 22|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|        null|1981|   02| 22|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|  03-01-2022|1981|   02| 22|
| 7521|  WARD|SALESMAN|7698|1981-02-22|1250| 500|    30|        null|1981|   02| 22|
| 7499| ALLEN|SALESMAN|7698|1981-02-20|1600| 300|    30|  02-01-2

In [None]:
# Row count of 'Employee_Hire' after the write-mode experiments above.
(
    spark
    .sql('select count(*) from Employee_Hire')
    .show()
)

+--------+
|count(1)|
+--------+
|      66|
+--------+



# Incremental loading

Load data from the source (daily)  ---->  Transform  ---->  Load into the warehouse

In [None]:
# Read the day-0 extract; only the header option is set for this read.

from pyspark.sql.functions import *

day0 = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/employee_day0.csv')
)


In [None]:
# Write the day-0 data into the warehouse table 'employee', partitioned by JOB.
# saveAsTable() returns None, so the result is not bound to a variable.

day0.write.partitionBy('JOB').saveAsTable('employee')

In [None]:
# Query the warehouse table: first its rows, then its row count.

for employee_query in ('select * from employee', 'select count(*) from employee'):
    spark.sql(employee_query).show()

+-----+------+----+----+----+------+---------+
|EMPNO| ENAME| MGR| SAL|COMM|DEPTNO|      JOB|
+-----+------+----+----+----+------+---------+
| 7499| ALLEN|7698|1600| 300|    30| SALESMAN|
| 7521|  WARD|7698|1250| 500|    30| SALESMAN|
| 7654|MARTIN|7698|1250| 600|    30| SALESMAN|
| 7844|TURNER|7698|1500|  10|    30| SALESMAN|
| 7369| SMITH|7902| 800| 100|    20|    CLERK|
| 7876| ADAMS|7788|1100| 100|    20|    CLERK|
| 7900| JAMES|7698| 950| 890|    30|    CLERK|
| 7934|MILLER|7782|1300| 740|    10|    CLERK|
| 7788| SCOTT|7566|3000|1200|    20|  ANALYST|
| 7902|  FORD|7566|3000| 400|    20|  ANALYST|
| 7566| JONES|7839|2975| 200|    20|  MANAGER|
| 7698|   SGR|7839|2850| 500|    30|  MANAGER|
| 7782|  RAVI|7839|2450| 500|    10|  MANAGER|
| 7839|  KING|7998|5000|1000|    10|PRESIDENT|
+-----+------+----+----+----+------+---------+

+--------+
|count(1)|
+--------+
|      14|
+--------+



In [None]:
# Read the day-1 extract; only the header option is set for this read.

day1 = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/employee_day1.csv')
)

day1.show()
day1.count()  # last expression, so the row count is shown as the cell result


+-----+------+---------+----+----+----+------+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|
+-----+------+---------+----+----+----+------+
| 1234|SEKHAR|   doctor|7777| 667|  78|    80|
| 7369| SMITH|    CLERK|7902| 800|  90|    20|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|2975| 100|    20|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|
| 7698|   SGR|  MANAGER|7839|2850| 200|    30|
| 7782|  RAVI|  MANAGER|7839|2450|  14|    10|
| 7788| SCOTT|  ANALYST|7566|3000| 180|    20|
| 7839|  KING|PRESIDENT|7888|5000| 140|    10|
| 7844|TURNER| SALESMAN|7698|1500|  10|    30|
| 7876| ADAMS|    CLERK|7788|1100| 300|    20|
| 7900| JAMES|    CLERK|7698| 950| 400|    30|
| 7902|  FORD|  ANALYST|7566|3000| 100|    20|
| 7934|MILLER|    CLERK|7782|1300| 100|    10|
| 1234|   RAM|    CLERK|7457| 494| 588|    80|
+-----+------+---------+----+----+----+------+



16

In [None]:
# Append the day-1 data to the warehouse table 'employee', partitioned by JOB.
# The whole day-1 DataFrame is appended as-is; nothing in this cell removes rows
# that are already in the table.
# saveAsTable() returns None, so the result is not bound to a variable.

day1.write.partitionBy('JOB').mode('append').saveAsTable('employee')


In [None]:
# Query the warehouse table after the append: first its rows, then its row count.

for employee_query in ('select * from employee', 'select count(*) from employee'):
    spark.sql(employee_query).show()

+-----+------+----+----+----+------+--------+
|EMPNO| ENAME| MGR| SAL|COMM|DEPTNO|     JOB|
+-----+------+----+----+----+------+--------+
| 7499| ALLEN|7698|1600| 300|    30|SALESMAN|
| 7521|  WARD|7698|1250| 500|    30|SALESMAN|
| 7654|MARTIN|7698|1250|1400|    30|SALESMAN|
| 7844|TURNER|7698|1500|  10|    30|SALESMAN|
| 7499| ALLEN|7698|1600| 300|    30|SALESMAN|
| 7521|  WARD|7698|1250| 500|    30|SALESMAN|
| 7654|MARTIN|7698|1250| 600|    30|SALESMAN|
| 7844|TURNER|7698|1500|  10|    30|SALESMAN|
| 7369| SMITH|7902| 800|  90|    20|   CLERK|
| 7876| ADAMS|7788|1100| 300|    20|   CLERK|
| 7900| JAMES|7698| 950| 400|    30|   CLERK|
| 7934|MILLER|7782|1300| 100|    10|   CLERK|
| 1234|   RAM|7457| 494| 588|    80|   CLERK|
| 7369| SMITH|7902| 800| 100|    20|   CLERK|
| 7876| ADAMS|7788|1100| 100|    20|   CLERK|
| 7900| JAMES|7698| 950| 890|    30|   CLERK|
| 7934|MILLER|7782|1300| 740|    10|   CLERK|
| 7788| SCOTT|7566|3000|1200|    20| ANALYST|
| 7902|  FORD|7566|3000| 400|    2

# Incremental loading

# Loading yesterday's and today's data into the warehouse tables

In [None]:

# Read the day-0 file and stamp every row with yesterday's date
# (the current date minus one day).

from pyspark.sql.functions import *

date_day0 = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/employee_day0.csv')
    .withColumn('Date', date_sub(current_date(), 1))
)

# Derive the partition columns from that load date.
date_day0 = (
    date_day0
    .withColumn('year', date_format('Date', 'yyyy'))
    .withColumn('Month', date_format('Date', 'MM'))
    .withColumn('Day', date_format('Date', 'dd'))
)

date_day0.show(truncate=False)

date_day0.count()


+-----+------+---------+----+----+----+------+----------+----+-----+---+
|EMPNO|ENAME |JOB      |MGR |SAL |COMM|DEPTNO|Date      |year|Month|Day|
+-----+------+---------+----+----+----+------+----------+----+-----+---+
|7369 |SMITH |CLERK    |7902|800 |100 |20    |2023-04-17|2023|04   |17 |
|7499 |ALLEN |SALESMAN |7698|1600|300 |30    |2023-04-17|2023|04   |17 |
|7521 |WARD  |SALESMAN |7698|1250|500 |30    |2023-04-17|2023|04   |17 |
|7566 |JONES |MANAGER  |7839|2975|200 |20    |2023-04-17|2023|04   |17 |
|7654 |MARTIN|SALESMAN |7698|1250|600 |30    |2023-04-17|2023|04   |17 |
|7698 |SGR   |MANAGER  |7839|2850|500 |30    |2023-04-17|2023|04   |17 |
|7782 |RAVI  |MANAGER  |7839|2450|500 |10    |2023-04-17|2023|04   |17 |
|7788 |SCOTT |ANALYST  |7566|3000|1200|20    |2023-04-17|2023|04   |17 |
|7839 |KING  |PRESIDENT|7998|5000|1000|10    |2023-04-17|2023|04   |17 |
|7844 |TURNER|SALESMAN |7698|1500|10  |30    |2023-04-17|2023|04   |17 |
|7876 |ADAMS |CLERK    |7788|1100|100 |20    |2023-

14

In [None]:
# Save the Day0 data to the warehouse table 'employee_day_wise',
# partitioned by year/Month/Day.
# saveAsTable() returns None, so the result is not bound to a variable.

date_day0.write.partitionBy('year', 'Month', 'Day').saveAsTable('employee_day_wise')

In [None]:
# Query the day-wise warehouse table: first its rows, then its row count.

for day_wise_query in (
    'select * from employee_day_wise',
    'select count(*) from employee_day_wise',
):
    spark.sql(day_wise_query).show()

+-----+------+---------+----+----+----+------+----------+----+-----+---+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|      Date|year|Month|Day|
+-----+------+---------+----+----+----+------+----------+----+-----+---+
| 7369| SMITH|    CLERK|7902| 800| 100|    20|2023-04-17|2023|   04| 17|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|2023-04-17|2023|   04| 17|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|2023-04-17|2023|   04| 17|
| 7566| JONES|  MANAGER|7839|2975| 200|    20|2023-04-17|2023|   04| 17|
| 7654|MARTIN| SALESMAN|7698|1250| 600|    30|2023-04-17|2023|   04| 17|
| 7698|   SGR|  MANAGER|7839|2850| 500|    30|2023-04-17|2023|   04| 17|
| 7782|  RAVI|  MANAGER|7839|2450| 500|    10|2023-04-17|2023|   04| 17|
| 7788| SCOTT|  ANALYST|7566|3000|1200|    20|2023-04-17|2023|   04| 17|
| 7839|  KING|PRESIDENT|7998|5000|1000|    10|2023-04-17|2023|   04| 17|
| 7844|TURNER| SALESMAN|7698|1500|  10|    30|2023-04-17|2023|   04| 17|
| 7876| ADAMS|    CLERK|7788|1100| 100|    20|2023-

In [None]:
# Read the day-1 file and stamp every row with today's date.

from pyspark.sql.functions import *

date_day1 = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/employee_day1.csv')
    .withColumn('Date', current_date())
)

# Derive the partition columns from the load date. The day column is named 'Day'
# (capitalised), the same spelling the day-0 DataFrame uses for it.
date_day1 = (
    date_day1
    .withColumn('year', date_format('Date', 'yyyy'))
    .withColumn('Month', date_format('Date', 'MM'))
    .withColumn('Day', date_format('Date', 'dd'))
)

date_day1.show()

date_day1.count()


+-----+------+---------+----+----+----+------+----------+----+-----+---+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|      Date|year|Month|day|
+-----+------+---------+----+----+----+------+----------+----+-----+---+
| 1234|SEKHAR|   doctor|7777| 667|  78|    80|2023-04-18|2023|   04| 18|
| 7369| SMITH|    CLERK|7902| 800|  90|    20|2023-04-18|2023|   04| 18|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|2023-04-18|2023|   04| 18|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|2023-04-18|2023|   04| 18|
| 7566| JONES|  MANAGER|7839|2975| 100|    20|2023-04-18|2023|   04| 18|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|2023-04-18|2023|   04| 18|
| 7698|   SGR|  MANAGER|7839|2850| 200|    30|2023-04-18|2023|   04| 18|
| 7782|  RAVI|  MANAGER|7839|2450|  14|    10|2023-04-18|2023|   04| 18|
| 7788| SCOTT|  ANALYST|7566|3000| 180|    20|2023-04-18|2023|   04| 18|
| 7839|  KING|PRESIDENT|7888|5000| 140|    10|2023-04-18|2023|   04| 18|
| 7844|TURNER| SALESMAN|7698|1500|  10|    30|2023-

16

In [None]:
# Append the Day1 data to the warehouse table 'employee_day_wise'.
# The partition columns are spelled year/Month/Day, as in the Day0 write.
# saveAsTable() returns None, so the result is not bound to a variable.

date_day1.write.partitionBy('year', 'Month', 'Day').mode('append').saveAsTable('employee_day_wise')

In [None]:
# Query the day-wise warehouse table after the append: rows first, then row count.

for day_wise_query in (
    'select * from employee_day_wise',
    'select count(*) from employee_day_wise',
):
    spark.sql(day_wise_query).show()

+-----+------+---------+----+----+----+------+----------+----+-----+---+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|      Date|year|Month|Day|
+-----+------+---------+----+----+----+------+----------+----+-----+---+
| 1234|SEKHAR|   doctor|7777| 667|  78|    80|2023-04-18|2023|   04| 18|
| 7369| SMITH|    CLERK|7902| 800|  90|    20|2023-04-18|2023|   04| 18|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|2023-04-18|2023|   04| 18|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|2023-04-18|2023|   04| 18|
| 7566| JONES|  MANAGER|7839|2975| 100|    20|2023-04-18|2023|   04| 18|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|2023-04-18|2023|   04| 18|
| 7698|   SGR|  MANAGER|7839|2850| 200|    30|2023-04-18|2023|   04| 18|
| 7782|  RAVI|  MANAGER|7839|2450|  14|    10|2023-04-18|2023|   04| 18|
| 7788| SCOTT|  ANALYST|7566|3000| 180|    20|2023-04-18|2023|   04| 18|
| 7839|  KING|PRESIDENT|7888|5000| 140|    10|2023-04-18|2023|   04| 18|
| 7844|TURNER| SALESMAN|7698|1500|  10|    30|2023-

# How to skip the first few rows? (Detailed, step by step)



In [139]:
# Read the file as plain text and pair every line with its zero-based position,
# giving (line, index) tuples.
Rdd = (
    spark.sparkContext
    .textFile('/content/emp_pipe_skip.txt')
    .zipWithIndex()
)

Rdd.collect()

[('line 1', 0),
 ('line 2', 1),
 ('line 3', 2),
 ('EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|UPDATED_DATE', 3),
 ('7369|SMITH|CLERK|7902|17-12-1980|800|null|20|01-01-2022', 4),
 ('7499|ALLEN|SALESMAN|7698|20-02-1981|1600|300|30|02-01-2022', 5),
 ('7521|WARD|SALESMAN|7698|22-02-1981|1250|500|30|03-01-2022', 6),
 ('7566|JONES|MANAGER|7839|04-02-1981|2975|null|20|04-01-2022', 7),
 ('7654|MARTIN|SALESMAN|7698|21-09-1981|1250|1400|30|05-01-2022', 8),
 ('7698|SGR|MANAGER|7839|05-01-1981|2850|null|30|06-01-2022', 9),
 ('7782|RAVI|MANAGER|7839|06-09-1981|2450|null|10|07-01-2022', 10),
 ('7788|SCOTT|ANALYST|7566|19-04-1987|3000|null|20|08-01-2022', 11),
 ('7839|KING|PRESIDENT|null|null|5000|null|10|null', 12),
 ('7844|TURNER|SALESMAN|7698|09-08-1981|1500|0|30|01-02-2022', 13),
 ('7876|ADAMS|CLERK|7788|23-05-1987|1100|null|20|02-02-2022', 14),
 ('7900|JAMES|CLERK|7698|12-03-1981|950|null|30|03-02-2022', 15),
 ('7902|FORD|ANALYST|7566|12-03-1981|3000|null|20|04-02-2022', 16),
 ('7934|MILLER|CL

In [152]:
# Keep only the pairs whose index is greater than 2, i.e. drop the first three lines.
Rdd1 = Rdd.filter(
    lambda line_with_index: line_with_index[1] > 2
)
Rdd1.collect()

[('EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|UPDATED_DATE', 3),
 ('7369|SMITH|CLERK|7902|17-12-1980|800|null|20|01-01-2022', 4),
 ('7499|ALLEN|SALESMAN|7698|20-02-1981|1600|300|30|02-01-2022', 5),
 ('7521|WARD|SALESMAN|7698|22-02-1981|1250|500|30|03-01-2022', 6),
 ('7566|JONES|MANAGER|7839|04-02-1981|2975|null|20|04-01-2022', 7),
 ('7654|MARTIN|SALESMAN|7698|21-09-1981|1250|1400|30|05-01-2022', 8),
 ('7698|SGR|MANAGER|7839|05-01-1981|2850|null|30|06-01-2022', 9),
 ('7782|RAVI|MANAGER|7839|06-09-1981|2450|null|10|07-01-2022', 10),
 ('7788|SCOTT|ANALYST|7566|19-04-1987|3000|null|20|08-01-2022', 11),
 ('7839|KING|PRESIDENT|null|null|5000|null|10|null', 12),
 ('7844|TURNER|SALESMAN|7698|09-08-1981|1500|0|30|01-02-2022', 13),
 ('7876|ADAMS|CLERK|7788|23-05-1987|1100|null|20|02-02-2022', 14),
 ('7900|JAMES|CLERK|7698|12-03-1981|950|null|30|03-02-2022', 15),
 ('7902|FORD|ANALYST|7566|12-03-1981|3000|null|20|04-02-2022', 16),
 ('7934|MILLER|CLERK|7782|01-03-1982|1300|null|10|05-02-2022', 17

In [155]:
# Discard the index and split each line on the '|' separator,
# turning every record into a list of field strings.

Rdd2 = Rdd1.map(
    lambda line_with_index: line_with_index[0].split('|')
)

Rdd2.collect()


[['EMPNO',
  'ENAME',
  'JOB',
  'MGR',
  'HIREDATE',
  'SAL',
  'COMM',
  'DEPTNO',
  'UPDATED_DATE'],
 ['7369',
  'SMITH',
  'CLERK',
  '7902',
  '17-12-1980',
  '800',
  'null',
  '20',
  '01-01-2022'],
 ['7499',
  'ALLEN',
  'SALESMAN',
  '7698',
  '20-02-1981',
  '1600',
  '300',
  '30',
  '02-01-2022'],
 ['7521',
  'WARD',
  'SALESMAN',
  '7698',
  '22-02-1981',
  '1250',
  '500',
  '30',
  '03-01-2022'],
 ['7566',
  'JONES',
  'MANAGER',
  '7839',
  '04-02-1981',
  '2975',
  'null',
  '20',
  '04-01-2022'],
 ['7654',
  'MARTIN',
  'SALESMAN',
  '7698',
  '21-09-1981',
  '1250',
  '1400',
  '30',
  '05-01-2022'],
 ['7698',
  'SGR',
  'MANAGER',
  '7839',
  '05-01-1981',
  '2850',
  'null',
  '30',
  '06-01-2022'],
 ['7782',
  'RAVI',
  'MANAGER',
  '7839',
  '06-09-1981',
  '2450',
  'null',
  '10',
  '07-01-2022'],
 ['7788',
  'SCOTT',
  'ANALYST',
  '7566',
  '19-04-1987',
  '3000',
  'null',
  '20',
  '08-01-2022'],
 ['7839', 'KING', 'PRESIDENT', 'null', 'null', '5000', 'null'

In [161]:
# Column headers are the first record of Rdd2. first() returns that record
# directly, so the whole RDD does not have to be collected to the driver
# just to index element 0.
columns = Rdd2.first()

print(columns)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [162]:
# Capture the header row so that it can be filtered out of the data later.
# (The variable was previously misspelled 'kipline', which left 'skipline'
# undefined for the print below on a fresh kernel.)
skipline = Rdd2.first()

print(skipline)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [163]:
# Build a DataFrame from the RDD: drop every record equal to the header row
# and use the header values as the column names.

DF = (
    Rdd2
    .filter(lambda record: record != skipline)
    .toDF(columns)
)

DF.show()

DF.printSchema()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  02-02-2022|
| 7900

# How to skip rows in a given file?

In [164]:
# The same steps as one pipeline: read the text file, index its lines,
# drop the lines with index 0-2, and split the remaining lines on '|'.
r1 = (
    spark.sparkContext
    .textFile('/content/emp_pipe_skip.txt')
    .zipWithIndex()
    .filter(lambda line_with_index: line_with_index[1] > 2)
    .map(lambda line_with_index: line_with_index[0].split('|'))
)

r1.collect()

[['EMPNO',
  'ENAME',
  'JOB',
  'MGR',
  'HIREDATE',
  'SAL',
  'COMM',
  'DEPTNO',
  'UPDATED_DATE'],
 ['7369',
  'SMITH',
  'CLERK',
  '7902',
  '17-12-1980',
  '800',
  'null',
  '20',
  '01-01-2022'],
 ['7499',
  'ALLEN',
  'SALESMAN',
  '7698',
  '20-02-1981',
  '1600',
  '300',
  '30',
  '02-01-2022'],
 ['7521',
  'WARD',
  'SALESMAN',
  '7698',
  '22-02-1981',
  '1250',
  '500',
  '30',
  '03-01-2022'],
 ['7566',
  'JONES',
  'MANAGER',
  '7839',
  '04-02-1981',
  '2975',
  'null',
  '20',
  '04-01-2022'],
 ['7654',
  'MARTIN',
  'SALESMAN',
  '7698',
  '21-09-1981',
  '1250',
  '1400',
  '30',
  '05-01-2022'],
 ['7698',
  'SGR',
  'MANAGER',
  '7839',
  '05-01-1981',
  '2850',
  'null',
  '30',
  '06-01-2022'],
 ['7782',
  'RAVI',
  'MANAGER',
  '7839',
  '06-09-1981',
  '2450',
  'null',
  '10',
  '07-01-2022'],
 ['7788',
  'SCOTT',
  'ANALYST',
  '7566',
  '19-04-1987',
  '3000',
  'null',
  '20',
  '08-01-2022'],
 ['7839', 'KING', 'PRESIDENT', 'null', 'null', '5000', 'null'

In [166]:
# The first record of r1 is the header row. Fetch it once with first() instead
# of collecting the entire RDD to the driver twice.
columns = r1.first()
skipline = list(columns)  # separate list with the same values, used to filter the header out
print(columns)
print(skipline)

['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']
['EMPNO', 'ENAME', 'JOB', 'MGR', 'HIREDATE', 'SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']


In [170]:
# Drop every record equal to the header row, then name the columns from the header.
sk_df = (
    r1
    .filter(lambda record: record != skipline)
    .toDF(columns)
)

sk_df.show(10)
print(sk_df.count())
sk_df.printSchema()
print(type(sk_df))


+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
+-----+------+---------+----+----------+----+----+------+------------+
only s

# how to remove duplicates

1.   distinct()
2.   dropDuplicates()/drop_duplicates()
3.   window function with row_number()
4.   groupBy with Count()





In [197]:
# Load the file that contains repeated employee rows (header + inferred types).
dup = (
    spark.read.format('csv')
    .option('header', True)
    .option('inferSchema', True)
    .load('/content/employee_dup.csv')
)
dup.show()
dup.count()  # last expression, so the row count is shown as the cell result

+-----+------+---------+----+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----+----+------+------------+
| 7369| SMITH|    CLERK|7902| 800|null|    20|  01-01-2021|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|  02-01-2021|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|  03-01-2021|
| 7566| JONES|  MANAGER|7839|2975|null|    20|  04-01-2021|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|  05-01-2021|
| 7698|   SGR|  MANAGER|7839|2850|null|    30|  06-01-2021|
| 7782|  RAVI|  MANAGER|7839|2450|null|    10|  07-01-2021|
| 7788| SCOTT|  ANALYST|7566|3000|null|    20|  08-01-2021|
| 7839|  KING|PRESIDENT|null|5000|null|    10|        null|
| 7369| SMITH|    CLERK|7902| 800|null|    20|  01-01-2021|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|  02-01-2021|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|  03-01-2021|
| 7566| JONES|  MANAGER|7839|2975|null|    20|  04-01-2021|
| 7654|MARTIN| SALESMAN|7698|1250|1400| 

18

In [198]:
# distinct() keeps one copy of every row that is identical across all columns.
dist_df = dup.distinct()

dist_df.show()
dist_df.count()  # last expression, so the row count is shown as the cell result

+-----+------+---------+----+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----+----+------+------------+
| 7782|  RAVI|  MANAGER|7839|2450|null|    10|  07-01-2021|
| 7369| SMITH|    CLERK|7902| 800|null|    20|  01-01-2021|
| 7566| JONES|  MANAGER|7839|2975|null|    20|  04-01-2021|
| 7839|  KING|PRESIDENT|null|5000|null|    10|        null|
| 7698|   SGR|  MANAGER|7839|2850|null|    30|  06-01-2021|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|  02-01-2021|
| 7788| SCOTT|  ANALYST|7566|3000|null|    20|  08-01-2021|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|  03-01-2021|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|  05-01-2021|
+-----+------+---------+----+----+----+------+------------+



9

In [199]:
# dropDuplicates(['EMPNO']) keeps a single row per EMPNO. To favour the latest
# record, sort by UPDATED_DATE (newest first) before dropping; sorting by EMPNO
# itself cannot influence which of the duplicates is kept.
# UPDATED_DATE is parsed with to_date() because its values are dd-MM-yyyy text,
# which does not sort chronologically as a plain string.
# NOTE(review): Spark does not document that dropDuplicates keeps the first row
# of a preceding sort — use row_number() over a window when the choice must be
# deterministic.

drop_df = (
    dup
    .orderBy(to_date(col('UPDATED_DATE'), 'dd-MM-yyyy').desc())
    .dropDuplicates(['EMPNO'])
)
drop_df.show()

+-----+------+---------+----+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----+----+------+------------+
| 7369| SMITH|    CLERK|7902| 800|null|    20|  01-01-2021|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|  02-01-2021|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|  03-01-2021|
| 7566| JONES|  MANAGER|7839|2975|null|    20|  04-01-2021|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|  05-01-2021|
| 7698|   SGR|  MANAGER|7839|2850|null|    30|  06-01-2021|
| 7782|  RAVI|  MANAGER|7839|2450|null|    10|  07-01-2021|
| 7788| SCOTT|  ANALYST|7566|3000|null|    20|  08-01-2021|
| 7839|  KING|PRESIDENT|null|5000|null|    10|        null|
+-----+------+---------+----+----+----+------+------------+



In [205]:
# Window function with row_number()

from pyspark.sql.window import Window

# Number the rows of each EMPNO, newest UPDATED_DATE first. The dd-MM-yyyy text
# is parsed with to_date() so that the ordering is chronological. Ordering by
# EMPNO (the partition key itself) would leave the numbering among duplicates
# arbitrary, because every row in a partition has the same EMPNO.
latest_first = Window.partitionBy('EMPNO').orderBy(
    to_date(col('UPDATED_DATE'), 'dd-MM-yyyy').desc()
)

win_df = dup.withColumn('row', row_number().over(latest_first))


# Good data: the first-ranked row of every EMPNO.
Unique_df = win_df.filter(col('row') == 1)

Unique_df.show()
print(Unique_df.count())

# Bad data: every remaining row, i.e. the duplicates.
dupli_df = win_df.filter(col('row') != 1)

dupli_df.show()
print(dupli_df.count())


+-----+------+---------+----+----+----+------+------------+---+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|UPDATED_DATE|row|
+-----+------+---------+----+----+----+------+------------+---+
| 7369| SMITH|    CLERK|7902| 800|null|    20|  01-01-2021|  1|
| 7499| ALLEN| SALESMAN|7698|1600| 300|    30|  02-01-2021|  1|
| 7521|  WARD| SALESMAN|7698|1250| 500|    30|  03-01-2021|  1|
| 7566| JONES|  MANAGER|7839|2975|null|    20|  04-01-2021|  1|
| 7654|MARTIN| SALESMAN|7698|1250|1400|    30|  05-01-2021|  1|
| 7698|   SGR|  MANAGER|7839|2850|null|    30|  06-01-2021|  1|
| 7782|  RAVI|  MANAGER|7839|2450|null|    10|  07-01-2021|  1|
| 7788| SCOTT|  ANALYST|7566|3000|null|    20|  08-01-2021|  1|
| 7839|  KING|PRESIDENT|null|5000|null|    10|        null|  1|
+-----+------+---------+----+----+----+------+------------+---+

9
+-----+------+---------+----+----+----+------+------------+---+
|EMPNO| ENAME|      JOB| MGR| SAL|COMM|DEPTNO|UPDATED_DATE|row|
+-----+------+---------+----+----+---

# How to read all files from a single folder

In [None]:
# Point the reader at a directory rather than at a single file.
# NOTE(review): the saved output of this cell is an AnalysisException — confirm
# that '/content/Data/' exists and directly contains CSV files before re-running.
da = (
    spark.read.format('csv')
    .option('header', True)
    .option('nullValue', 'null')
    .option('inferSchema', True)
    .load('/content/Data/')
)

da.show()
da.count()  # last expression, so the row count is shown as the cell result

AnalysisException: ignored

# How to read all files from a folder inside another folder (nested/sub-folders)

In [None]:
# Same read as for a single folder, with the 'recursiveFileLookup' option
# switched on for the nested sub-folder case.
ne_da = (
    spark.read.format('csv')
    .option('recursiveFileLookup', True)
    .option('header', True)
    .option('nullValue', 'null')
    .option('inferSchema', True)
    .load('/content/Data')
)


ne_da.show()
ne_da.count()  # last expression, so the row count is shown as the cell result

# print file name with location

In [216]:
from pyspark.sql.functions import *
# Preview the current `df` before tagging it.
df.show(10)

# input_file_name() gives, for each row, the path of the file it was read from;
# it is attached here as the extra column 'file_Location'.
source_path = input_file_name()
fi_na = df.withColumn('file_Location', source_path)

fi_na.show(10)





+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
+-----+------+---------+----+----------+----+----+------+------------+
only s

# how to get no.of rows for each file?

In [219]:
# Rows per source file: group on the file path and count each group.
# truncate=False keeps the full path visible in the printed table.
(
    fi_na
    .groupBy('file_Location')
    .count()
    .show(truncate=False)
)

+-------------------------------+-----+
|file_Location                  |count|
+-------------------------------+-----+
|file:/content/emp_pipe_skip.txt|19   |
+-------------------------------+-----+



# how to get no.of rows in each partition

In [220]:
# Rows per partition: replace every row by the id of the partition holding it,
# then count how many rows carry each id.
(
    fi_na
    .select(spark_partition_id().alias('id'))
    .groupBy('id')
    .count()
    .show()
)

+---+-----+
| id|count|
+---+-----+
|  0|   19|
+---+-----+



In [221]:
# Same per-partition count, taken after redistributing the rows over
# three partitions with repartition(3).
(
    fi_na
    .repartition(3)
    .select(spark_partition_id().alias('id'))
    .groupBy('id')
    .count()
    .show()
)

+---+-----+
| id|count|
+---+-----+
|  0|    7|
|  1|    6|
|  2|    6|
+---+-----+



# How to add/generate a sequence id / surrogate key as a column


1.   monotonically_increasing_id()
2.   row_number() - window function
3.   crc32   - 32-bit checksum (a deterministic hash, not a random number)
4.   md5     - hash key generator function
5.   sha1 and sha2 - hash key generator functions








In [232]:
# Base frame for the surrogate-key examples: employee rows read as strings
# (no inferSchema), with any row containing a null removed and one row kept
# per EMPNO.
# NOTE(review): the descending sort runs before dropDuplicates; the output
# shown below is in ascending EMPNO order, so the sort does not appear to
# carry through to the final result.
employee_rows = (
    spark.read.format('csv')
    .options(header=True, nullValue='null')
    .load('/content/employee.csv')
)
surr = (
    employee_rows
    .dropna(how='any')
    .orderBy(col('EMPNO').desc())
    .dropDuplicates(['EMPNO'])
)
surr.show()
surr.count()


+-----+------+--------+----+----------+----+----+------+------------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+--------+----+----------+----+----+------+------------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|
| 7902|  FORD| ANALYST|7566|12-03-1981|3000| 400|    20|  04-02-2022|
| 7934|MILLER|   CLERK|7782|01-03-1982|1300| 100|    10|  02-01-2021|
+-----+------+--------+----+----------+----+----+------+------------+



10

# monotonically_increasing_id

In [235]:
# monotonically_increasing_id(): attach a generated 64-bit id to every row.
# Per the PySpark docs the ids are unique and increasing but not guaranteed
# to be consecutive; the frame shown here yields 0, 1, 2, ...
mono = surr.select('*', monotonically_increasing_id().alias('Mono_id'))
mono.show()


# Custom starting id: shift the generated id by one so numbering begins at 1.
shifted_id = monotonically_increasing_id() + 1
mono1 = surr.select('*', shifted_id.alias('mono_id'))
mono1.show()

+-----+------+--------+----+----------+----+----+------+------------+-------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|Mono_id|
+-----+------+--------+----+----------+----+----+------+------------+-------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|      0|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|      1|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|      2|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|      3|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|      4|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|      5|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|      6|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|      7|
| 7902|  FORD| ANALYST|7566|12-03-1981|3000| 400|    20|  04-02-2022|      8|
| 7934|MILLER|   CLERK|7782|01-03-1982|1300| 100|    10|  02-01-

#  row_number

In [240]:
from pyspark.sql.functions import *

from pyspark.sql.window import *

# Sequential surrogate key (1, 2, 3, ...) with row_number().
# The window is ordered by a real column: ordering by a constant literal makes
# every row tie, so the number each row receives is arbitrary and can change
# between runs. EMPNO is cast to int because `surr` is read without
# inferSchema, i.e. as a string, and the sort should be numeric.
# No partitionBy is given, so the whole frame is numbered as one group, just
# as the original constant partition did.
by_empno = Window.orderBy(col('EMPNO').cast('int'))

row_df = surr.withColumn('row_num', row_number().over(by_empno))

row_df.show()

+-----+------+--------+----+----------+----+----+------+------------+-------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|row_num|
+-----+------+--------+----+----------+----+----+------+------------+-------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|      1|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|      2|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|      3|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|      4|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|      5|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|      6|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|      7|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|      8|
| 7902|  FORD| ANALYST|7566|12-03-1981|3000| 400|    20|  04-02-2022|      9|
| 7934|MILLER|   CLERK|7782|01-03-1982|1300| 100|    10|  02-01-

# crc32 

1. Generates a 32-bit checksum (a deterministic hash, not a random number) - it works only on string/binary data types.

2. We should not use crc32 for surrogate key generation on larger tables, because it can produce duplicate keys once there are more than 100k/1M records.



In [None]:
from pyspark.sql.functions import *

# crc32 checksum of EMPNO, cast to string first, stored in the new column 'crc'.
empno_checksum = crc32(col('EMPNO').cast('string'))
crc_df = surr.select('*', empno_checksum.alias('crc'))

crc_df.show()

+-----+------+--------+----+----------+----+----+------+------------+----------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|     crc32|
+-----+------+--------+----+----------+----+----+------+------------+----------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|1046173690|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|3535170612|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|4024152101|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|1255715586|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|3000238442|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|1375856756|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|4234062958|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022| 964533889|
| 7902|  FORD| ANALYST|7566|12-03-1981|3000| 400|    20|  04-02-2022|1888724584|
| 7934|MILLER|   CLERK|7782|

In [None]:

# crc32 key generated from two columns (latitude + longitude).
# Rows are numbered within each CRC_32 value; keeping only numbers above 1
# leaves the rows whose CRC_32 already occurred on another row.
same_crc = Window.partitionBy('CRC_32').orderBy('CRC_32')

crc_df = (
    spark.read.format('csv')
    .option('header', True)
    .load('/content/sample_data/california_housing_test.csv')
    .withColumn('Location', concat(col('latitude'), col('longitude')))
    .withColumn('CRC_32', crc32(col('Location')))
    .withColumn('Duplicates', row_number().over(same_crc))
    .filter(col('Duplicates') > 1)
)
crc_df.show()
crc_df.count()

+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+--------------------+---------+----------+
|  longitude| latitude|housing_median_age|total_rooms|total_bedrooms| population| households|median_income|median_house_value|            Location|   CRC_32|Duplicates|
+-----------+---------+------------------+-----------+--------------+-----------+-----------+-------------+------------------+--------------------+---------+----------+
|-118.380000|34.050000|         35.000000|3517.000000|    879.000000|1632.000000| 784.000000|     3.095600|     500001.000000|34.050000-118.380000| 11051636|         2|
|-122.540000|37.900000|         48.000000|2491.000000|    460.000000| 937.000000| 455.000000|     4.437500|     370000.000000|37.900000-122.540000| 66409223|         2|
|-118.400000|33.870000|         26.000000|6712.000000|   1441.000000|2803.000000|1394.000000|     5.227600|     434500.000000|33.870000-118.400000|10611730

288

# md5

In [251]:
# md5 - 128-bit digest, returned as a 32-character hex string
# author's guidance: not suggested beyond ~1 million records (risk of duplicate keys)

from pyspark.sql.functions import *

# Hash the string form of EMPNO into the new column 'md'.
empno_digest = md5(col('EMPNO').cast('string'))
md_df = surr.select('*', empno_digest.alias('md'))

md_df.show()

+-----+------+--------+----+----------+----+----+------+------------+--------------------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|                  md|
+-----+------+--------+----+----------+----+----+------+------------+--------------------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|7a2b33c672ce223b2...|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|e1e1f667ce4596e56...|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|e2a7555f7cabd6e31...|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|c570c225d1fb8a72a...|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|ac5c482277858d6fe...|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|866c7ee013c58f01f...|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|b356e7aed7ee82589...|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|42dab56861d81108e...|

# sha2

In [None]:
# sha2 - SHA-2 family hash key, returned as a hex string
# suggested for huge data; the second argument is the digest size (256 or 512 bits here)

from pyspark.sql.functions import *

# The column is referenced as 'EMPNO', the exact header spelling used by every
# other cell. The previous 'EmpNO' only resolved because Spark matches column
# names case-insensitively by default (spark.sql.caseSensitive=false).
empno_text = col('EMPNO').cast('string')

sha_df = surr.withColumn('sha', sha2(empno_text, 256))
sha_df1 = surr.withColumn('sha', sha2(empno_text, 512))
sha_df.show(truncate=False)
sha_df1.show(truncate=False)

+-----+------+--------+----+----------+----+----+------+------------+--------------------+
|EMPNO| ENAME|     JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|                sha2|
+-----+------+--------+----+----------+----+----+------+------------+--------------------+
| 7499| ALLEN|SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|4427dc2e32a1d099d...|
| 7521|  WARD|SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|74ed8ca63e8b4fb8b...|
| 7654|MARTIN|SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|b969b01c158ebfecd...|
| 7698|   SGR| MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|83f9d8d707524a4f5...|
| 7782|  RAVI| MANAGER|7839|06-09-1981|2450|  14|    10|  04-02-2021|d7b6fab9aa91943de...|
| 7788| SCOTT| ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|16740bf13991fe083...|
| 7844|TURNER|SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|b513a7ff5978b9588...|
| 7876| ADAMS|   CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|05a4cd58579909328...|

# how to get no.of null rows from a dataframe at each column

In [257]:
# Reload the employee file with inferred column types; the literal text
# 'null' in the file is read as a real null value.
null_df = (
    spark.read.format('csv')
    .options(header=True, inferSchema=True, nullValue='null')
    .load('/content/employee.csv')
)

null_df.show()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250| 600|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850| 500|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|1200|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100| 100|    20|  02-02-2022|
| 7900

In [264]:
# Null values in individual columns

# Rows where COMM is missing, followed by how many there are
comm = null_df.where(col('COMM').isNull())
comm.show()
print(comm.count())

# Rows where MGR is missing, followed by how many there are
mgr = null_df.where(col('MGR').isNull())
mgr.show()
print(mgr.count())

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  03-02-2022|
| 7934|MILLER|    CLERK|7782|01-03-1982|1300|null|    10|  05-02-2022|
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  07-02-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  01-02-2021|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  03-02-2021|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  05-02-2021|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  08-02-2021|
| 7902

In [None]:
# Number of nulls in every column, shown as a one-row summary.
# when(...) yields a value only for null cells and count() ignores the rest,
# so each aggregate is the null count of its column.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in null_df.columns
]

null_df.select(null_counts).show()


+-----+-----+---+---+--------+---+----+------+------------+
|EMPNO|ENAME|JOB|MGR|HIREDATE|SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+-----+---+---+--------+---+----+------+------------+
|    3|    3|  3|  5|       7|  3|  15|     3|           6|
+-----+-----+---+---+--------+---+----+------+------------+



# Read csv file with sep

In [111]:
# Pipe-separated file: pass the separator explicitly, infer the column types
# and take the column names from the first line.
du_df = (
    spark.read.format('csv')
    .options(sep='|', inferSchema=True, header=True)
    .load('/content/emp_pipe.txt')
)
du_df.show()
du_df.printSchema()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  02-02-2022|
| 7900

# how to handle double delimiter files

In [None]:
# Double-delimiter file: the fields are separated by the two-character
# sequence '||', which is given to the reader as the separator.
du_df = (
    spark.read.format('csv')
    .options(sep='||', header=True)
    .load('/content/emp_double_pipe.txt')
)
du_df.show()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  01-01-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  02-01-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|  03-01-2022|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  04-01-2022|
| 7654|MARTIN| SALESMAN|7698|21-09-1981|1250|1400|    30|  05-01-2022|
| 7698|   SGR|  MANAGER|7839|05-01-1981|2850|null|    30|  06-01-2022|
| 7782|  RAVI|  MANAGER|7839|06-09-1981|2450|null|    10|  07-01-2022|
| 7788| SCOTT|  ANALYST|7566|19-04-1987|3000|null|    20|  08-01-2022|
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  02-02-2022|
| 7900

# how to handle multi delimiter files

In [133]:
# Multiple delimiters in one file.
# The reader is given only '||'; as the output shows, the comma-separated
# tail of each line arrives as one text value in the SAL column.
mu_df = (
    spark.read.format('csv')
    .options(sep='||', header=True)
    .load('/content/emp_multiple_delimeter.txt')
)

mu_df.show()

+-----+------+---------+----+----------+--------------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|                 SAL|
+-----+------+---------+----+----------+--------------------+
| 7839|  KING|PRESIDENT|null|      null|   5000,null,10,null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500,0,30,01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100,null,20,02-0...|
| 7900| JAMES|    CLERK|7698|12-03-1981|950,null,30,03-02...|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000,null,20,04-0...|
| 7934|MILLER|    CLERK|7782|01-03-1982|1300,null,10,05-0...|
| 1234|SEKHAR|   doctor|7777|      null|667,78,80,06-02-2022|
| 7369| SMITH|    CLERK|7902|17-12-1980|800,null,20,07-02...|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600,300,30,08-02...|
| 7521|  WARD| SALESMAN|7698|22-02-1981|    1250,500,30,null|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975,null,20,01-0...|
+-----+------+---------+----+----------+--------------------+



In [134]:
# Break the comma-joined SAL text into an array column named 'Split'.
# (An alternative spelling of the delimiter, as a character class, is '[,]'.)
sal_parts = split(col('SAL'), ',')
mu_df = mu_df.withColumn('Split', sal_parts)

mu_df.show()

+-----+------+---------+----+----------+--------------------+--------------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE|                 SAL|               Split|
+-----+------+---------+----+----------+--------------------+--------------------+
| 7839|  KING|PRESIDENT|null|      null|   5000,null,10,null|[5000, null, 10, ...|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500,0,30,01-02-2022|[1500, 0, 30, 01-...|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100,null,20,02-0...|[1100, null, 20, ...|
| 7900| JAMES|    CLERK|7698|12-03-1981|950,null,30,03-02...|[950, null, 30, 0...|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000,null,20,04-0...|[3000, null, 20, ...|
| 7934|MILLER|    CLERK|7782|01-03-1982|1300,null,10,05-0...|[1300, null, 10, ...|
| 1234|SEKHAR|   doctor|7777|      null|667,78,80,06-02-2022|[667, 78, 80, 06-...|
| 7369| SMITH|    CLERK|7902|17-12-1980|800,null,20,07-02...|[800, null, 20, 0...|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600,300,30,08-02...|[1600, 300, 30, 0...|
| 75

In [135]:
# Turn the four elements of the 'Split' array into named columns, then remove
# the array. SAL already exists and is overwritten; the other three are added.
split_targets = ['SAL', 'COMM', 'DEPTNO', 'UPDATED_DATE']
for position, target in enumerate(split_targets):
    mu_df = mu_df.withColumn(target, col('Split').getItem(position))
mu_df = mu_df.drop('Split')

mu_df.show()

+-----+------+---------+----+----------+----+----+------+------------+
|EMPNO| ENAME|      JOB| MGR|  HIREDATE| SAL|COMM|DEPTNO|UPDATED_DATE|
+-----+------+---------+----+----------+----+----+------+------------+
| 7839|  KING|PRESIDENT|null|      null|5000|null|    10|        null|
| 7844|TURNER| SALESMAN|7698|09-08-1981|1500|   0|    30|  01-02-2022|
| 7876| ADAMS|    CLERK|7788|23-05-1987|1100|null|    20|  02-02-2022|
| 7900| JAMES|    CLERK|7698|12-03-1981| 950|null|    30|  03-02-2022|
| 7902|  FORD|  ANALYST|7566|12-03-1981|3000|null|    20|  04-02-2022|
| 7934|MILLER|    CLERK|7782|01-03-1982|1300|null|    10|  05-02-2022|
| 1234|SEKHAR|   doctor|7777|      null| 667|  78|    80|  06-02-2022|
| 7369| SMITH|    CLERK|7902|17-12-1980| 800|null|    20|  07-02-2022|
| 7499| ALLEN| SALESMAN|7698|20-02-1981|1600| 300|    30|  08-02-2022|
| 7521|  WARD| SALESMAN|7698|22-02-1981|1250| 500|    30|        null|
| 7566| JONES|  MANAGER|7839|04-02-1981|2975|null|    20|  01-02-2021|
+-----

# How to get the 53rd-week years from the last 50 years

In [None]:
# One row per year from 1990 to 2049: id = the year, year = the text '01/01/<year>'.
we_df = spark.createDataFrame([(i, f"01/01/{i}") for i in range(1990, 2050)], ['id', 'year'])

# Parse the text as day/month/year. 'MM' is month-of-year; the lowercase 'mm'
# used before is minute-of-hour, and only produced the right dates because
# every input here is in January.
# weekofyear() numbers weeks per ISO 8601, so a 1 January reported as week 53
# belongs to the final week of the preceding ISO year.
we_df = (
    we_df
    .withColumn('date', to_date('year', 'dd/MM/yyyy'))
    .withColumn('week', weekofyear('date'))
    .filter('week= 53')
)
we_df.show()

+----+----------+----------+----+
|  id|      year|      date|week|
+----+----------+----------+----+
|1993|01/01/1993|1993-01-01|  53|
|1999|01/01/1999|1999-01-01|  53|
|2005|01/01/2005|2005-01-01|  53|
|2010|01/01/2010|2010-01-01|  53|
|2016|01/01/2016|2016-01-01|  53|
|2021|01/01/2021|2021-01-01|  53|
|2027|01/01/2027|2027-01-01|  53|
|2033|01/01/2033|2033-01-01|  53|
|2038|01/01/2038|2038-01-01|  53|
|2044|01/01/2044|2044-01-01|  53|
|2049|01/01/2049|2049-01-01|  53|
+----+----------+----------+----+



# Creating a dataframe

In [None]:
# Build a DataFrame from an in-memory list of tuples: one row per year from
# 1999 to 2049, with id = the year and year = the text '02/01/<year>'.
cr_df = spark.createDataFrame(
    [(yr, f"02/01/{yr}") for yr in range(1999, 2050)],
    schema=['id', 'year'],
)

cr_df.show()

+----+----------+
|  id|      year|
+----+----------+
|1999|02/01/1999|
|2000|02/01/2000|
|2001|02/01/2001|
|2002|02/01/2002|
|2003|02/01/2003|
|2004|02/01/2004|
|2005|02/01/2005|
|2006|02/01/2006|
|2007|02/01/2007|
|2008|02/01/2008|
|2009|02/01/2009|
|2010|02/01/2010|
|2011|02/01/2011|
|2012|02/01/2012|
|2013|02/01/2013|
|2014|02/01/2014|
|2015|02/01/2015|
|2016|02/01/2016|
|2017|02/01/2017|
|2018|02/01/2018|
+----+----------+
only showing top 20 rows



# How to handle/read a data file with a variable (dynamic) number of columns (file without a header)?

In [108]:
# Read every line of the file as one string column ('value'), then split each
# line on commas into the array column 'New'.
raw_lines = spark.read.format('text').load('/content/emp_without_header.txt')
df = raw_lines.withColumn('New', split(col('value'), ','))

# Largest number of fields found on any line. `max` here is the Spark
# aggregate brought in by the star import, applied to the array size.
col_size = df.select(max(size(col('New'))))

df.show(5, truncate=True)
col_size.show()
print(type(df))
df.printSchema()
     

+--------------------+--------------------+
|               value|                 New|
+--------------------+--------------------+
|7369,SMITH,CLERK,...|[7369, SMITH, CLE...|
|7499,ALLEN,SALESM...|[7499, ALLEN, SAL...|
|7521,WARD,SALESMA...|[7521, WARD, SALE...|
|7566,JONES,MANAGE...|[7566, JONES, MAN...|
|7654,MARTIN,SALES...|[7654, MARTIN, SA...|
+--------------------+--------------------+
only showing top 5 rows

+--------------+
|max(size(New))|
+--------------+
|             9|
+--------------+

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- value: string (nullable = true)
 |-- New: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [109]:
# Add one column per array position, named col0, col1, ..., up to the widest
# line found. NOTE(review): lines with fewer fields presumably get null in the
# missing positions; confirm against the Spark settings in use.
widest = col_size.first()[0]
for position in range(widest):
  df = df.withColumn(f'col{position}', col('New').getItem(position))
df.show(truncate=False)

+------------------------------------------------------------+----------------------------------------------------------------------+----+------+---------+----+----------+----+----+----+----------+
|value                                                       |New                                                                   |col0|col1  |col2     |col3|col4      |col5|col6|col7|col8      |
+------------------------------------------------------------+----------------------------------------------------------------------+----+------+---------+----+----------+----+----+----+----------+
|7369,SMITH,CLERK,7902,17-12-1980,800,null,20,01-01-2022     |[7369, SMITH, CLERK, 7902, 17-12-1980, 800, null, 20, 01-01-2022]     |7369|SMITH |CLERK    |7902|17-12-1980|800 |null|20  |01-01-2022|
|7499,ALLEN,SALESMAN,7698,20-02-1981,1600,300,30,02-01-2022  |[7499, ALLEN, SALESMAN, 7698, 20-02-1981, 1600, 300, 30, 02-01-2022]  |7499|ALLEN |SALESMAN |7698|20-02-1981|1600|300 |30  |02-01-2022|
|7521,WARD

In [110]:
# Keep only the generated colN columns: drop the raw line text and the
# intermediate array in a single call.
df1 = df.drop('value', 'New')
df1.show(truncate=False)
df1.printSchema()
print(type(df1))
     

+----+------+---------+----+----------+----+----+----+----------+
|col0|col1  |col2     |col3|col4      |col5|col6|col7|col8      |
+----+------+---------+----+----------+----+----+----+----------+
|7369|SMITH |CLERK    |7902|17-12-1980|800 |null|20  |01-01-2022|
|7499|ALLEN |SALESMAN |7698|20-02-1981|1600|300 |30  |02-01-2022|
|7521|WARD  |SALESMAN |7698|22-02-1981|1250|500 |30  |03-01-2022|
|7566|JONES |MANAGER  |7839|04-02-1981|2975|null|20  |04-01-2022|
|7654|MARTIN|SALESMAN |7698|21-09-1981|1250|1400|30  |05-01-2022|
|7698|SGR   |MANAGER  |7839|05-01-1981|2850|null|30  |06-01-2022|
|7782|RAVI  |MANAGER  |7839|06-09-1981|2450|null|10  |07-01-2022|
|7788|SCOTT |ANALYST  |7566|19-04-1987|3000|null|20  |08-01-2022|
|7839|KING  |PRESIDENT|null|null      |5000|null|10  |null      |
|7844|TURNER|SALESMAN |7698|09-08-1981|1500|0   |30  |01-02-2022|
|7876|ADAMS |CLERK    |7788|23-05-1987|1100|null|20  |02-02-2022|
|7900|JAMES |CLERK    |7698|12-03-1981|950 |null|30  |03-02-2022|
|7902|FORD