# Creating SparkSession

In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('Pysparkdataframes').getOrCreate()

In [0]:
data = spark.read.csv(r"/FileStore/tables/pysparklearning/Train.csv", header= True)

In [0]:
data.show(truncate = False)

+---+------------+
|Day|GrocerySales|
+---+------------+
|1  |8418.59     |
|2  |8447.47     |
|3  |8336.49     |
|4  |8579.17     |
|5  |8524.31     |
|6  |8623.5      |
|7  |8320.11     |
|8  |8313.53     |
|9  |8461.34     |
|10 |8497.24     |
|11 |8674.65     |
|12 |8354.63     |
|13 |8526.67     |
|14 |8767.31     |
|15 |8689.53     |
|16 |8623.62     |
|17 |8527.89     |
|18 |8450.51     |
|19 |8629.84     |
|20 |8840.12     |
+---+------------+
only showing top 20 rows



In [0]:
type(data.select('Day','GrocerySales'))
# data.select(['Day','GrocerySales']).show(2)
# so if we using .select() then it is returning as a dataframe else if we simply using the data['col'] , it is taking that as a column

+---+------------+
|Day|GrocerySales|
+---+------------+
|  1|     8418.59|
|  2|     8447.47|
+---+------------+
only showing top 2 rows



In [0]:
data.printSchema()

root
 |-- Day: string (nullable = true)
 |-- GrocerySales: string (nullable = true)



# Building your own schema type

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType
custom_schema = [StructField('Day', IntegerType(),True),
                 StructField('GrocerySales', FloatType(), True)]
final_schema = StructType(fields = custom_schema)


In [0]:
data.describe().show()


+-------+------------------+-----------------+
|summary|               Day|     GrocerySales|
+-------+------------------+-----------------+
|  count|               692|              692|
|   mean|             346.5|8564.733470592795|
| stddev|199.90747859947612|428.8156683024209|
|    min|                 1|          6766.37|
|    max|                99|          9290.02|
+-------+------------------+-----------------+



In [0]:
data.filter(data['GrocerySales'] > 8500).show()

+---+------------+
|Day|GrocerySales|
+---+------------+
|  4|     8579.17|
|  5|     8524.31|
|  6|      8623.5|
| 11|     8674.65|
| 13|     8526.67|
| 14|     8767.31|
| 15|     8689.53|
| 16|     8623.62|
| 17|     8527.89|
| 19|     8629.84|
| 20|     8840.12|
| 21|     8768.17|
| 22|     8891.15|
| 23|      8851.8|
| 24|     8734.27|
| 25|     8719.43|
| 32|     8542.81|
| 33|     8544.43|
| 54|     8850.22|
| 55|      8818.8|
+---+------------+
only showing top 20 rows



In [0]:
data.tail(5)

Out[21]: [Row(Day='688', GrocerySales='8848.98'),
 Row(Day='689', GrocerySales='8800.45'),
 Row(Day='690', GrocerySales='9197.99'),
 Row(Day='691', GrocerySales='9062.44'),
 Row(Day='692', GrocerySales='8963.12')]

In [0]:
data.createOrReplaceTempView('grocerysales')

results = spark.sql("""select * from grocerysales 
                    where Day > 25""")
results.show()


+---+-----------------+
|Day|     GrocerySales|
+---+-----------------+
| 26|          8481.94|
| 27|          8383.21|
| 28|          8343.63|
| 29|          8333.17|
| 30|          8497.34|
| 31|          8326.74|
| 32|          8542.81|
| 33|          8544.43|
| 34|           8330.0|
| 35|          8005.84|
| 36|          8269.73|
| 37|          8385.14|
| 38|          8259.88|
| 39|          8411.89|
| 40|           7967.5|
| 41|7861.130000000001|
| 42|7857.489999999999|
| 43|7861.880000000001|
| 44|          7791.16|
| 45|          7870.36|
+---+-----------------+
only showing top 20 rows



# Data Manipulation
## Filtering a data frame using the filter like in pandas

In [0]:
data.columns

Out[24]: ['Day', 'GrocerySales']

In [0]:
data.filter( (data['GrocerySales'] > 8300)  & ~(data['GrocerySales'] < 8600)).show()


# data.filter(data['GrocerySales'] > np.mean(data.select(['GrocerySales']))).show()

+---+------------+
|Day|GrocerySales|
+---+------------+
|  6|      8623.5|
| 11|     8674.65|
| 14|     8767.31|
| 15|     8689.53|
| 16|     8623.62|
| 19|     8629.84|
| 20|     8840.12|
| 21|     8768.17|
| 22|     8891.15|
| 23|      8851.8|
| 24|     8734.27|
| 25|     8719.43|
| 54|     8850.22|
| 55|      8818.8|
| 57|     8759.27|
| 58|     8716.09|
| 68|     8977.75|
| 69|     8873.18|
| 70|     8668.14|
| 71|     8631.88|
+---+------------+
only showing top 20 rows



In [0]:
# we will be using collect more than the show because collect() will give the output in a list form and then we can use asDict() to convert it into a dataframe and play with it accoridingly

# creating DataFrames


- the dataframe can be easily created by a list of data points and the column names which is basically the schema
- It can also be created from RDD's(immutable) or from other datasources.
- toDF(schema) converts the RDD or anything to a DataFrame.


In [0]:
data = [('sai','reddy',23),
        ('Tri','Reddy',24)]

columns = ['FirstName', 'LastName', 'Age']

# df = spark.createDataFrame(data= data, schema = columns)

# emptyRDD = spark.sparkContext.emptyRDD()
# rdd2 = spark.sparkContext.parallelize([])

# convert empytRDD to dataframe

# df1 = emptyRDD.toDF(schema)
# df1.printSchema()

In [0]:
# Creating a RDD

rdd = spark.sparkContext.parallelize(data)

In [0]:
rdd
# We can't see what is in RDD using show action command

[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
File [0;32m<command-4470094567349576>:1[0m
[0;32m----> 1[0m [43mrdd[49m[38;5;241;43m.[39;49m[43mshow[49m()

[0;31mAttributeError[0m: 'RDD' object has no attribute 'show'

In [0]:
# we are converting the rdd to a dataframe
df1 = rdd.toDF(columns)

In [0]:
from pyspark.sql.types import StructField, StructType, IntegerType, FloatType, StringType, DoubleType, CharType, LongType

schema = StructType([
    StructField('FirstName', StringType(), True),
    StructField('LastName', StringType(), True),
    StructField('Age', IntegerType(), True)
])

In [0]:
df1 = df1.withColumn('Age', df1['Age'].cast(IntegerType()))

In [0]:
df1.printSchema()

root
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Age: integer (nullable = true)



## ToPandas() Dataframe
- we can convert the pyspark dataframe to a pandas dataframe as well, becuase it will be easy for the people who know pandas. `toPandas()`
- the difference is pyspark is faster as it uses distributed processing whereas pandas perform it on a single machine

In [0]:

df1.show()
pandas_df1 = df1.toPandas()

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      sai|   reddy| 23|
|      Tri|   Reddy| 24|
+---------+--------+---+



In [0]:
type(pandas_df1),type(df1)

Out[47]: (pandas.core.frame.DataFrame, pyspark.sql.dataframe.DataFrame)

In [0]:
pandas_df1.head()

Unnamed: 0,FirstName,LastName,Age
0,sai,reddy,23
1,Tri,Reddy,24



# StructType , StructField

- Structtype is a collection of structfield objects, where structfield holds the metadata of the objects which has columnname, columndatatype, nullable(true or false)


In [0]:
# converting a structured pyspark dataframe to pandas dataframe
from pyspark.sql.types import StructType, StructField, StringType,IntegerType
dataStruct = [(("James","","Smith"),"36636","M","3000"), \
      (("Michael","Rose",""),"40288","M","4000"), \
      (("Robert","","Williams"),"42114","M","4000"), \
      (("Maria","Anne","Jones"),"39192","F","4000"), \
      (("Jen","Mary","Brown"),"","F","-1") \
]

nested_schema = StructType([
    StructField('Name', StructType([
        StructField('FirstName' , StringType(), True),
        StructField('MiddleName' , StringType(), True),
         StructField('LastName' , StringType(), True)
    ])),
    StructField('dob' , StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', StringType(), True)
])

struct_data = spark.createDataFrame( dataStruct, nested_schema)
struct_data.printSchema()
struct_data.show()

root
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- MiddleName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

+--------------------+-----+------+------+
|                Name|  dob|gender|salary|
+--------------------+-----+------+------+
|    {James, , Smith}|36636|     M|  3000|
|   {Michael, Rose, }|40288|     M|  4000|
|{Robert, , Williams}|42114|     M|  4000|
|{Maria, Anne, Jones}|39192|     F|  4000|
|  {Jen, Mary, Brown}|     |     F|    -1|
+--------------------+-----+------+------+



In [0]:
df1.show(n=3,truncate=25,vertical=True) # shows the values in the spark dataframe vertically
# The truncate keyword generally trucates the characters in the column( by default it shows 20 characters)

-RECORD 0----------
 FirstName | sai   
 LastName  | reddy 
 Age       | 23    
-RECORD 1----------
 FirstName | Tri   
 LastName  | Reddy 
 Age       | 24    



In [0]:
# if you want to change or add struct to a dataframe
df1 = df1.withColumn('Age', df1['Age'].cast(IntegerType()))

In [0]:
df1.printSchema()

root
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- Age: integer (nullable = true)



In [0]:
# similarly we can also update existing structtype using struct
from pyspark.sql.functions import col, struct, when

updated_struct_data = struct_data.withColumn('Otherinfo',
               struct(col("dob").alias("identifier"),
                      col('gender').alias('gender'),
                      col('salary').alias('salary'),
                      when(col('salary').cast(IntegerType()) < 2000, 'Low')
                      .when(col('salary').cast(IntegerType()) < 4000, 'Medium')
                      .otherwise('High').alias('Salary_Grade')
               )).drop('id','gender','salary')

updated_struct_data.printSchema()
updated_struct_data.show(5)

root
 |-- Name: struct (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |    |-- MiddleName: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- Otherinfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- gender: string (nullable = true)
 |    |-- salary: string (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)

+--------------------+-----+--------------------+
|                Name|  dob|           Otherinfo|
+--------------------+-----+--------------------+
|    {James, , Smith}|36636|{36636, M, 3000, ...|
|   {Michael, Rose, }|40288|{40288, M, 4000, ...|
|{Robert, , Williams}|42114|{42114, M, 4000, ...|
|{Maria, Anne, Jones}|39192|{39192, F, 4000, ...|
|  {Jen, Mary, Brown}|     |      {, F, -1, Low}|
+--------------------+-----+--------------------+



In [0]:
# we can also pass ArrayType(StringType()), MapType(StringType(), StringType())

## Creating structtype from Json.
- If we have somany columns then it is definetly hard to defind the structype and if the dataformat changes every now and then, then it will be really hard.
- if we can load the sql structtype schema from the json file then it will be easy.
`df.schema.json()`


In [0]:
import json
schemaFromJson = StructType.fromJson(json.loads(schema.json))
df3 = spark.createDataFrame(
        spark.sparkContext.parallelize(structureData),schemaFromJson)
df3.printSchema()

In [0]:
# to check if the column is present in dataframe
df.schema.fieldNames.contains('colname')

# Hghly used functionalities of Pyspark

## df.Select()

In [0]:
# to see all the columns
df1.select([col for col in df1.columns]).show()
df1.select('*').show() 

# we can also select based on regex
from pyspark.sql.functions import col
df1.select(df1.colRegex("`^.*name*`")).show()


+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      sai|   reddy| 23|
|      Tri|   Reddy| 24|
+---------+--------+---+

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      sai|   reddy| 23|
|      Tri|   Reddy| 24|
+---------+--------+---+

+---------+--------+
|FirstName|LastName|
+---------+--------+
|      sai|   reddy|
|      Tri|   Reddy|
+---------+--------+



In [0]:
df1.select(df1.columns[:3]).show()
# we can also select the columns from the struct column like name.firstname, name.lastname or name.*

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      sai|   reddy| 23|
|      Tri|   Reddy| 24|
+---------+--------+---+



## df.collect()

- It is an action operation. once it is called it triggers the compute to perform the transformations

In [0]:
# It is used to retrieve all elements of the dataset to the drivernode
dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.show(truncate=False)

dataCollect = deptDF.collect()
print(dataCollect)


+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


In [0]:
dataselect = deptDF.select('*')
print(dataselect)

DataFrame[dept_name: string, dept_id: bigint]


- now if we notice the collect() action returned the data in a array to the driver node, whereas select is returning a Dataframe and we can't loop it.

- The main difference is select() is a transformation and it returns a dataframe, but collect() is an action.

In [0]:
for i in dataCollect:
    print(i)

Row(dept_name='Finance', dept_id=10)
Row(dept_name='Marketing', dept_id=20)
Row(dept_name='Sales', dept_id=30)
Row(dept_name='IT', dept_id=40)


## withColumn
-heavily used to change the datatypes of columns, 
  - change the value of an existing column.
  - create new column

In [0]:
deptDF = deptDF.withColumn('dept_id_2', col('dept_id')*100)

In [0]:
deptDF.withColumn('dept_id_2', col('dept_id')*100) # creates new dataframe rather than updating the original one

Out[84]: DataFrame[dept_name: string, dept_id: bigint, dept_id_2: bigint]

In [0]:
deptDF.withColumn('dept_id', col('dept_id')*1000).show()

+---------+-------+---------+
|dept_name|dept_id|dept_id_2|
+---------+-------+---------+
|  Finance|  10000|     1000|
|Marketing|  20000|     2000|
|    Sales|  30000|     3000|
|       IT|  40000|     4000|
+---------+-------+---------+



In [0]:
.withColumnRenamed('oldcol','nw_col')

## Filter and where

- same we use df.filter(isin(), multiple coditions , ~isin(list),startswith(), endswith(),contains(),like(), rlike() - this is like with regex)

In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import col,array_contains

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

arrayStructureData = [
        (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
        (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
        (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
        (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
        (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
        (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
        ]
        
arrayStructureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('languages', ArrayType(StringType()), True),
         StructField('state', StringType(), True),
         StructField('gender', StringType(), True)
         ])


df = spark.createDataFrame(data = arrayStructureData, schema = arrayStructureSchema)
df.printSchema()
df.show(truncate=False)

df.filter(df.state == "OH") \
    .show(truncate=False)

df.filter(col("state") == "OH") \
    .show(truncate=False)    
    
df.filter("gender  == 'M'") \
    .show(truncate=False)    

df.filter( (df.state  == "OH") & (df.gender  == "M") ) \
    .show(truncate=False)        

df.filter(array_contains(df.languages,"Java")) \
    .show(truncate=False)        

df.filter(df.name.lastname == "Williams") \
    .show(truncate=False) 

In [0]:
import time
while True:
    print('Cluster is running')
    time.sleep(40)
    break

Cluster is running


In [0]:
# Distinct() , dropDuplicates() -- removes the duplicates and gives the distinct rows. dropDuplicates also supports the 
distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)

#Drop duplicates
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)

#Drop duplicates on selected columns
dropDisDF = df.dropDuplicates(["department","salary"])
print("Distinct count of department salary : "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)

## sort() or orderBY()
-both performs the same operation so anyone can be used

In [0]:
df1.show()

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      sai|   reddy| 23|
|      Tri|   Reddy| 24|
+---------+--------+---+



In [0]:
df1.sort('FirstName','Age' , ascending = [True, False]).show()

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|      Tri|   Reddy| 24|
|      sai|   reddy| 23|
+---------+--------+---+



In [0]:
from pyspark.sql.functions import asc,desc

df1.sort(df1.FirstName.asc(),df1.Age.asc()).show(truncate=False)
df1.sort(col("FirstName").desc(),col("Age").asc()).show(truncate=False)
df1.orderBy(col("FirstName").asc(),col("Age").asc()).show(truncate=False)

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|Tri      |Reddy   |24 |
|sai      |reddy   |23 |
+---------+--------+---+

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|sai      |reddy   |23 |
|Tri      |Reddy   |24 |
+---------+--------+---+

+---------+--------+---+
|FirstName|LastName|Age|
+---------+--------+---+
|Tri      |Reddy   |24 |
|sai      |reddy   |23 |
+---------+--------+---+



## GroupBy

In [0]:
df = df1

In [0]:

simpleData = [("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  ]

schema = ["employee_name","department","state","salary","age","bonus"]
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)


df.groupBy("department").sum("salary").show(truncate=False)

df.groupBy("department").count().show(truncate=False)


df.groupBy("department","state") \
    .sum("salary","bonus") \
   .show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|Sales     |257000     |
|Finance   |351

In [0]:

df.groupBy("department") \
    .agg(sum("salary").alias("sum_salary"), \
         avg("salary").alias("avg_salary"), \
         sum("bonus").alias("sum_bonus") \
        #  max("bonus").alias("max_bonus") \
     ) \
    .show(truncate=False)
    
df.groupBy("department") \
    .agg(sum("salary").alias("sum_salary"), \
      avg("salary").alias("avg_salary"), \
      sum("bonus").alias("sum_bonus"), \
      max("bonus").alias("max_bonus")) \
    .where(col("sum_bonus") >= 50000) \
    .show(truncate=False)

# Joins
- left
- right
- full
- inner
- semi (new in pyspark)  - returns all the data in the left dataframe where there is a match in right dataframe
- anti ( New in pyspark) - similarly returns all rows in the left dataframe where there is no match in the right dataframe

In [0]:

# Prapare data 
import pyspark
from pyspark.sql import SparkSession

emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

empDF = spark.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
empDF.show(truncate=False)

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show(truncate=False)

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+-----

- This is very important because often we end up using multiple datasets so we need to master joins, it is similar to pandas mergings


In [0]:

empDF.join(deptDF , empDF.emp_dept_id == deptDF.dept_id , how = 'right').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|  null|    null|           null|       null|       null|  null|  null|    Sales|     30|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
empDF.join(deptDF , empDF.emp_dept_id == deptDF.dept_id , how = 'left').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
|     6|   Brown|              2|       2010|         50|      |    -1|     null|   null|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
empDF.join(deptDF , empDF.emp_dept_id == deptDF.dept_id , how = 'inner').show()

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|  Finance|     10|
|     3|Williams|              1|       2010|         10|     M|  1000|  Finance|     10|
|     4|   Jones|              2|       2005|         10|     F|  2000|  Finance|     10|
|     2|    Rose|              1|       2010|         20|     M|  4000|Marketing|     20|
|     5|   Brown|              2|       2010|         40|      |    -1|       IT|     40|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [0]:
empDF.join(deptDF , empDF.emp_dept_id == deptDF.dept_id , how = 'leftsemi').show() 
# gave all the records from the left which has a match in the right

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|
|     3|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|
|     2|    Rose|              1|       2010|         20|     M|  4000|
|     5|   Brown|              2|       2010|         40|      |    -1|
+------+--------+---------------+-----------+-----------+------+------+



In [0]:
empDF.join(deptDF , empDF.emp_dept_id == deptDF.dept_id , how = 'leftanti').show() 

+------+-----+---------------+-----------+-----------+------+------+
|emp_id| name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|     6|Brown|              2|       2010|         50|      |    -1|
+------+-----+---------------+-----------+-----------+------+------+



- Are joins complete without selfjoin? Damn not 


In [0]:
empDF.alias("emp1").join(empDF.alias("emp2"),\
    col("emp1.superior_emp_id") == col("emp2.emp_id"),"inner") \
    .select(col("emp1.emp_id"),col("emp1.name"), \
    col("emp2.emp_id"),\
    col("emp2.name"))\
    .show(truncate=False)

+------+--------+------+-----+
|emp_id|name    |emp_id|name |
+------+--------+------+-----+
|2     |Rose    |1     |Smith|
|3     |Williams|1     |Smith|
|4     |Jones   |2     |Rose |
|5     |Brown   |2     |Rose |
|6     |Brown   |2     |Rose |
+------+--------+------+-----+



# Union vs unionByName

- union basically appends all the data based on positions.
- unionByName appends based on column names ( they can be anywhere in the table) it also gives a keyword allowMissingColumns so that we don't get any error

In [0]:
df1 = spark.createDataFrame([[5, 2, 6]], ["col0", "col1", "col2"])
df2 = spark.createDataFrame([[6, 7, 3]], ["col1", "col2", "col3"])
df3 = df1.unionByName(df2, allowMissingColumns=True)
df3.printSchema()
df3.show()

root
 |-- col0: long (nullable = true)
 |-- col1: long (nullable = true)
 |-- col2: long (nullable = true)
 |-- col3: long (nullable = true)

+----+----+----+----+
|col0|col1|col2|col3|
+----+----+----+----+
|   5|   2|   6|null|
|null|   6|   7|   3|
+----+----+----+----+



# User Defined Functions
- most expensive operation, use it only when there is no other choice

In [0]:
from pyspark.sql.functions import udf,col
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

columns = ["Seqno","Name"]
data = [("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders")]

df = spark.createDataFrame(data=data,schema=columns)

df.show(truncate=False)


def convertCase(str):
    resStr=""
    arr = str.split(" ")
    for x in arr:
       resStr= resStr + x[0:1].upper() + x[1:len(x)] + " "
    return resStr 

convertUDF = udf(lambda z : convertCase(z), StringType())

df.select(col("Seqno"), \
    convertUDF(col("Name")).alias("Name") ) \
   .show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+




# Transform

- basically you are forming the chain of transformations

- Becasue remember pyspark has something called **lazy evaluation**, so it doesn't start executing or performing transformations until an action function is called upon it, so everytime when a transformation is run it return another RDD. 

In [0]:
simpleData = (("Java",4000,5), \
    ("Python", 4600,10),  \
    ("Scala", 4100,15),   \
    ("Scala", 4500,15),   \
    ("PHP", 3000,20),  \
  )
columns= ["CourseName", "fee", "discount"]

df = spark.createDataFrame(data = simpleData , schema = columns)
df.printSchema()
df.show()



root
 |-- CourseName: string (nullable = true)
 |-- fee: long (nullable = true)
 |-- discount: long (nullable = true)

+----------+----+--------+
|CourseName| fee|discount|
+----------+----+--------+
|      Java|4000|       5|
|    Python|4600|      10|
|     Scala|4100|      15|
|     Scala|4500|      15|
|       PHP|3000|      20|
+----------+----+--------+



In [0]:
def discounted_fee(df):
    return df.withColumn('discounted_fee' , df.fee - df.fee*df.discount/100)

df = df.transform(discounted_fee)

df.show()

+----------+----+--------+--------------+
|CourseName| fee|discount|discounted_fee|
+----------+----+--------+--------------+
|      Java|4000|       5|        3800.0|
|    Python|4600|      10|        4140.0|
|     Scala|4100|      15|        3485.0|
|     Scala|4500|      15|        3825.0|
|       PHP|3000|      20|        2400.0|
+----------+----+--------+--------------+



In [0]:
## Other ways 
from pyspark.sql.functions import transform
df.select(transform("Languages1", lambda x: upper(x)).alias("languages1")) \
  .show()

# Apply 
- basically the pyspark donot have a apply function functionality but however we can use the pandas api of python to use that
-`import pyspark.pandas as ps or pd`

In [0]:
# Apply is basically used for performing an operation over an entire dataframe

# Map
- dataframes do not have a map function, for using map first we should convert it into a RDD and then perform the operation.


In [0]:

data = [('James','Smith','M',30),
  ('Anna','Rose','F',41),
  ('Robert','Williams','M',62), 
]

columns = ["firstname","lastname","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
df.show()

rdd2 = df.rdd.map(lambda x: (x[0] +"," + x[1] , x[2], x[3]))
df = rdd2.toDF(['Name' , 'Sex', 'Salary'])
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+

+---------------+---+------+
|           Name|Sex|Salary|
+---------------+---+------+
|    James,Smith|  M|    30|
|      Anna,Rose|  F|    41|
|Robert,Williams|  M|    62|
+---------------+---+------+



In [0]:
# We can also do this by passing in a function so that if we need it we can use it anywhere in the code

def func1(x):
    firstName=x.firstname
    lastName=x.lastname
    name=firstName+","+lastName
    gender=x.gender.lower()
    salary=x.salary*2
    return (name,gender,salary)

rdd2=df.rdd.map(lambda x: func1(x))

In [0]:
data = [("Project Gutenberg’s",1),
        ("Alice’s Adventures in Wonderland",1),
        ("Project Gutenberg’s",2),
        ("Adventures in Wonderland",2),
       ("Project Gutenberg’s",3)]
rdd=spark.sparkContext.parallelize(data)
for element in rdd.collect():
    print(element)

('Project Gutenberg’s', 1)
('Alice’s Adventures in Wonderland', 1)
('Project Gutenberg’s', 2)
('Adventures in Wonderland', 2)
('Project Gutenberg’s', 3)


# flatMap
- basically same as map, but it flattens out the entire RDD or dataframe

In [0]:
rdd2 = rdd.flatMap(lambda x : (x[0].split(" "), x[1]))
for row in rdd2.collect():
    print(row)

['Project', 'Gutenberg’s']
1
['Alice’s', 'Adventures', 'in', 'Wonderland']
1
['Project', 'Gutenberg’s']
2
['Adventures', 'in', 'Wonderland']
2
['Project', 'Gutenberg’s']
3


# Random Sampling
- as a datascientist or analyst it is more often that we analyse a random sample of a population to come to some hypothesis

In [0]:
df = spark.range(100)
print(df.sample(0.1).collect())

[Row(id=2), Row(id=4), Row(id=6), Row(id=15), Row(id=17), Row(id=21), Row(id=22), Row(id=23), Row(id=25), Row(id=28), Row(id=30), Row(id=59), Row(id=64), Row(id=89), Row(id=94), Row(id=98)]


- we can't guarentee that the sample method only returns the given fraction of records


In [0]:
df.sampe(withReplacement = 'if you need repeated or duplicate values', fraction = 'fraction of data needed', seed = 'to get same results')
df.takeSample(withReplacement , num = 'number of records', seed)

# Fillna

In [0]:
df.fillna(value, subset)
# we can fill different subsets with different values as well
df.fillna(value1, subset1).fillna(value2, subset2)
df.fillna({column : vlaue , column:value})

# Pivot and Unpivot

- the pivot column distinct values will become the new columns

In [0]:
pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)