In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, struct

# Path of the departure-delays CSV that the rest of the notebook works on.
flights_path = "/databricks-datasets/flights/departuredelays.csv"

# Load the file into a DataFrame: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    flights_path,
    header=True,
    inferSchema=True,
)

# Quick look at the data: the first 20 rows, summary statistics, and finally the
# total row count (the last expression of the cell, so the notebook displays it).
df.show(n=20)
df.describe().show()
df.count()



+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1020600|   -8|     369|   ABE|        DTW|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
|1061215|   -6|     602|   ABE|        ATL|
|1061725|   69|     602|   ABE|        ATL|
|1061230|    0|     369|   ABE|        DTW|
|1060625|   -3|     602|   ABE|        ATL|
|1070600|    0|     369|   ABE|        DTW|
|1071725|    0|     602|   ABE|        ATL|
|1071230|    0|     369|   ABE|        DTW|
|1070625|    0|     602|   ABE|        ATL|
|1071219|    0|     569|   ABE|        ORD|
|1080600|    0|     369|   ABE| 

1391578

In [0]:
# Filtering
# filter(): keep only the rows for which the condition is true --
# here, rows whose distance is greater than 500 -- and count them.
(
    df
    .filter(col("distance") > 500)
    .count()
)



775031

In [0]:
# between(): range filter -- keeps the rows whose distance lies between
# 1000 and 1500 (both bounds included) and counts them.
(
    df
    .filter(col("distance").between(1000, 1500))
    .count()
)

155093

In [0]:
# where(): alias of filter() -- counts the rows whose delay equals 5.
(
    df
    .where(col("delay") == 5)
    .count()
)

21979

In [0]:
# isin(): match a column against a list of values -- counts the rows whose
# destination is either "ATL" or "DTW".
(
    df
    .filter(col("destination").isin(["ATL", "DTW"]))
    .count()
)

113744

In [0]:
# like(): SQL LIKE pattern match (case-sensitive); "%" is the wildcard.
#   "A%" -> destinations that start with "A"
#   "%W" -> destinations that end with "W"
(
    df
    .filter(col("destination").like("A%"))
    .show()
)
(
    df
    .filter(col("destination").like("%W"))
    .show()
)

# rlike(): regular-expression match.
#   "^(A|D|O)" -> destinations that start with A, D or O
#   "L"        -> destinations that contain an "L" anywhere
(
    df
    .filter(col("destination").rlike("^(A|D|O)"))
    .show()
)
(
    df
    .filter(col("destination").rlike("L"))
    .show()
)



+-------+-----+--------+------+-----------+
|   date|delay|distance|origin|destination|
+-------+-----+--------+------+-----------+
|1011245|    6|     602|   ABE|        ATL|
|1021245|   -2|     602|   ABE|        ATL|
|1020605|   -4|     602|   ABE|        ATL|
|1031245|   -4|     602|   ABE|        ATL|
|1030605|    0|     602|   ABE|        ATL|
|1041243|   10|     602|   ABE|        ATL|
|1040605|   28|     602|   ABE|        ATL|
|1051245|   88|     602|   ABE|        ATL|
|1050605|    9|     602|   ABE|        ATL|
|1061215|   -6|     602|   ABE|        ATL|
|1061725|   69|     602|   ABE|        ATL|
|1060625|   -3|     602|   ABE|        ATL|
|1071725|    0|     602|   ABE|        ATL|
|1070625|    0|     602|   ABE|        ATL|
|1080625|    1|     602|   ABE|        ATL|
|1091215|   43|     602|   ABE|        ATL|
|1091725|    0|     602|   ABE|        ATL|
|1090625|    8|     602|   ABE|        ATL|
|1101215|   -5|     602|   ABE|        ATL|
|1101725|    7|     602|   ABE| 

In [0]:
# when(): if/else-style conditional expression, used here to add a new column.
from pyspark.sql.functions import when

# Number of distinct distance values. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed, so the
# count would otherwise be computed and then thrown away.
print(df.select("distance").distinct().count())

# Add a "distance_group" column and show the result:
#   distance < 500                    -> "Near"
#   distance < 1000 (but not < 500)   -> "Medium"
#   anything else                     -> "Far"
# The branches are tested in order, so the first matching one wins.
df.withColumn(
    "distance_group",
    when(col("distance") < 500, "Near")
    .when(col("distance") < 1000, "Medium")
    .otherwise("Far")
).show()

# CASE WHEN written as a SQL expression string via expr()
from pyspark.sql.functions import expr

# Add a "delay_group" column and show the result:
#   delay < 0                     -> 'Good'
#   delay < 50 (but not < 0)      -> 'Not Bad'
#   anything else                 -> 'Bad'
(
    df
    .withColumn(
        "delay_group",
        expr("CASE WHEN delay<0 THEN 'Good' WHEN delay < 50 THEN 'Not Bad' ELSE 'Bad' END"),
    )
    .show()
)



+-------+-----+--------+------+-----------+--------------+
|   date|delay|distance|origin|destination|distance_group|
+-------+-----+--------+------+-----------+--------------+
|1011245|    6|     602|   ABE|        ATL|        Medium|
|1020600|   -8|     369|   ABE|        DTW|          Near|
|1021245|   -2|     602|   ABE|        ATL|        Medium|
|1020605|   -4|     602|   ABE|        ATL|        Medium|
|1031245|   -4|     602|   ABE|        ATL|        Medium|
|1030605|    0|     602|   ABE|        ATL|        Medium|
|1041243|   10|     602|   ABE|        ATL|        Medium|
|1040605|   28|     602|   ABE|        ATL|        Medium|
|1051245|   88|     602|   ABE|        ATL|        Medium|
|1050605|    9|     602|   ABE|        ATL|        Medium|
|1061215|   -6|     602|   ABE|        ATL|        Medium|
|1061725|   69|     602|   ABE|        ATL|        Medium|
|1061230|    0|     369|   ABE|        DTW|          Near|
|1060625|   -3|     602|   ABE|        ATL|        Mediu