In [127]:
import pandas as pd
import findspark
findspark.init()
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName('demo').master("local[*]").getOrCreate()

In [128]:
df1 = spark.read.option("header",True).csv("C:/Users/Sudip/Desktop/pyspark-project/2010-summary.csv")

In [129]:
df1.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [130]:
df1.show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows



In [131]:
df1.take(10)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count='264'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count='69'),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count='24'),
 Row(DEST_COUNTRY_NAME='Equatorial Guinea', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count='25'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Grenada', count='54'),
 Row(DEST_COUNTRY_NAME='Costa Rica', ORIGIN_COUNTRY_NAME='United States', count='477'),
 Row(DEST_COUNTRY_NAME='Senegal', ORIGIN_COUNTRY_NAME='United States', count='29'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Marshall Islands', count='44')]

In [132]:
#we know the physcal plan by using this command
# We can apply sort function as a transformation to the previous dataframe
# 
df1.sort("count").explain()

== Physical Plan ==
*(1) Sort [count#980 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#980 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#1153]
   +- FileScan csv [DEST_COUNTRY_NAME#978,ORIGIN_COUNTRY_NAME#979,count#980] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/Sudip/Desktop/pyspark-project/2010-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:string>




In [133]:
# we can make any dataframe to table or view by using this command
df1.createOrReplaceTempView("flightData")

In [134]:
sql1 = spark.sql("select dest_country_name, count, origin_country_name from flightData")

In [135]:
sql1.show(5)

+-----------------+-----+-------------------+
|dest_country_name|count|origin_country_name|
+-----------------+-----+-------------------+
|    United States|    1|            Romania|
|    United States|  264|            Ireland|
|    United States|   69|              India|
|            Egypt|   24|      United States|
|Equatorial Guinea|    1|      United States|
+-----------------+-----+-------------------+
only showing top 5 rows



In [136]:
# Now we are good to use our sql skills while doing transformations as per our business logic
spark.sql("select dest_country_name from flightData where dest_country_name='Egypt'").show()


+-----------------+
|dest_country_name|
+-----------------+
|            Egypt|
+-----------------+



In [137]:
spark.sql("select count(dest_country_name) from flightData").show()

+------------------------+
|count(dest_country_name)|
+------------------------+
|                     255|
+------------------------+



In [138]:
spark.sql("select dest_country_name, count(1) from flightData Group By dest_country_name").show(5)

+-----------------+--------+
|dest_country_name|count(1)|
+-----------------+--------+
|         Anguilla|       1|
|           Russia|       1|
|         Paraguay|       1|
|          Senegal|       1|
|           Sweden|       1|
+-----------------+--------+
only showing top 5 rows



In [139]:
spark.sql("select * from flightData").show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows



In [140]:
spark.sql("select dest_country_name, rank() over(partition by dest_country_name order by count) rank \
from flightData").show(5)

+-----------------+----+
|dest_country_name|rank|
+-----------------+----+
|         Anguilla|   1|
|         Paraguay|   1|
|           Russia|   1|
|          Senegal|   1|
|           Sweden|   1|
+-----------------+----+
only showing top 5 rows



In [141]:
spark.sql("select *, \
    row_number() over(order by count) Row_number \
    from flightData").show(5)

+--------------------+-------------------+-----+----------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Row_number|
+--------------------+-------------------+-----+----------+
|       United States|            Romania|    1|         1|
|   Equatorial Guinea|      United States|    1|         2|
|               Malta|      United States|    1|         3|
|Saint Vincent and...|      United States|    1|         4|
|            Slovakia|      United States|    1|         5|
+--------------------+-------------------+-----+----------+
only showing top 5 rows



In [142]:
spark.sql("select *, \
    dense_rank() over(partition by origin_country_name order by count) Dense_Rank \
    from flightData").show(5)

+-----------------+-------------------+-----+----------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|Dense_Rank|
+-----------------+-------------------+-----+----------+
|    United States|           Anguilla|   20|         1|
|    United States|             Russia|  156|         1|
|    United States|            Senegal|   46|         1|
|    United States|             Sweden|   73|         1|
|    United States|           Kiribati|   18|         1|
+-----------------+-------------------+-----+----------+
only showing top 5 rows



In [143]:
 #in Python in oder to query aggregate function we need to import
from pyspark.sql.functions import max
spark.sql("select max(count) from flightData group by dest_country_name").show(5)


+----------+
|max(count)|
+----------+
|        21|
|        90|
|       152|
|        29|
|        65|
+----------+
only showing top 5 rows



In [144]:
from pyspark.sql.functions import max, min, count, avg

In [145]:
spark.sql("select min(count) from flightData").show()

+----------+
|min(count)|
+----------+
|         1|
+----------+



In [146]:
spark.sql("select max(count) from flightData").show()

+----------+
|max(count)|
+----------+
|       995|
+----------+



In [147]:
spark.sql("select avg(count) from flightData").show()

+--------------------------+
|avg(CAST(count AS DOUBLE))|
+--------------------------+
|         1655.956862745098|
+--------------------------+



In [148]:
spark.sql("select round(avg(count)) from flightData").show()

+------------------------------------+
|round(avg(CAST(count AS DOUBLE)), 0)|
+------------------------------------+
|                              1656.0|
+------------------------------------+



In [149]:
spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as dest_total
FROM flightData
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""").show()


+-----------------+----------+
|DEST_COUNTRY_NAME|dest_total|
+-----------------+----------+
|    United States|  384932.0|
|           Canada|    8271.0|
|           Mexico|    6200.0|
|   United Kingdom|    1629.0|
|          Germany|    1392.0|
+-----------------+----------+



### Working with pyspark

In [150]:
#select only one column in pyspark

df1.select(df1["dest_country_name"]).limit(5).show(5)

+-----------------+
|dest_country_name|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|Equatorial Guinea|
+-----------------+



In [None]:
df = spark.read.format("csv").option("header", True) \
    .option("inferSchema", True) \
    .load("C:/Users/Sudip/Desktop/pyspark-project/*.csv")