In [1]:
from pyspark.sql import SparkSession

### Read the dataset

In [3]:
# Start (or reuse) a Spark session named "Spark-sql" running on the local master.
spark = (
    SparkSession.builder
    .appName("Spark-sql")
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Bare expression so the notebook displays the session object as the cell output.
spark

In [5]:
# Load movies.csv, using the first row as column names and letting Spark infer the column types.
df = spark.read.csv("movies.csv", header=True, inferSchema=True)

In [6]:
# Peek at the first record as a Row object.
df.first()

Row(Film='Zack and Miri Make a Porno', Genre='Romance', Lead Studio='The Weinstein Company', Audience score %=70, Profitability=1.747541667, Rotten Tomatoes %=64, Worldwide Gross='$41.94 ', Year=2008)

In [7]:
# Tabular preview of the first five rows.
df.show(n=5)

+--------------------+-------+--------------------+----------------+-------------+-----------------+---------------+----+
|                Film|  Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|
+--------------------+-------+--------------------+----------------+-------------+-----------------+---------------+----+
|Zack and Miri Mak...|Romance|The Weinstein Com...|              70|  1.747541667|               64|        $41.94 |2008|
|     Youth in Revolt| Comedy|The Weinstein Com...|              52|         1.09|               68|        $19.62 |2010|
|You Will Meet a T...| Comedy|         Independent|              35|  1.211818182|               43|        $26.66 |2010|
|        When in Rome| Comedy|              Disney|              44|          0.0|               15|        $43.04 |2010|
|What Happens in V...| Comedy|                 Fox|              72|  6.267647029|               28|       $219.37 |2008|
+--------------------+--

In [8]:
# Give the awkward column names (spaces, '%') identifiers that are easier to use in SQL queries.
df1 = (
    df
    .withColumnRenamed("Lead Studio", "lead_studio")
    .withColumnRenamed("Audience score %", "audience_score")
    .withColumnRenamed("Rotten Tomatoes %", "RT")
    .withColumnRenamed("Worldwide Gross", "Worldwide_Gross")
)


In [9]:
# Preview ten rows to confirm the new column names.
df1.show(n=10)

+--------------------+---------+--------------------+--------------+-------------+---+---------------+----+
|                Film|    Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+---------+--------------------+--------------+-------------+---+---------------+----+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|            70|  1.747541667| 64|        $41.94 |2008|
|     Youth in Revolt|   Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|   Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|   Comedy|              Disney|            44|          0.0| 15|        $43.04 |2010|
|What Happens in V...|   Comedy|                 Fox|            72|  6.267647029| 28|       $219.37 |2008|
| Water For Elephants|    Drama|    20th Century Fox|            72|  3.081421053| 60|       $117.09 |2011|
|              WALL-E|Animat

### Create a temporary view to run Spark SQL queries

In [11]:
# Register the renamed DataFrame under the view name "Movies" so spark.sql() can query it.
df1.createOrReplaceTempView(name="Movies")

In [12]:
# Print the column names and inferred types of the renamed DataFrame.
df1.printSchema()

root
 |-- Film: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- lead_studio: string (nullable = true)
 |-- audience_score: integer (nullable = true)
 |-- Profitability: double (nullable = true)
 |-- RT: integer (nullable = true)
 |-- Worldwide_Gross: string (nullable = true)
 |-- Year: integer (nullable = true)



In [15]:
# Project three columns from the Movies view and preview ten rows.
film_ratings = spark.sql("select Film, Genre, RT from Movies")
film_ratings.show(10)

+--------------------+---------+---+
|                Film|    Genre| RT|
+--------------------+---------+---+
|Zack and Miri Mak...|  Romance| 64|
|     Youth in Revolt|   Comedy| 68|
|You Will Meet a T...|   Comedy| 43|
|        When in Rome|   Comedy| 15|
|What Happens in V...|   Comedy| 28|
| Water For Elephants|    Drama| 60|
|              WALL-E|Animation| 96|
|            Waitress|  Romance| 89|
| Waiting For Forever|  Romance|  6|
|     Valentine's Day|   Comedy| 17|
+--------------------+---------+---+
only showing top 10 rows



### Use filter conditions

In [17]:
# Drop fully duplicated rows, then preview ten; the row order differs from the plain df1.show() output.
df1.distinct().show(n=10)

+--------------------+-------+------------+--------------+-------------+---+---------------+----+
|                Film|  Genre| lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+-------+------------+--------------+-------------+---+---------------+----+
|    The Back-up Plan| Comedy|         CBS|            47|  2.202571429| 20|        $77.09 |2010|
|    Music and Lyrics|Romance|Warner Bros.|            70|   3.64741055| 63|       $145.90 |2007|
|         The Duchess|  Drama|   Paramount|            68|  3.207850222| 60|        $43.31 |2008|
|  Something Borrowed|Romance| Independent|            48|  1.719514286| 15|        $60.18 |2011|
|Sex and the City Two| Comedy|Warner Bros.|            49|       2.8835| 15|       $288.35 |2010|
|He's Just Not Tha...| Comedy|Warner Bros.|            60|       7.1536| 42|       $178.84 |2009|
|     Valentine's Day| Comedy|Warner Bros.|            54|  4.184038462| 17|       $217.57 |2010|
|     No Reservation

In [26]:
# Select a single column from the Movies view and preview three rows.
studios = spark.sql("select lead_studio from Movies")
studios.show(3)

+--------------------+
|         lead_studio|
+--------------------+
|The Weinstein Com...|
|The Weinstein Com...|
|         Independent|
+--------------------+
only showing top 3 rows



In [34]:
# Keep only the rows whose Genre is exactly "Comedy".
is_comedy = df1["Genre"] == "Comedy"
df1.filter(is_comedy).show(5)

+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|                Film| Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|     Youth in Revolt|Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|Comedy|              Disney|            44|          0.0| 15|        $43.04 |2010|
|What Happens in V...|Comedy|                 Fox|            72|  6.267647029| 28|       $219.37 |2008|
|     Valentine's Day|Comedy|        Warner Bros.|            54|  4.184038462| 17|       $217.57 |2010|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
only showing top 5 rows



In [35]:
# Keep the rows whose audience score is anything other than 70.
df1.filter(df1["audience_score"] != 70).show(n=5)

+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|                Film| Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|     Youth in Revolt|Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|Comedy|              Disney|            44|          0.0| 15|        $43.04 |2010|
|What Happens in V...|Comedy|                 Fox|            72|  6.267647029| 28|       $219.37 |2008|
| Water For Elephants| Drama|    20th Century Fox|            72|  3.081421053| 60|       $117.09 |2011|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
only showing top 5 rows



In [42]:
# Both predicates must hold; Column conditions are combined with &, not `and`.
is_comedy = df1.Genre == "Comedy"
is_fox = df1.lead_studio == "Fox"
df1.filter(is_comedy & is_fox).show(5)

+--------------------+------+-----------+--------------+-------------+---+---------------+----+
|                Film| Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+-----------+--------------+-------------+---+---------------+----+
|What Happens in V...|Comedy|        Fox|            72|  6.267647029| 28|       $219.37 |2008|
|       Marley and Me|Comedy|        Fox|            77|  3.746781818| 63|       $206.07 |2008|
|  Love & Other Drugs|Comedy|        Fox|            55|  1.817666667| 48|        $54.53 |2010|
|         Just Wright|Comedy|        Fox|            58|  1.797416667| 45|        $21.57 |2010|
|          27 Dresses|Comedy|        Fox|            71|    5.3436218| 40|       $160.31 |2008|
+--------------------+------+-----------+--------------+-------------+---+---------------+----+



### Filter with `startswith` — general form: `df.filter(df.state.startswith("N")).show()`

In [44]:
# Rows whose Genre begins with the letter "C".
genre_starts_with_c = df1["Genre"].startswith("C")
df1.filter(genre_starts_with_c).show(4)

+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|                Film| Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|     Youth in Revolt|Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|Comedy|              Disney|            44|          0.0| 15|        $43.04 |2010|
|What Happens in V...|Comedy|                 Fox|            72|  6.267647029| 28|       $219.37 |2008|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
only showing top 4 rows



### Filter with `contains`
General form: `df.filter(df.state.contains("H")).show()`

In [45]:
# Rows whose Genre contains an upper-case "H" (the match is case-sensitive).
# The condition is built from df1 — the DataFrame being filtered — rather than
# from the pre-rename df, so the cell does not rely on the two frames sharing lineage.
df1.filter(df1.Genre.contains("H")).show(3)

+----+-----+-----------+--------------+-------------+---+---------------+----+
|Film|Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+----+-----+-----------+--------------+-------------+---+---------------+----+
+----+-----+-----------+--------------+-------------+---+---------------+----+



### PySpark filter with `like` and `rlike`

In [47]:
# SQL LIKE match on Genre; % matches any run of characters.
# NOTE(review): "%Cm%" matches no row (see the empty result) — "%Com%" may have been intended; confirm.
genre_like = df1["Genre"].like("%Cm%")
df1.filter(genre_like).show()

+----+-----+-----------+--------------+-------------+---+---------------+----+
|Film|Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+----+-----+-----------+--------------+-------------+---+---------------+----+
+----+-----+-----------+--------------+-------------+---+---------------+----+



### Filter with multiple conditions using `pyspark.sql.functions`

`from pyspark.sql import functions as f`
 

In [49]:
from pyspark.sql import functions as f

# Comedy-and-Fox filter built from f.col() references instead of DataFrame attributes.
genre_is_comedy = f.col("Genre") == "Comedy"
studio_is_fox = f.col("lead_studio") == "Fox"
df1.filter(genre_is_comedy & studio_is_fox).show()

+--------------------+------+-----------+--------------+-------------+---+---------------+----+
|                Film| Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+-----------+--------------+-------------+---+---------------+----+
|What Happens in V...|Comedy|        Fox|            72|  6.267647029| 28|       $219.37 |2008|
|       Marley and Me|Comedy|        Fox|            77|  3.746781818| 63|       $206.07 |2008|
|  Love & Other Drugs|Comedy|        Fox|            55|  1.817666667| 48|        $54.53 |2010|
|         Just Wright|Comedy|        Fox|            58|  1.797416667| 45|        $21.57 |2010|
|          27 Dresses|Comedy|        Fox|            71|    5.3436218| 40|       $160.31 |2008|
+--------------------+------+-----------+--------------+-------------+---+---------------+----+

