In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build a Spark session for this notebook, or reuse one that already exists
# (getOrCreate), running against the local master "local[*]".
spark = (
    SparkSession.builder
    .appName("Filter Operations")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Load the movies dataset; the first line of the CSV supplies the column names.
# No schema is given or inferred here, so columns keep Spark's default CSV typing.
df = spark.read.option("header", True).csv("movies.csv")

In [4]:
# Preview the first 10 rows of the raw data.
df.show(n=10)

+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|                Film|    Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|
+--------------------+---------+--------------------+----------------+-------------+-----------------+---------------+----+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|              70|  1.747541667|               64|        $41.94 |2008|
|     Youth in Revolt|   Comedy|The Weinstein Com...|              52|         1.09|               68|        $19.62 |2010|
|You Will Meet a T...|   Comedy|         Independent|              35|  1.211818182|               43|        $26.66 |2010|
|        When in Rome|   Comedy|              Disney|              44|            0|               15|        $43.04 |2010|
|What Happens in V...|   Comedy|                 Fox|              72|  6.267647029|               28|       $219.37 |2008|
| Water 

In [5]:
# Rename the columns whose names contain spaces or '%' so they can be
# referenced as plain attributes (df1.lead_studio) and in SQL expressions.
df1 = (
    df
    .withColumnRenamed("Lead Studio", "lead_studio")
    .withColumnRenamed("Audience score %", "audience_score")
    .withColumnRenamed("Rotten Tomatoes %", "RT")
    .withColumnRenamed("Worldwide Gross", "Worldwide_Gross")
)

### Filter Conditions

In [16]:
# Keep only the rows whose Genre starts with "R".
# show() prints the rows and returns None, so the filtered DataFrame is bound
# first and displayed in a separate step; otherwise df_filter1 would be None.
df_filter1 = df.filter(df.Genre.startswith("R"))
df_filter1.show(5)

+--------------------+-------+--------------------+----------------+-------------+-----------------+---------------+----+
|                Film|  Genre|         Lead Studio|Audience score %|Profitability|Rotten Tomatoes %|Worldwide Gross|Year|
+--------------------+-------+--------------------+----------------+-------------+-----------------+---------------+----+
|Zack and Miri Mak...|Romance|The Weinstein Com...|              70|  1.747541667|               64|        $41.94 |2008|
|            Waitress|Romance|         Independent|              67|   11.0897415|               89|        $22.18 |2007|
| Waiting For Forever|Romance|         Independent|              53|        0.005|                6|         $0.03 |2011|
|Tyler Perry's Why...|Romance|         Independent|              47|    3.7241924|               46|        $55.86 |2007|
|Twilight: Breakin...|Romance|         Independent|              68|  6.383363636|               26|       $702.17 |2011|
+--------------------+--

In [17]:
# Same Genre filter, applied to the renamed frame.
# The predicate references the column through df1 (the frame being filtered)
# rather than through the original df, so it does not rely on Spark resolving
# a column from a different DataFrame object.
# show() returns None, so the DataFrame is bound before it is displayed.
df_filter2 = df1.filter(df1.Genre.startswith("R"))
df_filter2.show(5)

+--------------------+-------+--------------------+--------------+-------------+---+---------------+----+
|                Film|  Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+-------+--------------------+--------------+-------------+---+---------------+----+
|Zack and Miri Mak...|Romance|The Weinstein Com...|            70|  1.747541667| 64|        $41.94 |2008|
|            Waitress|Romance|         Independent|            67|   11.0897415| 89|        $22.18 |2007|
| Waiting For Forever|Romance|         Independent|            53|        0.005|  6|         $0.03 |2011|
|Tyler Perry's Why...|Romance|         Independent|            47|    3.7241924| 46|        $55.86 |2007|
|Twilight: Breakin...|Romance|         Independent|            68|  6.383363636| 26|       $702.17 |2011|
+--------------------+-------+--------------------+--------------+-------------+---+---------------+----+
only showing top 5 rows



In [20]:
# Chain two filters: Genre starts with "R" AND lead_studio starts with "I".
# show() returns None, so the DataFrame is bound before it is displayed.
df_filter2 = (
    df1
    .filter(df1.Genre.startswith("R"))
    .filter(df1.lead_studio.startswith("I"))
)
df_filter2.show(5)

+--------------------+-------+-----------+--------------+-------------+---+---------------+----+
|                Film|  Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+-------+-----------+--------------+-------------+---+---------------+----+
|            Waitress|Romance|Independent|            67|   11.0897415| 89|        $22.18 |2007|
| Waiting For Forever|Romance|Independent|            53|        0.005|  6|         $0.03 |2011|
|Tyler Perry's Why...|Romance|Independent|            47|    3.7241924| 46|        $55.86 |2007|
|Twilight: Breakin...|Romance|Independent|            68|  6.383363636| 26|       $702.17 |2011|
|  Something Borrowed|Romance|Independent|            48|  1.719514286| 15|        $60.18 |2011|
+--------------------+-------+-----------+--------------+-------------+---+---------------+----+
only showing top 5 rows



In [26]:
# Three chained filters: Genre starts with "R", lead_studio starts with "I",
# and audience_score equals 67.
# show() returns None, so the DataFrame is bound before it is displayed.
df_filter3 = (
    df1
    .filter(df1.Genre.startswith("R"))
    .filter(df1.lead_studio.startswith("I"))
    .filter(df1.audience_score == 67)
)
df_filter3.show(5)

+--------+-------+-----------+--------------+-------------+---+---------------+----+
|    Film|  Genre|lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------+-------+-----------+--------------+-------------+---+---------------+----+
|Waitress|Romance|Independent|            67|   11.0897415| 89|        $22.18 |2007|
+--------+-------+-----------+--------------+-------------+---+---------------+----+



In [31]:
# Rows whose Year equals "2010" (exact match).
# show() returns None, so the DataFrame is bound before it is displayed.
df_filter_date = df1.filter(df1.Year == "2010")
df_filter_date.show(5)

+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|                Film| Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|     Youth in Revolt|Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|Comedy|              Disney|            44|            0| 15|        $43.04 |2010|
|     Valentine's Day|Comedy|        Warner Bros.|            54|  4.184038462| 17|       $217.57 |2010|
|    The Back-up Plan|Comedy|                 CBS|            47|  2.202571429| 20|        $77.09 |2010|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
only showing top 5 rows



In [33]:
# Rows whose Year starts with "2010" (prefix match on the Year value).
# show() returns None, so the DataFrame is bound before it is displayed.
df_filter_date = df1.filter(df1.Year.startswith("2010"))
df_filter_date.show(5)

+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|                Film| Genre|         lead_studio|audience_score|Profitability| RT|Worldwide_Gross|Year|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
|     Youth in Revolt|Comedy|The Weinstein Com...|            52|         1.09| 68|        $19.62 |2010|
|You Will Meet a T...|Comedy|         Independent|            35|  1.211818182| 43|        $26.66 |2010|
|        When in Rome|Comedy|              Disney|            44|            0| 15|        $43.04 |2010|
|     Valentine's Day|Comedy|        Warner Bros.|            54|  4.184038462| 17|       $217.57 |2010|
|    The Back-up Plan|Comedy|                 CBS|            47|  2.202571429| 20|        $77.09 |2010|
+--------------------+------+--------------------+--------------+-------------+---+---------------+----+
only showing top 5 rows

