## Task

- Which year had the least number of movies produced?

- Sort titles alphabetically.

- How many movies were released in 2018?

- When was the earliest release date for a movie in the dataset?

- Average rating of movies directed by Tarantino (assuming "Tarantino" is a director in the dataset).

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, FloatType

# Initialize Spark session
spark = SparkSession.builder.appName("Task_lab7").getOrCreate()

# Define the schema for the DataFrame
schema = StructType([
    StructField("title", StringType(), nullable=False),
    StructField("director", StringType(), nullable=False),
    StructField("rating", FloatType(), nullable=False),
    StructField("year", IntegerType(), nullable=False),
    StructField("country_origin", StringType(), nullable=False),
])

# Sample data
# Combined movie data (including existing, additional, and new movies) with "release_year" and "country"
data = [
    ("Pulp Fiction", "Quentin Tarantino", 8.9, 1994, "USA"),
    ("Schindler's List", "Steven Spielberg", 8.9, 1993, "USA"),
    ("Inception", "Christopher Nolan", 8.8, 2010, "USA"),
    ("Forrest Gump", "Robert Zemeckis", 8.8, 1994, "USA"),
    ("Fight Club", "David Fincher", 8.8, 1999, "USA"),
    ("The Matrix", "The Wachowskis", 8.7, 1999, "USA"),
    ("Margin Call", "J.C. Chandor", 7.1, 2011, "USA"),
    ("The Godfather", "Francis Ford Coppola", 9.2, 1972, "USA"),
    ("The Dark Knight", "Christopher Nolan", 9.0, 2008, "USA"),
    ("Gladiator", "Ridley Scott", 8.5, 2000, "USA"),
    ("Moneyball", "Bennett Miller", 7.6, 2011, "USA"),
    ("Kill Bill", "Quentin Tarantino", 8.1, 2003, "USA"),
    ("Argo", "Ben Affleck", 7.7, 2012, "USA"),
    ("Dune", "Denis Villeneuve", 8.3, 2021, "USA"),
    ("Blade Runner", "Ridley Scott", 8.1, 1982, "USA"),
    ("The Big Short", "Adam McKay", 7.8, 2015, "USA"),
    ("The Shawshank Redemption", "Frank Darabont", 9.3, 1994, "USA"),
    ("Reservoir Dogs", "Quentin Tarantino", 8.3, 1992, "USA"),
    ("Inglourious Basterds", "Quentin Tarantino", 8.3, 2009, "USA"),
    ("The Social Network", "David Fincher", 7.7, 2010, "USA"),
    ("The Hateful Eight", "Quentin Tarantino", 7.8, 2015, "USA"),
    ("Jackie Brown", "Quentin Tarantino", 7.5, 1997, "USA")
]


# Create the DataFrame
df_task = spark.createDataFrame(data, schema=schema)


In [2]:
df_task.groupby('year').count().orderBy('count').show()

+----+-----+
|year|count|
+----+-----+
|1972|    1|
|2000|    1|
|2008|    1|
|1993|    1|
|2003|    1|
|1997|    1|
|1982|    1|
|2012|    1|
|2009|    1|
|1992|    1|
|2021|    1|
|2010|    2|
|2011|    2|
|1999|    2|
|2015|    2|
|1994|    3|
+----+-----+



In [3]:
df_task.orderBy('title').show()

+--------------------+--------------------+------+----+--------------+
|               title|            director|rating|year|country_origin|
+--------------------+--------------------+------+----+--------------+
|                Argo|         Ben Affleck|   7.7|2012|           USA|
|        Blade Runner|        Ridley Scott|   8.1|1982|           USA|
|                Dune|    Denis Villeneuve|   8.3|2021|           USA|
|          Fight Club|       David Fincher|   8.8|1999|           USA|
|        Forrest Gump|     Robert Zemeckis|   8.8|1994|           USA|
|           Gladiator|        Ridley Scott|   8.5|2000|           USA|
|           Inception|   Christopher Nolan|   8.8|2010|           USA|
|Inglourious Basterds|   Quentin Tarantino|   8.3|2009|           USA|
|        Jackie Brown|   Quentin Tarantino|   7.5|1997|           USA|
|           Kill Bill|   Quentin Tarantino|   8.1|2003|           USA|
|         Margin Call|        J.C. Chandor|   7.1|2011|           USA|
|     

In [4]:
df_task.where(df_task.year == 2018).count()

0

In [5]:
df_task.orderBy('year').show(1)

+-------------+--------------------+------+----+--------------+
|        title|            director|rating|year|country_origin|
+-------------+--------------------+------+----+--------------+
|The Godfather|Francis Ford Coppola|   9.2|1972|           USA|
+-------------+--------------------+------+----+--------------+
only showing top 1 row



In [6]:
df_task.where(df_task.director == 'Quentin Tarantino').agg({'rating': 'avg'}).show()

+-----------------+
|      avg(rating)|
+-----------------+
|8.150000095367432|
+-----------------+



In [7]:
df_task.groupBy('director').agg({'rating': 'avg'}).where(df_task.director == 'Quentin Tarantino').show()

+-----------------+-----------------+
|         director|      avg(rating)|
+-----------------+-----------------+
|Quentin Tarantino|8.150000095367432|
+-----------------+-----------------+

