In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=9cc90ce5df563c523ec48d714f4df6ceaf25ed40c26596a3586b4ea4220d81dc
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, lit

# Obtain the Spark session used by every step below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Movie Data Transformations")
    .getOrCreate()
)

# 1. Load the Dataset
# Parse the movies CSV into a DataFrame: the first row supplies the column
# names, and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/movies_data.csv")
)

# 2. Filter Movies by Genre
# Keep only the rows whose genre is exactly "Sci-Fi".
is_sci_fi = col("genre") == "Sci-Fi"
sci_fi_movies = df.where(is_sci_fi)
print("Movies in Sci-fi: ")
sci_fi_movies.show()

# 3. Top-Rated Movies
# Order by rating, best first, then keep the first three rows.
top_rated_movies = df.orderBy("rating", ascending=False).limit(3)
print("Top 3 highest rated movies")
top_rated_movies.show()

# 4. Movies Released After 2010
# Select rows whose release date falls strictly after 31 December 2010.
released_after_2010 = col("date") > "2010-12-31"
movies_after_2010 = df.where(released_after_2010)
print("Movies releases after 2010: ")
movies_after_2010.show()

# 5. Calculate Average Box Office Collection by Genre
# One output row per genre, carrying the mean of box_office under the
# column name avg_box_office.
mean_box_office = avg("box_office").alias("avg_box_office")
avg_box_office_by_genre = df.groupBy("genre").agg(mean_box_office)
print("average box office collection for each genre: ")
avg_box_office_by_genre.show()

# 6. Add a New Column for Box Office in Billions
# Derive box_office_in_billions by dividing box_office by one billion; the
# source DataFrame `df` itself is left untouched.
df_with_billion = df.withColumn(
    "box_office_in_billions",
    col("box_office") / 1_000_000_000,
)
print("box office collection in billions: ")
df_with_billion.show()

# 7. Sort Movies by Box Office Collection
# Largest box office collection first.
by_box_office_desc = col("box_office").desc()
sorted_movies = df.sort(by_box_office_desc)
print("movies order based on collections: ")
sorted_movies.show()

# 8. Count the Number of Movies per Genre
# Group rows by genre and tally how many movies each group contains.
movies_grouped_by_genre = df.groupBy("genre")
count_movies_per_genre = movies_grouped_by_genre.count()
print("No.of movies in each genre: ")
count_movies_per_genre.show()



Movies in Sci-fi: 
+--------+------------+------+------+----------+----------+
|movie_id|       title| genre|rating|box_office|      date|
+--------+------------+------+------+----------+----------+
|       1|   Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+------------+------+------+----------+----------+

Top 3 highest rated movies
+--------+---------------+------+------+----------+----------+
|movie_id|          title| genre|rating|box_office|      date|
+--------+---------------+------+------+----------+----------+
|       2|The Dark Knight|Action|   9.0|1004000000|2008-07-18|
|       1|      Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|   Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+---------------+------+------+----------+----------+

Movies releases after 2010: 
+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_offic