In [0]:
from pyspark.sql.functions import split, year

In [0]:
#1
df = spark.read.csv('dbfs:/mnt/20240729de/imdb_top_1000.csv', header=True, inferSchema=True)

df.printSchema()
df.show(10)

root
 |-- Poster_Link: string (nullable = true)
 |-- Series_Title: string (nullable = true)
 |-- Released_Year: string (nullable = true)
 |-- Certificate: string (nullable = true)
 |-- Runtime: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- IMDB_Rating: double (nullable = true)
 |-- Overview: string (nullable = true)
 |-- Meta_score: string (nullable = true)
 |-- Director: string (nullable = true)
 |-- Star1: string (nullable = true)
 |-- Star2: string (nullable = true)
 |-- Star3: string (nullable = true)
 |-- Star4: string (nullable = true)
 |-- No_of_Votes: string (nullable = true)
 |-- Gross: string (nullable = true)

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+-----------------+--------------------+-----------------+----------------+-----------+-----------+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|           

In [0]:
#2
#filtering dataframe
filterDf = df.filter(df.IMDB_Rating > 8.0)
#show filtered data
filterDf.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|             Star1|               Star2|             Star3|               Star4|No_of_Votes|      Gross|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+
|https://m.media-a...|The Shawshank Red...|         1994|          A|142 min|               Drama|        9.3|Two imprisoned me...|        80|      Frank Darabont|       Tim Robbins|      Morgan F

In [0]:
#3
#selecting cols
selectDf = df.select('Series_Title', 'IMDB_Rating')
#show selected
selectDf.show()

+--------------------+-----------+
|        Series_Title|IMDB_Rating|
+--------------------+-----------+
|The Shawshank Red...|        9.3|
|       The Godfather|        9.2|
|     The Dark Knight|        9.0|
|The Godfather: Pa...|        9.0|
|        12 Angry Men|        9.0|
|The Lord of the R...|        8.9|
|        Pulp Fiction|        8.9|
|    Schindler's List|        8.9|
|           Inception|        8.8|
|          Fight Club|        8.8|
|The Lord of the R...|        8.8|
|        Forrest Gump|        8.8|
|Il buono, il brut...|        8.8|
|The Lord of the R...|        8.7|
|          The Matrix|        8.7|
|          Goodfellas|        8.7|
|Star Wars: Episod...|        8.7|
|One Flew Over the...|        8.7|
|            Hamilton|        8.6|
|        Gisaengchung|        8.6|
+--------------------+-----------+
only showing top 20 rows



In [0]:
#4
#create discounted column
df = df.withColumn('discounted_rating', df.IMDB_Rating * 0.9)
#show updated df
df.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|             Star1|               Star2|             Star3|               Star4|No_of_Votes|      Gross|discounted_rating|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+
|https://m.media-a...|The Shawshank Red...|         1994|          A|142 min|               Drama|        9.3|Two imprisoned me...|        80|

In [0]:
#5
#group by genre, average rating
groupDf = df.groupBy('Genre').avg('IMDB_Rating')
#show groupDf
groupDf.show()

+--------------------+------------------+
|               Genre|  avg(IMDB_Rating)|
+--------------------+------------------+
|    Action, Thriller| 7.866666666666667|
|Animation, Comedy...|               7.7|
|     Crime, Thriller| 7.966666666666666|
|Action, Adventure...| 8.199999999999998|
|   Adventure, Sci-Fi|              8.15|
|Drama, Fantasy, H...|               8.2|
|Animation, Advent...|               8.1|
|Animation, Biogra...|               7.8|
|Animation, Advent...| 7.957142857142857|
|Action, Adventure...|               7.9|
|Comedy, Crime, My...|               8.0|
|Adventure, Biogra...|               7.8|
|       Comedy, Drama| 7.871428571428573|
|Biography, Drama,...| 7.822222222222221|
|Comedy, Drama, Ro...|7.8774193548387075|
|       Action, Drama|              7.94|
|Drama, Romance, T...|              7.85|
|Drama, Mystery, S...| 8.040000000000001|
|Comedy, Family, R...|               7.8|
|Adventure, Drama,...|7.8999999999999995|
+--------------------+------------

In [0]:
#6
#split genre into 2 columns
df = df.withColumn('primary_genre', split(df['Genre'], ', ')[0])
df = df.withColumn('secondary_genre', split(df['Genre'], ', ')[1])

#show df
df.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|             Star1|               Star2|             Star3|               Star4|No_of_Votes|      Gross|discounted_rating|primary_genre|secondary_genre|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+
|https://m.media-a...|The Shawshank Red...|         

In [0]:
#7
#create a new column with year
df = df.withColumn('year', year(df['Released_Year']))

#show df
df.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+----+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|             Star1|               Star2|             Star3|               Star4|No_of_Votes|      Gross|discounted_rating|primary_genre|secondary_genre|year|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+----+
|https://m.media-a...|The Shawshank R

In [0]:
#8
#sort imdb rating descending
sortedDf = df.orderBy(df.IMDB_Rating.desc())

#show sortedDf
sortedDf.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+----+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|             Star1|               Star2|             Star3|               Star4|No_of_Votes|      Gross|discounted_rating|primary_genre|secondary_genre|year|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+------------------+--------------------+------------------+--------------------+-----------+-----------+-----------------+-------------+---------------+----+
|https://m.media-a...|The Shawshank R

In [0]:
#9
#drop duplicates
dropDuplicateDf = df.dropDuplicates(['Series_Title'])

#show dropDuplicateDf
dropDuplicateDf.show()

+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+-----------------+------------------+------------------+--------------------+-----------+-----------+------------------+-------------+---------------+----+
|         Poster_Link|        Series_Title|Released_Year|Certificate|Runtime|               Genre|IMDB_Rating|            Overview|Meta_score|            Director|            Star1|             Star2|             Star3|               Star4|No_of_Votes|      Gross| discounted_rating|primary_genre|secondary_genre|year|
+--------------------+--------------------+-------------+-----------+-------+--------------------+-----------+--------------------+----------+--------------------+-----------------+------------------+------------------+--------------------+-----------+-----------+------------------+-------------+---------------+----+
|https://m.media-a...|     My Name Is Khan|