**Importing and Initializing**

In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, when, desc, max, asc, avg, round

In [None]:
# Build the Spark session for this notebook, or reuse the active one if it already exists.
spark = (
    SparkSession.builder
    .appName("Netflix Data Analysis")
    .getOrCreate()
)

**Displaying the Schema**

In [None]:
# getOrCreate() hands back the already-running session when there is one, so this
# line is safe to re-run on its own.
spark = SparkSession.builder.appName("Netflix Data Analysis").getOrCreate()

# Load the three source CSVs; the first row is the header and column types are inferred.
netflix_df = spark.read.csv("/content/netflix_titles.csv", header=True, inferSchema=True)
disney_plus_df = spark.read.csv("/content/disney_plus_shows.csv", header=True, inferSchema=True)
disney_movie_df = spark.read.csv("/content/disney_movie_data_final.csv", header=True, inferSchema=True)

# printSchema() writes the schema tree to stdout and returns None, so it is called
# only for its printed output (assigning its result just stores None).
netflix_df.printSchema()
disney_plus_df.printSchema()
disney_movie_df.printSchema()

# Keep the original variable names, but bind them to the actual schema objects so
# they can be inspected programmatically instead of all being None.
netflix_schema = netflix_df.schema
disney_plus_schema = disney_plus_df.schema
disney_movie_schema = disney_movie_df.schema


root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- imdb_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- type: string (nullable = true)
 |-- rated: string (nullable = true)
 |-- year: string (nullable = true)
 |-- released_at: string (nullable = true)
 |-- added_at: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- director: string (nullable = true)
 |-- writer: string (nullable = true)
 |-- actors: string (nullable = true)
 |-- language: str

(None, None, None)

**Reading the CSV File from the local system**

In [None]:
# Read the Netflix titles CSV: first row is the header, column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("netflix_titles.csv")
)

In [None]:
# Print each column's name, type and nullability for the Netflix DataFrame.
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



**Data Exploration**

**Total number of Columns**

In [None]:
# Number of columns in the Netflix DataFrame.
len(df.columns)

12

**Total number of Rows**

In [None]:
# Number of rows in the Netflix DataFrame.
df.count()

8809

**Checking count of missing values**

In [None]:
# Count the null values in every column.
# when(...) without otherwise() yields NULL for non-matching rows and count() skips
# NULLs, so each aggregate counts only the rows where that column is null.
# Each aggregate is aliased with its column name so the header reads "director"
# rather than "count(CASE WHEN (director IS NULL) THEN 1 END)".
df.select([count(when(col(c).isNull(), 1)).alias(c) for c in df.columns]).show()

+---------------------------------------------+------------------------------------------+-------------------------------------------+----------------------------------------------+------------------------------------------+---------------------------------------------+------------------------------------------------+--------------------------------------------------+--------------------------------------------+----------------------------------------------+-----------------------------------------------+-------------------------------------------------+
|count(CASE WHEN (show_id IS NULL) THEN 1 END)|count(CASE WHEN (type IS NULL) THEN 1 END)|count(CASE WHEN (title IS NULL) THEN 1 END)|count(CASE WHEN (director IS NULL) THEN 1 END)|count(CASE WHEN (cast IS NULL) THEN 1 END)|count(CASE WHEN (country IS NULL) THEN 1 END)|count(CASE WHEN (date_added IS NULL) THEN 1 END)|count(CASE WHEN (release_year IS NULL) THEN 1 END)|count(CASE WHEN (rating IS NULL) THEN 1 END)|count(CASE WHEN (duratio

In [None]:
# Show summary statistics (count, mean, ...) for the DataFrame's columns.
df.describe().show()

+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+-----------------+-----------------+-------------+--------------------+--------------------+
|summary|             show_id|         type|                            title|            director|                cast|         country|     date_added|     release_year|           rating|     duration|           listed_in|         description|
+-------+--------------------+-------------+---------------------------------+--------------------+--------------------+----------------+---------------+-----------------+-----------------+-------------+--------------------+--------------------+
|  count|                8809|         8808|                             8807|                6173|                7983|            7977|           8796|             8807|             8803|         8804|                8806|                8806|
|   mean|       

**Column Names**

In [None]:
# List the column names of the Netflix DataFrame.
df.columns

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

**Movie Rating Analysis**

**Selecting required columns and creating a new dataframe for analysis**

In [None]:
# Keep only the rating, title, cast and listed_in columns, then preview ten rows.
df1 = df.select("rating", "title", "cast", "listed_in")
df1.show(n=10)

+------+--------------------+--------------------+--------------------+
|rating|               title|                cast|           listed_in|
+------+--------------------+--------------------+--------------------+
| PG-13|Dick Johnson Is Dead|                NULL|       Documentaries|
| TV-MA|       Blood & Water|Ama Qamata, Khosi...|International TV ...|
| TV-MA|           Ganglands|Sami Bouajila, Tr...|Crime TV Shows, I...|
| TV-MA|Jailbirds New Orl...|                NULL|Docuseries, Reali...|
| TV-MA|        Kota Factory|Mayur More, Jiten...|International TV ...|
| TV-MA|       Midnight Mass|Kate Siegel, Zach...|TV Dramas, TV Hor...|
|    PG|My Little Pony: A...|Vanessa Hudgens, ...|Children & Family...|
| TV-MA|             Sankofa|Kofi Ghanaba, Oya...|Dramas, Independe...|
| TV-14|The Great British...|Mel Giedroyc, Sue...|British TV Shows,...|
| PG-13|        The Starling|Melissa McCarthy,...|    Comedies, Dramas|
+------+--------------------+--------------------+--------------

In [None]:
# Preview title and listed_in for ten rows without truncating long values.
df.select("title", "listed_in").show(n=10, truncate=False)

+--------------------------------+-------------------------------------------------------------+
|title                           |listed_in                                                    |
+--------------------------------+-------------------------------------------------------------+
|Dick Johnson Is Dead            |Documentaries                                                |
|Blood & Water                   |International TV Shows, TV Dramas, TV Mysteries              |
|Ganglands                       |Crime TV Shows, International TV Shows, TV Action & Adventure|
|Jailbirds New Orleans           |Docuseries, Reality TV                                       |
|Kota Factory                    |International TV Shows, Romantic TV Shows, TV Comedies       |
|Midnight Mass                   |TV Dramas, TV Horror, TV Mysteries                           |
|My Little Pony: A New Generation|Children & Family Movies                                     |
|Sankofa                      

**Displaying the first three columns**

In [None]:
# Preview ten rows of the first three columns (selected by position).
df.select(*df.columns[:3]).show(n=10)

+-------+-------+--------------------+
|show_id|   type|               title|
+-------+-------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|
|     s2|TV Show|       Blood & Water|
|     s3|TV Show|           Ganglands|
|     s4|TV Show|Jailbirds New Orl...|
|     s5|TV Show|        Kota Factory|
|     s6|TV Show|       Midnight Mass|
|     s7|  Movie|My Little Pony: A...|
|     s8|  Movie|             Sankofa|
|     s9|TV Show|The Great British...|
|    s10|  Movie|        The Starling|
+-------+-------+--------------------+
only showing top 10 rows



**Displaying data which has imdb rating 9 and 8.**

In [None]:
# NOTE(review): listed_in holds genre text (e.g. "Documentaries"), so comparing it
# with the number 9 is expected to match no rows. The Netflix columns do not include
# an IMDb rating - confirm which column/dataset this filter was meant to use.
df1.where(df1["listed_in"] == 9).show(n=10)

+------+-----+----+---------+
|rating|title|cast|listed_in|
+------+-----+----+---------+
+------+-----+----+---------+



In [None]:
# NOTE(review): listed_in holds genre text (e.g. "Documentaries"), so comparing it
# with the number 8 is expected to match no rows. The Netflix columns do not include
# an IMDb rating - confirm which column/dataset this filter was meant to use.
df1.where(col("listed_in") == 8).show(n=10)

+------+-----+----+---------+
|rating|title|cast|listed_in|
+------+-----+----+---------+
+------+-----+----+---------+



**Sorting**

In [None]:
# Show ten rows ordered by title, descending.
df1.orderBy(desc("title")).show(n=10)

+------+---------------------------------+--------------------+--------------------+
|rating|                            title|                cast|           listed_in|
+------+---------------------------------+--------------------+--------------------+
| TV-Y7|최강전사 미니특공대 : 영웅의 탄생|Um Sang-hyun, Yan...|Children & Family...|
| TV-MA|                    반드시 잡는다|       Baek Yoon-sik|Dramas, Internati...|
| TV-MA|                         마녀사냥|Si-kyung Sung, Se...|International TV ...|
| TV-14|                         海的儿子|Li Nanxing, Chris...|International TV ...|
| TV-Y7|                 忍者ハットリくん|                NULL|Anime Series, Kid...|
| TV-14|             ​​Kuch Bheege Alfaaz|Geetanjali Thapa,...|Dramas, Independe...|
| TV-14|             ​SAINT SEIYA: Kni...|Bryson Baugus, Em...|Anime Series, Int...|
| TV-14|                      ​Mayurakshi|Soumitra Chatterj...|Dramas, Internati...|
| TV-14|               ​Maj Rati ​​Keteki|Adil Hussain, Sha...|Dramas, Internati...|
| TV-14|        

**Top 10 Netflix titles with the most recent average release year**

In [None]:
# Average release_year per title, exposed as avg_year.
# (df columns: show_id, type, title, director, cast, country, date_added,
#  release_year, rating, duration, listed_in, description)
avg_rating = df.groupBy("title").agg(avg("release_year").alias("avg_year"))

# Ten titles with the largest average release year.
avg_rating.sort(col("avg_year").desc()).limit(10).show(truncate=False)

+-----------------------------------------------------+--------+
|title                                                |avg_year|
+-----------------------------------------------------+--------+
|The Parisian Agency: Exclusive Properties            |2021.0  |
|Tribhanga - Tedhi Medhi Crazy                        |2021.0  |
|The Least Expected Day: Inside the Movistar Team 2019|2021.0  |
|Fear Street Part 3: 1666                             |2021.0  |
|Ferry                                                |2021.0  |
|Biohackers                                           |2021.0  |
|The Wedding Coach                                    |2021.0  |
|Headspace Guide to Sleep                             |2021.0  |
|Rurouni Kenshin: The Beginning                       |2021.0  |
|Feels Like Ishq                                      |2021.0  |
+-----------------------------------------------------+--------+



**Netflix title with the most recent average release year**

In [None]:
# Title with the largest avg_year. Sort on avg_year first, then keep only the title.
# NOTE(review): if several titles share the top avg_year, which one is returned is
# presumably arbitrary - confirm whether a tie-break is wanted.
avg_rating.sort(col("avg_year").desc()).limit(1).select("title").show()

+----------+
|     title|
+----------+
|Biohackers|
+----------+



**Top 10 Netflix titles with the oldest average release year (not including null values)**

In [None]:
# Ten titles with the smallest avg_year, after dropping rows that contain nulls.
avg_rating.na.drop().sort(col("avg_year").asc()).limit(10).show(truncate=False)

+---------------------------------------------+--------+
|title                                        |avg_year|
+---------------------------------------------+--------+
|Pioneers: First Women Filmmakers*            |1925.0  |
|Prelude to War                               |1942.0  |
|The Battle of Midway                         |1942.0  |
|Why We Fight: The Battle of Russia           |1943.0  |
|WWII: Report from the Aleutians              |1943.0  |
|Undercover: How to Operate Behind Enemy Lines|1943.0  |
|Tunisian Victory                             |1944.0  |
|The Negro Soldier                            |1944.0  |
|Nazi Concentration Camps                     |1945.0  |
|Five Came Back: The Reference Films          |1945.0  |
+---------------------------------------------+--------+



**Netflix titles with the oldest average release year (first four, not including null values)**

In [None]:
# Titles of the four rows with the smallest avg_year, after dropping rows that contain nulls.
# Sort on avg_year first, then keep only the title column.
avg_rating.na.drop().sort(col("avg_year").asc()).limit(4).select("title").show(truncate=False)

+----------------------------------+
|title                             |
+----------------------------------+
|Pioneers: First Women Filmmakers* |
|Prelude to War                    |
|The Battle of Midway              |
|Why We Fight: The Battle of Russia|
+----------------------------------+



**Calculate average ratings for each genre of Disney movies**

In [None]:
# Per-genre average release year for the Netflix titles.
#
# Two fixes relative to the previous version of this cell:
#  - "rdate_added" is not a column of df (the column is date_added), so the select
#    raised an AnalysisException.
#  - date_added is a string column in the schema, so averaging it is not meaningful.
#    release_year is averaged instead, as in the other aggregation cells of this
#    notebook.
genre = df.select(
    col("show_id"),
    col("title"),
    col("rating"),
    col("date_added"),
    col("release_year"),
    col("listed_in"),
)

# Alias the aggregate directly instead of renaming Spark's generated column name.
genre_avg = genre.groupBy("listed_in").agg(
    round(avg("release_year"), 1).alias("avg_release_year")
)

# Drop groups whose genre or average is null before displaying.
genre_avg.dropna().show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `rdate_added` cannot be resolved. Did you mean one of the following? [`date_added`, `director`, `rating`, `show_id`, `title`].;
'Project [show_id#548, title#550, rating#556, 'rdate_added, listed_in#558]
+- Relation [show_id#548,type#549,title#550,director#551,cast#552,country#553,date_added#554,release_year#555,rating#556,duration#557,listed_in#558,description#559] csv


**Titles whose description includes the keyword "smart"**

In [None]:
# Show the first three columns of titles whose description contains "smart".
#
# - The LIKE pattern needs a wildcard on both sides: "%smart" only matches
#   descriptions that END with "smart".
# - Filter before projecting: description is not one of the first three columns, so
#   filtering after the select depends on Spark re-resolving a column the select
#   has already dropped.
df.filter(col("description").like("%smart%")).select(df.columns[:3]).show(10, truncate = False)

**Count the number of titles whose description includes the keyword "school"**

In [None]:
# Rows whose description contains the substring "school", then how many there are.
evil_plot = df.where(col("description").contains("school"))
evil_plot.count()

**Grouping**

In [None]:
# Number of rows for each value of the type column.
df.groupBy("type").agg(count("*").alias("count")).show(truncate=False)

**Find the average release year for each director, sorted by director name in ascending order.**

In [None]:
# Average release_year per director (rounded to one decimal), ordered by director
# name ascending. The output column was previously misspelled "avgerage year".
avg_rating_runtime = (
    df.groupBy("director")
    .agg(round(avg("release_year"), 1).alias("average year"))
    .orderBy(asc("director"))
)

# Hide rows with a null director or a null average when displaying.
avg_rating_runtime.dropna().show(truncate = False)

**Determine the number of movies released each year and sort them in descending order.**

In [None]:
# Number of movies per release year, newest year first.
#
# - The type column holds "Movie" / "TV Show" (capitalised) and == is case-sensitive,
#   so the previous comparison against "movie" matched no rows.
# - count("*") counts every movie row; count("rating") skipped rows whose rating
#   is null and therefore under-counted.
mv_released_each_yr = (
    df.filter(col("type") == "Movie")
    .groupBy("release_year")
    .agg(count("*").alias("No of Movies"))
    .orderBy(desc("release_year"))
)
mv_released_each_yr.show()

**Identify the top 3 directors with the most IMDb votes for their movies.**

In [None]:
# Selects four columns (casting listed_in to int), keeps rows whose type equals
# "movie", orders by rating descending and shows the first three rows.
# NOTE(review): this does not appear to compute "top directors by IMDb votes" - confirm intent:
#  - listed_in holds genre text, so cast("int") presumably yields null for every row.
#  - type values are displayed capitalised ("Movie", "TV Show") elsewhere in this
#    notebook, so == "movie" looks like it matches no rows.
#  - the sort key, rating, is not among the selected columns, and none of the df
#    columns listed below is a vote count.
#  - the result is named top_five_directors but limit(3) keeps three rows.
changed_datatype = df.select(col("listed_in").cast("int"), col("director"), col("type"), col("title"))
top_five_directors = changed_datatype.filter(col("type") == "movie")\
                     .orderBy(desc("rating")).limit(3)
top_five_directors.show(truncate = False)
# ['show_id',
#  'type',
#  'title',
#  'director',
#  'cast',
#  'country',
#  'date_added',
#  'release_year',
#  'rating',
#  'duration',
#  'listed_in',
#  'description']

**Find the average IMDb rating for movies that won awards and those that didn't.**

In [None]:
# Average IMDb rating for titles with and without awards.
#
# The Netflix frame (df) has no awards or imdb_rating column, so this runs on the
# Disney+ frame instead.
# NOTE(review): assumes disney_plus_df has awards and imdb_rating columns (they are
# not in the portion of its schema visible in this notebook) - confirm.
#
# A missing (null) awards value is treated as "no award"; previously a null fell
# through to otherwise() and was counted as "award".
movie_with_awards = disney_plus_df.withColumn(
    "has_award",
    when(col("awards").isNull() | (col("awards") == "N/A"), "no award").otherwise("award"),
)
avg_rating_by_award = movie_with_awards.groupBy("has_award").agg(
    round(avg("imdb_rating"), 1).alias("average imdb rating")
)
avg_rating_by_award.show()

**Determine the most common language for Disney movies.**

In [None]:
# Most common language among the Disney+ titles.
#
# language and imdb_id are columns of the Disney+ frame, not of the Netflix frame
# (df), so the aggregation runs on disney_plus_df.
most_common_lang = (
    disney_plus_df.groupBy("language")
    .agg(count("imdb_id").alias("no of movies"))
    .orderBy(desc("no of movies"))
    .limit(1)
)

# Refer to the aggregated frame's own column by name rather than through another
# DataFrame's attribute.
most_common_lang.select(col("language").alias("most common language")).show()

**Calculate the average metascore and IMDb rating for movies in each genre.**

In [None]:
# Average metascore and IMDb rating per genre for the Disney+ titles.
#
# The Netflix frame (df) has no genre, metascore or imdb_rating column, so this runs
# on disney_plus_df, whose schema includes genre.
# NOTE(review): assumes disney_plus_df also has metascore and imdb_rating columns
# (not in the portion of its schema visible in this notebook) - confirm.
avg_rating_by_genre = disney_plus_df.groupBy("genre").agg(
    round(avg("metascore"), 1).alias("average metascore"),
    round(avg("imdb_rating"), 1).alias("average imdb rating"),
)

# Hide genres where either average is null.
avg_rating_by_genre.dropna().show(truncate = False)