In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook (an already-running session is reused)
spark = (
    SparkSession.builder
    .appName('EDA_NetflixTitles')
    .getOrCreate()
)

In [None]:
# Load the CSV dataset into a Spark DataFrame: the first line supplies the
# column names and Spark infers each column's data type from the values
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('netflix_titles.csv')
)

In [None]:
# Column names together with their inferred data types
df.printSchema()

# Preview of the first ten rows of the dataset
df.show(n=10)

# Total number of rows in the dataset - Output(6236)
df.count()

In [None]:
# Statistical summary of every column - currently no useful information since data type is string
df.describe().show()

# Statistical summary restricted to the title column
# (column name must use plain ASCII quotes; typographic quotes are a SyntaxError)
df.describe(["title"]).show()

In [None]:
# Showing count of shows/movies by country
df.groupBy("country").count().show()

# Showing count of shows/movies by type (i.e. whether a TV show or movie)
# Grouped on the "type" column; "listed_in" holds the genres and is counted separately below
df.groupBy("type").count().show()

# Showing count of shows/movies by genre
df.groupBy("listed_in").count().show()

# Showing count of shows/movies by release year
df.groupBy("release_year").count().show()

In [None]:
# Checking each of the columns of the dataset for missing values.
# A per-row True/False grid only covers the first rows that show() prints, so
# instead count the nulls in every column across the whole DataFrame:
# when() without otherwise() yields null where the condition is false, and
# count() ignores nulls, leaving exactly the number of null cells per column.
from pyspark.sql.functions import col, count, when
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Removing the rows with missing values and save it to df_cleaned
# (dropna() with no arguments drops a row if ANY of its columns is null)
df_cleaned = df.dropna()

# Displaying the first five rows of the cleaned dataset
df_cleaned.show(5)

# Counting the number of rows in the filtered dataset (almost halved - 3772)
df_cleaned.count()

In [None]:
# Cast release year and show ID to integers, replacing each column in place
for column_name in ("release_year", "show_id"):
    df_cleaned = df_cleaned.withColumn(column_name, df_cleaned[column_name].cast("int"))

In [None]:
# Printing the schema of df_cleaned to check the column data types after the casts
df_cleaned.printSchema()

In [None]:
# Statistical summary of the release years of the movies/shows in the cleaned dataset
df_cleaned.describe(["release_year"]).show()

# Number of movies/shows per genre, then per rating, each listed from the
# most frequent value to the least frequent
for column_name in ("listed_in", "rating"):
    df_cleaned.groupBy(column_name).count().sort("count", ascending=False).show()

In [None]:
from pyspark.sql.functions import length

# Add a column holding the character length of each title
df_cleaned = df_cleaned.withColumn("title_length", length(df_cleaned["title"]))

# The 5 movie/show titles with the longest character lengths
(
    df_cleaned
    .sort("title_length", ascending=False)
    .select("title", "title_length")
    .show(5)
)

# The 5 movie/show titles with the shortest character lengths
(
    df_cleaned
    .sort("title_length")
    .select("title", "title_length")
    .show(5)
)