In [8]:
# Install java
!apt install openjdk-8-jdk-headless -qq > /dev/null





In [9]:
# Install Pyspark
!pip install pyspark



In [10]:
# starting sparksession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NetflixDataCleaning").getOrCreate()

In [11]:
#Loading the dataset
file_path = "/content/netflix_titles.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

In [12]:
print("Original Data:")
print("Total Rows:", df.count())
df.show(5)

Original Data:
Total Rows: 8809
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                NULL|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           NULL|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Boua

In [17]:
from pyspark.sql.functions import col, sum
# Count null values for each column
null_counts = df.select([sum(col(c).isNull().cast("integer")).alias(c) for c in df.columns])
# Show the null counts
null_counts.show()

+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|show_id|type|title|director|cast|country|date_added|release_year|rating|duration|listed_in|description|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+
|      0|   1|    2|    2636| 826|    832|        13|           2|     6|       5|        3|          3|
+-------+----+-----+--------+----+-------+----------+------------+------+--------+---------+-----------+



In [18]:
# Drop rows with any null values
df_no_nulls = df.na.drop()
print("After removing nulls:")
print("Total Rows:", df_no_nulls.count())

After removing nulls:
Total Rows: 5332


In [20]:
#remove duplicates
df_clean = df_no_nulls.dropDuplicates()
print("After removing duplicates:")
print("Total Rows:", df_clean.count())

After removing duplicates:
Total Rows: 5332


In [21]:
df_clean.show(10)


+-------+-----+--------------------+--------------------+--------------------+--------------------+-----------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|             country|       date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+--------------------+-----------------+------------+------+--------+--------------------+--------------------+
|   s461|Movie|           Surf's Up|Ash Brannon, Chri...|Shia LaBeouf, Jef...|United States, Ca...|    July 15, 2021|        2007|    PG|  86 min|Children & Family...|This Oscar-nomina...|
|   s695|Movie|               Aziza|      Soudade Kaadan|Caress Bashar, Ab...|      Lebanon, Syria|    June 17, 2021|        2019| TV-PG|  13 min|Comedies, Dramas,...|This short film f...|
|   s883|Movie|Jungle Beat: The ...|         Brent Dawe

In [22]:
#save the cleaned dataset
df_clean.write.csv("/content/netflix_cleaned.csv", header=True, mode="overwrite")
print("Cleaned dataset saved to /content/netflix_cleaned.csv")


Cleaned dataset saved to /content/netflix_cleaned.csv
