In [49]:
import pandas as pd

**Task 2: Removing Duplicates**

**Dataset:** Netflix Movies and TV Shows

In [50]:
# Loading the dataset
netflix_df = pd.read_csv("/content/netflix_titles.csv")

In [51]:
# Inspect the dataset
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [52]:
# netflix dataframe shape
netflix_df.shape

(8807, 12)

In [53]:
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [54]:
netflix_df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [55]:
# Get the initial no of rows
initial_records_count = len(netflix_df)

print(f"Initial number of records is: {initial_records_count}")

Initial number of records is: 8807


In [56]:
# Check for duplicates in rows
duplicate_values_rows=netflix_df.duplicated().sum()


In [57]:
print(f'The data contains {duplicate_values_rows} duplicate values')

The data contains 0 duplicate values


In [58]:
print(f"Number of records with missing 'director' data: {netflix_df['director'].isnull().sum()}")

Number of records with missing 'director' data: 2634


In [59]:
# Check duplicates based on a subset of columns (e.g., Director only)
duplicate_values_column=netflix_df.duplicated(subset=['director'])
print(duplicate_values_column.sum())

4278


In [60]:
# Drop duplicates based on the 'director' column
netflix_df_after_cleaning = netflix_df.drop_duplicates(subset=['director'], keep='first', inplace=False)

In [61]:
# 4. Get the final number of records
final_records_count = len(netflix_df_after_cleaning)
print(f"Number of records after dropping duplicates based on director: {final_records_count}")


Number of records after dropping duplicates based on director: 4529


In [62]:
# 5. Calculate the number of records lost
records_lost = initial_records_count - final_records_count
print(f"Number of initial_records_count: {initial_records_count}")
print(f"Number of final_records_count after cleaning: {final_records_count}")

print(f"Number of records lost: {records_lost}")

Number of initial_records_count: 8807
Number of final_records_count after cleaning: 4529
Number of records lost: 4278


In [63]:
# Save in the CSV and Print the cleaned data
netflix_df_after_cleaning.to_csv("netflix_titles_cleaned.csv", index = False)

df = pd.read_csv("netflix_titles_cleaned.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
4,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4529 entries, 0 to 4528
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       4529 non-null   object
 1   type          4529 non-null   object
 2   title         4529 non-null   object
 3   director      4528 non-null   object
 4   cast          4130 non-null   object
 5   country       4228 non-null   object
 6   date_added    4529 non-null   object
 7   release_year  4529 non-null   int64 
 8   rating        4528 non-null   object
 9   duration      4528 non-null   object
 10  listed_in     4529 non-null   object
 11  description   4529 non-null   object
dtypes: int64(1), object(11)
memory usage: 424.7+ KB
