In [1]:
# Netflix Titles Data Analysis & Cleaning Script

# Import required library
import pandas as pd

# Step 1: Read the raw Netflix catalogue CSV into a DataFrame
df = pd.read_csv("D:/Data Analytics Projects/Netflix/Dataset/Netflix_titles.csv")

# Step 2: Column labels followed by the (rows, columns) shape.
# Each heading and its value go through a single print call; sep="\n"
# puts the value on the line right after the heading.
print("Columns in dataset:", df.columns, sep="\n")
print(
    "\n Shape of dataset (rows, columns):",
    df.shape,
)

# Step 3: Show the first five records
print("\n First 5 rows of dataset:", df.head(5), sep="\n")

# Step 4: Number of null entries per column
print("\n Missing values in each column:", df.isnull().sum(), sep="\n")

# Step 5: Number of distinct values per column
print("\n Unique value counts in each column:", df.nunique(), sep="\n")

# Step 6: Descriptive statistics (describe() defaults to numeric columns)
print("\n Statistical summary of numerical columns:", df.describe(), sep="\n")

# Step 7: How many titles fall under each value of the 'type' column
print("\n Content type distribution:", df["type"].value_counts(), sep="\n")

# Step 8: The ten most frequent values of the 'country' column
print(
    "\n Top 10 countries by content count:",
    df["country"].value_counts().head(10),
    sep="\n",
)

# Step 9: The ten most frequent values of the 'listed_in' column
print(
    "\n Top 10 genres/listings:",
    df["listed_in"].value_counts().head(10),
    sep="\n",
)

# Step 10: Count rows whose title repeats an earlier row's title
print(
    "\n Duplicate titles count:",
    df["title"].duplicated().sum(),
)

# Step 11: The ten most frequent raw duration strings
print(
    "\n Top 10 most common durations:",
    df["duration"].value_counts().head(10),
    sep="\n",
)

# Step 12: Descriptive statistics for the release year
print(
    "\n Release year distribution summary:",
    df["release_year"].describe(),
    sep="\n",
)

# Step 13: Extract numeric duration values
# (e.g., from '90 min' -> 90.0, '3 Seasons' -> 3.0; rows with no digits or a
# missing duration become NaN, which is why the column is float).
# The pattern is a raw string: '\d' inside a plain literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
# expand=False makes str.extract return a Series for the single capture
# group instead of a one-column DataFrame.
df['duration_cleaned'] = (
    df['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)


# Step 14: Identify duration units (minutes or seasons)
def _duration_unit(value):
    """Return 'minutes' or 'seasons' for a duration string, else None.

    Non-string values (such as NaN for a missing duration) and strings that
    mention neither unit yield None. Matching is case-insensitive, and 'min'
    is checked before 'season'.
    """
    if not isinstance(value, str):
        return None
    text = value.lower()
    if 'min' in text:
        return 'minutes'
    if 'season' in text:
        return 'seasons'
    return None


df['duration_unit'] = df['duration'].apply(_duration_unit)

# Step 15: Show the original duration next to the two derived columns
print(
    "\n Preview of cleaned duration columns:",
    df[["type", "duration", "duration_cleaned", "duration_unit"]].head(5),
    sep="\n",
)

# Step 16: Write the augmented DataFrame to a new CSV file.
# index=False keeps the DataFrame's row index out of the file.
output_path = "D:/Data Analytics Projects/Netflix/Dataset/updated_netflix_titles.csv"
df.to_csv(
    output_path,
    index=False,
)
print(f"\n Cleaned dataset saved successfully to:\n{output_path}")


Columns in dataset:
Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

 Shape of dataset (rows, columns): (8807, 12)

 First 5 rows of dataset:
  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4 