# Netflix Dataset Analysis

In [47]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
%matplotlib inline 

OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)

CSV_IN = "netflix_titles.csv"  # make sure this file is in the same folder

## Data Transformation

In [48]:
def parse_duration(s):
    """Return (type, value) from strings like '90 min' or '1 Season'"""
    if pd.isna(s): return (None, None)
    s = str(s).strip()
    if s == "": return (None, None)
    s_low = s.lower()
    if "min" in s_low:
        m = re.search(r'(\d+)', s_low)
        return ("min", int(m.group(1)) if m else None)
    if "season" in s_low:
        m = re.search(r'(\d+)', s_low)
        return ("season", int(m.group(1)) if m else None)
    m = re.search(r'(\d+)', s_low)
    if m: return ("unknown", int(m.group(1)))
    return (None, None)

def first_item_from_comma_field(x):
    if pd.isna(x): return "Unknown"
    parts = [p.strip() for p in str(x).split(",") if p.strip()]
    return parts[0] if parts else "Unknown"

## Load Dataset

In [49]:
if not os.path.exists(CSV_IN):
    raise SystemExit(f"File not found: {CSV_IN} — put it in the root")

df = pd.read_csv(CSV_IN)
print("Original shape:", df.shape)

# Fix Column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print("Columns:", list(df.columns))
df.head(3)

Original shape: (8807, 12)
Columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description']


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


## Data Cleaning

In [50]:
# Drop duplicates
df.drop_duplicates(inplace=True)

# Parse date_added
if 'date_added' in df.columns:
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
    df['added_year'] = df['date_added'].dt.year
    df['added_month'] = df['date_added'].dt.month

# Ensure release_year is numeric
if 'release_year' in df.columns:
    df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').astype('Int64')

# Parse duration
if 'duration' in df.columns:
    parsed = df['duration'].apply(lambda x: pd.Series(parse_duration(x), index=['duration_type', 'duration_int']))
    df = pd.concat([df, parsed], axis=1)

# Genres list
if 'listed_in' in df.columns:
    df['genres_list'] = df['listed_in'].fillna("Unknown").apply(lambda s: [g.strip() for g in str(s).split(",") if g.strip()])

# Primary country & director
if 'country' in df.columns:
    df['primary_country'] = df['country'].apply(first_item_from_comma_field)
if 'director' in df.columns:
    df['primary_director'] = df['director'].apply(first_item_from_comma_field)
else:
    df['primary_director'] = "Unknown"

print("After cleaning shape:", df.shape)
df.info()

# Output Cleaned CSV
clean_path = os.path.join(OUTDIR, "netflix_cleaned.csv")
df.to_csv(clean_path, index=False)
print("Saved cleaned CSV to", clean_path)

After cleaning shape: (8807, 19)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   show_id           8807 non-null   object        
 1   type              8807 non-null   object        
 2   title             8807 non-null   object        
 3   director          6173 non-null   object        
 4   cast              7982 non-null   object        
 5   country           7976 non-null   object        
 6   date_added        8709 non-null   datetime64[ns]
 7   release_year      8807 non-null   Int64         
 8   rating            8803 non-null   object        
 9   duration          8804 non-null   object        
 10  listed_in         8807 non-null   object        
 11  description       8807 non-null   object        
 12  added_year        8709 non-null   float64       
 13  added_month       8709 non-null   float64    

## Data Analysis

### Movies vs Shows

In [51]:
results = {}

if 'type' in df.columns:
    results['type_counts'] = df['type'].value_counts()
    results['type_counts']
    print(results['type_counts'])

type
Movie      6131
TV Show    2676
Name: count, dtype: int64


### Top Country

In [52]:

if 'primary_country' in df.columns:
    results['top_countries'] = df['primary_country'].value_counts().head(15)
    results['top_countries']
    print(results['top_countries'])

primary_country
United States     3211
India             1008
Unknown            831
United Kingdom     628
Canada             271
Japan              259
France             213
South Korea        212
Spain              181
Mexico             134
Australia          117
Egypt              112
Turkey             111
Germany            103
China              100
Name: count, dtype: int64


### Top Directors

In [53]:
results['top_directors'] = df['primary_director'].value_counts().head(15)
results['top_directors']

primary_director
Unknown                2634
Rajiv Chilaka            22
Raúl Campos              18
Suhas Kadav              16
Marcus Raboy             16
Jay Karas                15
Cathy Garcia-Molina      13
Martin Scorsese          12
Youssef Chahine          12
Jay Chapman              12
Steven Spielberg         11
Don Michael Paul         10
Shannon Hartman           9
Yılmaz Erdoğan            9
David Dhawan              9
Name: count, dtype: int64

### Top Genres

In [54]:
if 'genres_list' in df.columns:
    genres_exploded = df.explode('genres_list')
    genres_exploded['genres_list'] = genres_exploded['genres_list'].fillna('Unknown')
    results['top_genres'] = genres_exploded['genres_list'].value_counts().head(20)
    results['top_genres']
    print(results['top_genres'])

genres_list
International Movies        2752
Dramas                      2427
Comedies                    1674
International TV Shows      1351
Documentaries                869
Action & Adventure           859
TV Dramas                    763
Independent Movies           756
Children & Family Movies     641
Romantic Movies              616
TV Comedies                  581
Thrillers                    577
Crime TV Shows               470
Kids' TV                     451
Docuseries                   395
Music & Musicals             375
Romantic TV Shows            370
Horror Movies                357
Stand-Up Comedy              343
Reality TV                   255
Name: count, dtype: int64


### New Releases Every Year

In [55]:
if 'added_year' in df.columns:
    df_valid = df['added_year'].dropna().astype(int)
    results['adds_by_year'] = df_valid.value_counts().sort_index()
    print(results['adds_by_year'])

added_year
2008       2
2009       2
2010       1
2011      13
2012       3
2013      10
2014      23
2015      73
2016     418
2017    1164
2018    1625
2019    1999
2020    1878
2021    1498
Name: count, dtype: int64


### Most VS Least popular release years

In [56]:
if 'release_year' in df.columns:
    df_valid = df['release_year'].dropna().astype(int)
    year_counts = df_valid.value_counts().sort_index()
    
    top_years = year_counts.head(5)
    bottom_years = year_counts.tail(5)
    
    print("Most Popular Release Years:")
    print(top_years.to_string(name=False))
    
    print("Least Popular Release Years:")
    print(bottom_years.to_string(name=False))


Most Popular Release Years:
release_year
1925    1
1942    2
1943    3
1944    3
1945    4
Least Popular Release Years:
release_year
2017    1032
2018    1147
2019    1030
2020     953
2021     592


In [57]:
if 'duration_type' in df.columns and 'duration_int' in df.columns:
    movies = df[(df['duration_type'] == 'min') & df['duration_int'].notna()]['duration_int'].astype(int)
    if not movies.empty:
        print("Movies:")
        print(f"The Shortest Movie is only {movies.min()} minutes")
        print(f"The longest movie is a WHOPPING {movies.max()} minutes")
    
    tv_shows = df[(df['duration_type'] != 'min') & df['duration_int'].notna()][['title', 'duration_type', 'duration_int']]
    if not tv_shows.empty:
        tv_shows['duration_int'] = tv_shows['duration_int'].astype(int)
        
        shortest = tv_shows.loc[tv_shows['duration_int'].idxmin()]
        longest = tv_shows.loc[tv_shows['duration_int'].idxmax()]
        
        print("\nTV Shows:")
        print(f"Show with the most seasons is {longest['title']} with {longest['duration_int']} {longest['duration_type']}s")
    else:
        print("No TV shows found.")


Movies:
The Shortest Movie is only 3 minutes
The longest movie is a WHOPPING 312 minutes

TV Shows:
Show with the most seasons is Grey's Anatomy with 17 seasons


## Visualizations

### Bar Graph Showing the Top Generes

In [58]:

if 'top_genres' in results:
    plt.figure(figsize=(10,6))
    sns.barplot(x=results['top_genres'].values, y=results['top_genres'].index)
    plt.title("Top genres (by number of titles)")
    plt.xlabel("Number of titles")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, "top_genres.png"), dpi=200)
    plt.close()

![Top Genres](outputs/top_genres.png)

### Line graph showing additions by year

In [59]:
if 'adds_by_year' in results:
    plt.figure(figsize=(10,5))
    results['adds_by_year'].sort_index().plot(kind='line', marker='o')
    plt.title("Titles added to Netflix (by year)")
    plt.xlabel("Year added")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, "adds_by_year.png"), dpi=200)
    plt.close()

![Additions by Year](outputs/adds_by_year.png)

### Bar Graph Showing the Top Countries

In [60]:

if 'top_countries' in results:
    plt.figure(figsize=(10,6))
    sns.barplot(x=results['top_countries'].values, y=results['top_countries'].index)
    plt.title("Top countries (primary)")
    plt.xlabel("Number of titles")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, "top_countries.png"), dpi=200)
    plt.close()

![Top Countries](outputs/top_countries.png)

### Graph showing the amount of movies by length

In [61]:
if 'movie_durations' in results and len(results['movie_durations'])>0:
    plt.figure(figsize=(8,5))
    sns.histplot(results['movie_durations'], bins=40)
    plt.title("Distribution of movie durations (minutes)")
    plt.xlabel("Minutes")
    plt.tight_layout()
    plt.savefig(os.path.join(OUTDIR, "movie_durations_hist.png"), dpi=200)
    plt.close()

![Movies By Length](outputs/movie_durations_hist.png)

## Report 

In [62]:
summary_lines = []

summary_lines.append("# Netflix Dataset Summary\n")

# Movies vs Shows
if 'type_counts' in results:
    summary_lines.append("## Titles by Type")
    for t, c in results['type_counts'].items():
        summary_lines.append(f"- {t}: {c}")
    summary_lines.append("")

# Top countries
if 'top_countries' in results:
    summary_lines.append("## Top Countries (Primary)")
    for c, n in results['top_countries'].head(10).items():
        summary_lines.append(f"- {c}: {n}")
    summary_lines.append("")

# Top Directors
if 'top_directors' in results:
    summary_lines.append("## Top Directors")
    for d, n in results['top_directors'].head(10).items():
        summary_lines.append(f"- {d}: {n}")
    summary_lines.append("")

# Top Genres
if 'top_genres' in results:
    summary_lines.append("## Top Genres")
    for g, n in results['top_genres'].head(10).items():
        summary_lines.append(f"- {g}: {n}")
    summary_lines.append("")

# Releases per year
if 'adds_by_year' in results:
    summary_lines.append("## Titles Added by Year (last 5 years)")
    for y, n in results['adds_by_year'].tail(5).items():
        summary_lines.append(f"- {y}: {n}")
    summary_lines.append("")

# Top release years
if 'release_year' in df.columns:
    df_valid = df['release_year'].dropna().astype(int)
    year_counts = df_valid.value_counts().sort_index()
    top_years = year_counts.head(5)
    bottom_years = year_counts.tail(5)
    summary_lines.append("## Most Popular Release Years")
    for y, n in top_years.items():
        summary_lines.append(f"- {y}: {n}")
    summary_lines.append("## Least Popular Release Years")
    for y, n in bottom_years.items():
        summary_lines.append(f"- {y}: {n}")
    summary_lines.append("")

# Dulation
if 'duration_type' in df.columns and 'duration_int' in df.columns:
    movies = df[(df['duration_type'] == 'min') & df['duration_int'].notna()]['duration_int'].astype(int)
    if not movies.empty:
        summary_lines.append("## Movie Durations")
        summary_lines.append(f"- Shortest movie: {movies.min()} minutes")
        summary_lines.append(f"- Longest movie: {movies.max()} minutes")
    
    tv_shows = df[(df['duration_type'] != 'min') & df['duration_int'].notna()][['title', 'duration_type', 'duration_int']]
    if not tv_shows.empty:
        tv_shows['duration_int'] = tv_shows['duration_int'].astype(int)
        shortest = tv_shows.loc[tv_shows['duration_int'].idxmin()]
        longest = tv_shows.loc[tv_shows['duration_int'].idxmax()]
        summary_lines.append("## TV Shows")
        summary_lines.append(f"- Show with fewest seasons: {shortest['title']} ({shortest['duration_int']} {shortest['duration_type']}s)")
        summary_lines.append(f"- Show with most seasons: {longest['title']} ({longest['duration_int']} {longest['duration_type']}s)")

# Write the summary of all my danalysis into a markdown file
summary_path = os.path.join(OUTDIR, "summary.md")
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines))

print(f"Summary saved to {summary_path}")


Summary saved to outputs\summary.md
