In [None]:

# Notebook-wide dependencies: tabular data handling, plotting, modelling and
# model persistence.
import pandas as pd
import numpy as np
# Plotly is used for interactive visualizations
# (the visible chart cells only call `px`; `go` is not referenced by them).
import plotly.express as px
import plotly.graph_objects as go
# Scikit-learn is used for the prediction model
# NOTE(review): the modelling cell further down re-imports what it needs and
# trains a RandomForestClassifier; KNeighborsClassifier is not referenced by
# any visible cell.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
# NOTE(review): `plt` is not referenced by any visible cell (charts use Plotly).
import matplotlib.pyplot as plt
# pickle is used to serialise the trained model bundle to disk.
import pickle

# Printed only after every import above has succeeded.
print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
import pandas as pd
import numpy as np

# ==============================================================================
# CELL 1: Imports and Data Loading
# ==============================================================================

# Read the raw catalogue; 'netflix_titles.csv' is expected in the working
# directory (e.g. uploaded to the Colab session).
df = pd.read_csv('netflix_titles.csv')

# Left-align every markdown table printed by this cell.
table_fmt = {'numalign': 'left', 'stralign': 'left'}

print("--- Initial Data Load and Inspection ---")
print(f"DataFrame Shape: {df.shape}")

# Preview of the untouched data.
raw_preview = df.head()
print("\nTop 5 rows of the raw data:")
print(raw_preview.to_markdown(index=False, **table_fmt))

# Per-column NaN counts, transposed so the columns read left to right in one row.
missing_per_column = df.isnull().sum()
missing_as_row = missing_per_column.to_frame(name='Missing Count').T
print("\nInitial Missing Values:")
print(missing_as_row.to_markdown(**table_fmt))

--- Initial Data Load and Inspection ---
DataFrame Shape: (8807, 12)

Top 5 rows of the raw data:
| show_id   | type    | title                 | director        | cast                                                                                                                                                                                                                                                                                                            | country       | date_added         | release_year   | rating   | duration   | listed_in                                                     | description                                                                                                                                              |
|:----------|:--------|:----------------------|:----------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:

# Text columns whose missing entries are replaced by the placeholder 'Unknown'.
anomaly_cols = ['director', 'cast', 'country', 'rating']

print("--- BEFORE: Inspecting NaNs and filling with 'Unknown' ---")
for col in anomaly_cols:
    # Index labels of the rows where this column is missing.
    missing_idx = df.index[df[col].isnull()]
    if len(missing_idx) > 0:
        print(f"\nCOLUMN: {col} (Original NaNs: {len(missing_idx)})")
        print(f"First 3 NaN rows for {col}:")
        sample_rows = df.loc[missing_idx[:3], [col]]
        print(sample_rows.to_markdown(numalign='left', stralign='left'))

    # The fill runs for every listed column, whether or not NaNs were reported.
    df[col] = df[col].fillna('Unknown')

print("\n--- AFTER: Verification ---")
# Spot-check one row after the fill: the title with show_id 's2'.
s2_row = df.index[df['show_id'] == 's2'][0]
print("Row that originally had NULL director (show_id s2) now shows:")
s2_check = df.loc[s2_row, ['director', 'rating']].to_frame().T
print(s2_check.to_markdown(numalign='left', stralign='left'))

--- BEFORE: Inspecting NaNs and filling with 'Unknown' ---

COLUMN: director (Original NaNs: 2634)
First 3 NaN rows for director:
|    | director   |
|:---|:-----------|
| 1  | nan        |
| 3  | nan        |
| 4  | nan        |

COLUMN: cast (Original NaNs: 825)
First 3 NaN rows for cast:
|    | cast   |
|:---|:-------|
| 0  | nan    |
| 3  | nan    |
| 10 | nan    |

COLUMN: country (Original NaNs: 831)
First 3 NaN rows for country:
|    | country   |
|:---|:----------|
| 2  | nan       |
| 3  | nan       |
| 5  | nan       |

COLUMN: rating (Original NaNs: 4)
First 3 NaN rows for rating:
|      | rating   |
|:-----|:---------|
| 5989 | nan      |
| 6827 | nan      |
| 7312 | nan      |

--- AFTER: Verification ---
Row that originally had NULL director (show_id s2) now shows:
|    | director   | rating   |
|:---|:-----------|:---------|
| 1  | Unknown    | TV-MA    |


In [None]:
# ==============================================================================
# CELL 3: Date Cleaning and Feature Engineering (date_added)
# ==============================================================================

print("--- BEFORE: Inspecting date_added ---")
# Anomaly 1: missing entries are imputed with the most frequent raw value.
most_common_date = df['date_added'].mode()[0]
df['date_added'] = df['date_added'].fillna(most_common_date)
print("Missing values filled with mode.")

# Anomaly 2: some date strings start with a space; show up to three of them
# before they are cleaned.
has_leading_space = df['date_added'].str.startswith(' ', na=False)
sample_date_anomalies = df.loc[has_leading_space, 'date_added'].head(3)
if len(sample_date_anomalies) > 0:
    print("\nSample of Date Anomaly (Leading Whitespace):")
    print(sample_date_anomalies.to_markdown(numalign='left', stralign='left'))

# Action: remove surrounding whitespace so the strings parse uniformly.
df['date_added'] = df['date_added'].str.strip()
print("Whitespace stripped from date_added.")

# Action: parse once, then derive the month-name and year features from the
# parsed values.
parsed_dates = pd.to_datetime(df['date_added'])
df['date_added'] = parsed_dates
df['month_added'] = parsed_dates.dt.strftime('%B')
df['year_added'] = parsed_dates.dt.year

print("\n--- AFTER: Verification ---")
print(f"date_added Dtype: {df['date_added'].dtype}")
print("First 5 rows of engineered date features:")
date_feature_preview = df[['date_added', 'month_added', 'year_added']].head()
print(date_feature_preview.to_markdown(index=False, numalign='left', stralign='left'))

--- BEFORE: Inspecting date_added ---
Missing values filled with mode.

Sample of Date Anomaly (Leading Whitespace):
|      | date_added        |
|:-----|:------------------|
| 6079 | August 4, 2017    |
| 6177 | December 23, 2018 |
| 6213 | December 15, 2018 |
Whitespace stripped from date_added.

--- AFTER: Verification ---
date_added Dtype: datetime64[ns]
First 5 rows of engineered date features:
| date_added          | month_added   | year_added   |
|:--------------------|:--------------|:-------------|
| 2021-09-25 00:00:00 | September     | 2021         |
| 2021-09-24 00:00:00 | September     | 2021         |
| 2021-09-24 00:00:00 | September     | 2021         |
| 2021-09-24 00:00:00 | September     | 2021         |
| 2021-09-24 00:00:00 | September     | 2021         |


In [None]:
# ==============================================================================
# CELL 4: Duration Cleaning and Feature Engineering (duration)
# ==============================================================================

print("--- BEFORE: Inspecting duration Anomaly ---")
# Anomaly 1: Missing values. 'duration' is not in the column list of the earlier
# 'Unknown'-fill cell, so any gaps are imputed here.
if df['duration'].isnull().any():
    # The column mixes "<n> min" and "<n> Season(s)" values, so a single global
    # mode can hand a title a value in the wrong unit. Impute with the most
    # frequent value among titles of the same 'type' instead.
    global_mode = df['duration'].mode()[0]
    mode_by_type = df.groupby('type')['duration'].agg(
        lambda values: values.mode().iloc[0] if values.notna().any() else global_mode
    )
    df['duration'] = (
        df['duration']
        .fillna(df['type'].map(mode_by_type))
        # Fallback for rows whose 'type' is itself missing or unmapped.
        .fillna(global_mode)
    )
    print("Missing values handled (filled with mode).")

# Anomaly 2: Mixed units ('min' vs 'Seasons')
print("Sample of Mixed Duration Units:")
print(df[['duration', 'type']].head().to_markdown(index=False, numalign='left', stralign='left'))

# Action: Separate 'duration' into numerical and categorical components.
# Split once and reuse: token 0 is the count, token 1 is the unit.
duration_parts = df['duration'].str.split(' ')
df['duration_int'] = duration_parts.str[0].astype(int)
df['duration_type'] = duration_parts.str[1]

print("\n--- AFTER: Verification ---")
print("First 5 rows showing clean split:")
print(df[['duration', 'duration_int', 'duration_type']].head().to_markdown(index=False, numalign='left', stralign='left'))

--- BEFORE: Inspecting duration Anomaly ---
Missing values handled (filled with mode).
Sample of Mixed Duration Units:
| duration   | type    |
|:-----------|:--------|
| 90 min     | Movie   |
| 2 Seasons  | TV Show |
| 1 Season   | TV Show |
| 1 Season   | TV Show |
| 2 Seasons  | TV Show |

--- AFTER: Verification ---
First 5 rows showing clean split:
| duration   | duration_int   | duration_type   |
|:-----------|:---------------|:----------------|
| 90 min     | 90             | min             |
| 2 Seasons  | 2              | Seasons         |
| 1 Season   | 1              | Season          |
| 1 Season   | 1              | Season          |
| 2 Seasons  | 2              | Seasons         |


In [None]:
# 1. Content Type Distribution (Bar Chart)
# Tally titles per content type into a two-column frame ('Type', 'Count').
type_counts = (
    df['type']
    .value_counts()
    .rename_axis('Type')
    .reset_index(name='Count')
)

# One bar per type, coloured by type and annotated with its count.
type_bar_options = dict(
    x='Type',
    y='Count',
    title='1. Distribution of Content Type on Netflix',
    color='Type',
    color_discrete_sequence=['#E50914', '#000000'],
    text='Count',
)
fig1 = px.bar(type_counts, **type_bar_options)
# Place the count labels above the bars.
fig1.update_traces(textposition='outside')
fig1.show()

In [None]:
# 2. Content Added Over Time (Line Chart)
# Number of titles per 'year_added', as a frame with columns
# ('year_added', 'Count').
yearly_additions = (
    df.groupby('year_added', as_index=False)
    .size()
    .rename(columns={'size': 'Count'})
)

trend_line_options = dict(
    x='year_added',
    y='Count',
    title='2. Trend of Titles Added to Netflix Over Time',
    markers=True,
    line_shape='spline',
    color_discrete_sequence=['#E50914'],
)
fig2 = px.line(yearly_additions, **trend_line_options)
fig2.update_layout(xaxis_title='Year Added', yaxis_title='Number of Titles Added')
fig2.show()

In [None]:
# 3. Top 10 Content Producing Countries (Bar Chart)
# A title can list several countries separated by ', '; give each listed
# country its own row (the original row label is repeated for each).
country_df = (
    df['country']
    .str.split(', ')
    .explode()
    .to_frame('country_single')
)

# Drop the 'Unknown' placeholder, then keep the ten most frequent countries.
is_known_country = country_df['country_single'] != 'Unknown'
country_counts = (
    country_df.loc[is_known_country, 'country_single']
    .value_counts()
    .head(10)
    .reset_index()
)
country_counts.columns = ['Country', 'Count']

country_bar_options = dict(
    x='Count',
    y='Country',
    orientation='h',
    title='3. Top 10 Content Producing Countries (Excluding "Unknown")',
    color_discrete_sequence=['#3065A6'],
    text='Count',
)
fig3 = px.bar(country_counts, **country_bar_options)
# Reverse the y axis so the first (largest) row is drawn at the top.
fig3.update_yaxes(autorange="reversed")
fig3.update_traces(textposition='outside')
fig3.show()

In [None]:
# 4. Movie Duration Distribution (Histogram for Movies only)
# Work on an independent copy of the Movie rows.
is_movie = df['type'] == 'Movie'
df_movies = df.loc[is_movie].copy()

runtime_hist_options = dict(
    x='duration_int',
    nbins=50,
    title='4. Distribution of Movie Runtimes (in minutes)',
    color_discrete_sequence=['#E50914'],
)
fig4 = px.histogram(df_movies, **runtime_hist_options)
fig4.update_layout(xaxis_title='Duration (Minutes)', yaxis_title='Number of Movies')
fig4.show()

In [None]:
# 5. TV Show Season Distribution (Bar Chart for TV Shows only)
df_tvshows = df[df['type'] == 'TV Show'].copy()
# Number of shows for each season count, ordered by season count.
tv_season_counts = df_tvshows['duration_int'].value_counts().sort_index().reset_index()
tv_season_counts.columns = ['Seasons', 'Count']


def _season_label(season_count):
    """Return the display bucket for a season count.

    Counts of 5 or more share the '5+ Seasons' bucket; smaller counts get their
    own bucket with correct singular/plural wording ('1 Season', '2 Seasons').
    """
    if season_count >= 5:
        return '5+ Seasons'
    if season_count == 1:
        return '1 Season'
    return f'{season_count} Seasons'


# Combine seasons 5+ into '5+ Seasons' for better visualization
tv_season_counts['Seasons_Grouped'] = tv_season_counts['Seasons'].apply(_season_label)
tv_season_counts_grouped = tv_season_counts.groupby('Seasons_Grouped')['Count'].sum().reset_index()
# An ordered categorical makes the bars sort by season bucket rather than
# alphabetically; the labels must match what _season_label produces.
ordered_seasons = ['1 Season', '2 Seasons', '3 Seasons', '4 Seasons', '5+ Seasons']
tv_season_counts_grouped['Seasons_Grouped'] = pd.Categorical(tv_season_counts_grouped['Seasons_Grouped'], categories=ordered_seasons, ordered=True)
tv_season_counts_grouped = tv_season_counts_grouped.sort_values('Seasons_Grouped')


fig5 = px.bar(
    tv_season_counts_grouped,
    x='Seasons_Grouped',
    y='Count',
    title='5. Distribution of TV Show Seasons',
    color='Seasons_Grouped',
    color_discrete_sequence=px.colors.sequential.Sunset,
    text='Count'
)
fig5.update_traces(textposition='outside')
fig5.update_layout(xaxis_title='Number of Seasons', yaxis_title='Number of TV Shows')
fig5.show()


In [None]:
# 6. Rating Distribution by Type (Stacked Bar Chart for top 5 ratings)
# Find the top 5 most common ratings
top_5_ratings = df['rating'].value_counts().index[:5]
# Keep only titles carrying one of those ratings.
df_rating_type = df[df['rating'].isin(top_5_ratings)]

# One row per (rating, type) pair with its number of titles.
rating_type_counts = df_rating_type.groupby(['rating', 'type']).size().reset_index(name='Count')

# Stored as `fig6` (matching the chart's "6." title) so the TV-season figure
# held in `fig5` is not overwritten.
fig6 = px.bar(
    rating_type_counts,
    x='rating',
    y='Count',
    color='type',
    title='6. Content Type Breakdown for Top 5 Ratings',
    labels={'rating': 'Rating', 'type': 'Content Type'},
    color_discrete_map={'Movie': '#E50914', 'TV Show': '#3065A6'}
)
fig6.show()

In [None]:
# 7. Content Rating Distribution (Pie Chart)
# Exclude the "Unknown" rating for a cleaner look at categorized content
rating_counts = df[df['rating'] != 'Unknown']['rating'].value_counts().head(10).reset_index()
rating_counts.columns = ['Rating', 'Count']

# Stored as `fig7` (matching the chart's "7." title) so the movie-runtime
# histogram held in `fig4` is not overwritten.
fig7 = px.pie(
    rating_counts,
    names='Rating',
    values='Count',
    title='7. Top 10 Most Frequent Content Ratings Distribution',
    color_discrete_sequence=px.colors.qualitative.Bold,
)
# Print each slice's share and rating label inside the slice.
fig7.update_traces(textposition='inside', textinfo='percent+label')
fig7.show()


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle

# --- Load Original Data and Clean ---
# We use the original file and apply necessary cleaning/feature engineering here.
# NOTE: this reassigns the notebook-wide `df` from the raw CSV, so columns added
# by earlier cells (month_added, year_added, duration_type) are not present
# after this cell runs.
df = pd.read_csv('netflix_titles.csv')

# Handling Missing Values
# Identity-like text columns get the same 'Unknown' placeholder as the earlier
# cleaning cell. Filling them with the column mode would attribute every
# unlabelled title to one real director/actor/country and inflate that
# director's `director_activity` count.
for col in ['director', 'cast', 'country']:
    df[col] = df[col].fillna('Unknown')

# 'duration' must stay parseable as "<n> <unit>", so it is imputed with a real
# value: the most frequent duration among titles of the same 'type', which keeps
# minutes and seasons from being mixed. Falls back to the global mode.
duration_mode = df['duration'].mode()[0]
duration_mode_by_type = df.groupby('type')['duration'].agg(
    lambda values: values.mode().iloc[0] if values.notna().any() else duration_mode
)
df['duration'] = (
    df['duration']
    .fillna(df['type'].map(duration_mode_by_type))
    .fillna(duration_mode)
)

# 'rating' feeds the prediction target; missing labels keep the original
# mode-based fill.
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

# Feature Engineering for Modeling
# Leading token of "<n> <unit>" as an integer.
df['duration_int'] = df['duration'].str.split(' ').str[0].astype(int)
# First name listed in the comma-separated director / cast strings.
df['director_single'] = df['director'].str.split(', ').str[0]
df['cast_main'] = df['cast'].str.split(', ').str[0]

# Custom Features (from your previous optimized model)
df['type_country'] = df['type'] + '_' + df['country']
# Number of titles sharing each row's first-listed director. Rows without a
# listed director all fall in the 'Unknown' bucket and receive its size.
director_counts = df['director_single'].map(df['director_single'].value_counts())
df['director_activity'] = director_counts

# --- 1. Target Simplification (Grouping 17+ ratings into 3 classes) ---

def simplify_rating(rating_str):
    """Collapse a detailed content rating into one of three coarse buckets.

    Args:
        rating_str: Rating label to classify (e.g. 'TV-MA', 'PG-13').

    Returns:
        'Adult_Mature' or 'Teen_Family' when the label is explicitly listed for
        that bucket; otherwise 'Kids_Young'. The last bucket is a catch-all, so
        any unlisted value (such as 'TV-Y' or 'TV-Y7') lands there.
    """
    adult_labels = ('TV-MA', 'R', 'NC-17', 'UR', 'NR', 'A')
    teen_family_labels = ('PG-13', 'PG', 'TV-PG', 'TV-14', 'TV-G', 'G', 'TV-Y7-FV', 'M')

    # Guard clauses: the adult bucket is checked first, then the teen/family one.
    if rating_str in adult_labels:
        return 'Adult_Mature'
    if rating_str in teen_family_labels:
        return 'Teen_Family'

    # Everything not matched above.
    return 'Kids_Young'

# Derive the coarse three-bucket target from the detailed rating labels.
df['rating_simplified'] = df['rating'].apply(simplify_rating)

# 2. Integer-encode the simplified target and show the label -> code mapping.
le_simple = LabelEncoder()
df['rating_encoded_simple'] = le_simple.fit_transform(df['rating_simplified'])
class_codes = le_simple.transform(le_simple.classes_)
print("Simplified Rating Map:", dict(zip(le_simple.classes_, class_codes)))


# --- 3. Preprocessing for Modeling ---
target_col = 'rating_encoded_simple'
features_to_use = [
    'type', 'country', 'release_year', 'duration_int',
    'cast_main', 'director_activity', 'type_country', 'director_single',
]

# Independent copy holding only the model inputs and the encoded target.
df_model = df[[*features_to_use, target_col]].copy()

# X: dummy-encoded features (first level of each encoded column dropped);
# y: integer class codes.
X = pd.get_dummies(df_model.drop(columns=target_col), drop_first=True)
y = df_model[target_col]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4. Model Training (Deeper Random Forest Classifier) ---
forest_params = dict(n_estimators=300, max_depth=30, random_state=42, n_jobs=-1)
model = RandomForestClassifier(**forest_params)
print("\nTraining Random Forest model on SIMPLIFIED target (3 classes)...")
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# --- 5. Evaluation and Saving ---
# Report only the class codes that occur in the true or predicted labels, so
# the label list and the name list passed to the report always line up.
unique_labels_present = np.union1d(y_test, predictions)
present_codes = unique_labels_present.astype(int)

# Human-readable names for those codes.
filtered_target_names = le_simple.inverse_transform(present_codes)

accuracy = accuracy_score(y_test, predictions)
report = classification_report(
    y_test,
    predictions,
    labels=present_codes,
    target_names=filtered_target_names,
    zero_division=0,
)

print("Model training complete!")
print("\n--- Model Evaluation (Simplified Target Accuracy) ---")
print(f"Final Prediction Accuracy: {accuracy:.4f}")
print("\nClassification Report (Precision, Recall, F1 for each Simplified Rating):")
print(report)
print("-" * 50)

# Persist the model together with the training column order and the label
# encoder (the simplified one) in a single pickle.
model_features = X.columns.tolist()
saved_bundle = {'model': model, 'features': model_features, 'encoder': le_simple}
with open('netflix_rating_classifier_simple.pkl', 'wb') as file:
    pickle.dump(saved_bundle, file)

print("SUCCESS: Optimized Model (Simplified Target) saved for deployment.")

Simplified Rating Map: {'Adult_Mature': np.int64(0), 'Kids_Young': np.int64(1), 'Teen_Family': np.int64(2)}

Training Random Forest model on SIMPLIFIED target (3 classes)...
Model training complete!

--- Model Evaluation (Simplified Target Accuracy) ---
Final Prediction Accuracy: 0.6288

Classification Report (Precision, Recall, F1 for each Simplified Rating):
              precision    recall  f1-score   support

Adult_Mature       0.61      0.75      0.67       841
  Kids_Young       1.00      0.06      0.11       118
 Teen_Family       0.65      0.59      0.62       803

    accuracy                           0.63      1762
   macro avg       0.75      0.46      0.47      1762
weighted avg       0.66      0.63      0.61      1762

--------------------------------------------------
SUCCESS: Optimized Model (Simplified Target) saved for deployment.


In [None]:
# Write the model, its feature-column list and the label encoder to the pickle
# file again (same path and same contents as the save at the end of the
# training cell).
resaved_bundle = {'model': model, 'features': model_features, 'encoder': le_simple}
with open('netflix_rating_classifier_simple.pkl', 'wb') as bundle_file:
    pickle.dump(resaved_bundle, bundle_file)


In [None]:
# Hand the saved pickle to Colab's `files.download` helper so it can be fetched
# from the notebook session.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm, and skip or guard this cell when running elsewhere.
from google.colab import files
files.download('netflix_rating_classifier_simple.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>