# Netflix search engine

In [11]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the raw Netflix catalogue into a DataFrame.
netflix_data = pd.read_csv(
    r"D:\PYTON PROGRAMMING\PYTHON FILES\Scikit-Learn\PROJECT\DATA\netflix_titles.csv"
)

# ---- Data preprocessing ----

# Encode the 'type' column numerically: TV Show -> 0, Movie -> 1.
netflix_data['type_num'] = netflix_data['type'].map({'TV Show': 0, 'Movie': 1})

# Min-max scale 'release_year'. The same scaler object is re-fitted later
# for the duration column, so it only ever reflects its most recent fit.
scaler = MinMaxScaler()
scaler.fit(netflix_data[['release_year']])
netflix_data['release_year_scaled'] = scaler.transform(netflix_data[['release_year']])

def extract_duration(val):
    """Return the leading integer of a duration string such as ``"90 min"``.

    Args:
        val: The raw duration value. Strings are split on a single space and
            the first token is parsed as an integer. Non-string values
            (for example ``None`` or a float NaN) are treated as missing.

    Returns:
        The integer before the first space, or ``None`` when ``val`` is not a
        string or its first token is not an integer.
    """
    try:
        return int(val.split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError / TypeError: val is not a str (no usable .split).
        # ValueError: the first token is not an integer literal.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None
# Pull the leading number out of every 'duration' entry; entries that cannot
# be parsed come back as None from extract_duration.
netflix_data['duration_num'] = netflix_data['duration'].apply(extract_duration)

# Re-fit the shared scaler on the numeric durations and store the scaled column.
scaler.fit(netflix_data[['duration_num']])
netflix_data['duration_num_scaled'] = scaler.transform(netflix_data[['duration_num']])

# Convert genres to binary indicator columns.
# 'listed_in' holds the genres as one ', '-separated string per title.
netflix_data['genre_list'] = pd.Series(
    [listing.split(', ') for listing in netflix_data['listed_in']],
    index=netflix_data.index,
)

# One 0/1 column per distinct genre, aligned to the catalogue's index.
mlb = MultiLabelBinarizer()
genre_netflix_data = pd.DataFrame(
    mlb.fit_transform(netflix_data['genre_list']),
    index=netflix_data.index,
    columns=mlb.classes_,
)

# Append the genre indicator columns to the main frame.
netflix_data = pd.concat([netflix_data, genre_netflix_data], axis=1)

# Cast and country: replace missing values with empty strings so the text
# vectorizers below receive strings only.
netflix_data[['cast', 'country']] = netflix_data[['cast', 'country']].fillna('')

# NOTE(review): both vectorizers use the default tokenisation; multi-word
# names such as "United States" are presumably split into separate word
# tokens rather than kept as one term per comma-separated entry - confirm
# that this is intended.

# TF-IDF over the cast text, limited to 50 terms.
cast_vectorizer = TfidfVectorizer(max_features=50)
cast_matrix = cast_vectorizer.fit_transform(netflix_data['cast'])

# TF-IDF over the country text, limited to 20 terms (max_features is tunable).
country_vectorizer = TfidfVectorizer(max_features=20)
country_matrix = country_vectorizer.fit_transform(netflix_data['country'])

# Assemble the final feature matrix column-wise:
#   numeric columns (missing values -> 0) | genre indicators | cast TF-IDF | country TF-IDF
features = np.concatenate(
    (
        netflix_data[['type_num', 'release_year_scaled', 'duration_num_scaled']]
        .fillna(0)
        .to_numpy(),
        genre_netflix_data.to_numpy(),
        cast_matrix.toarray(),
        country_matrix.toarray(),
    ),
    axis=1,
)

# Model training: index every title's feature vector for cosine-distance
# nearest-neighbour lookups (6 neighbours unless a query asks otherwise).
model_knn = NearestNeighbors(metric='cosine', n_neighbors=6).fit(features)

# Testing the model: fetch the nearest neighbours of one title and re-rank
# them with a simple hand-written score (genre, type, year, country).
title = "Kota Factory"
filtered_df = netflix_data[netflix_data['title'] == title]

if not filtered_df.empty:
    # Index label of the first matching row in the unfiltered frame.
    target_index = filtered_df.index[0]
    # kneighbors() works with row *positions* in `features`, so resolve the
    # position explicitly instead of assuming index labels equal positions.
    target_pos = int(np.flatnonzero((netflix_data['title'] == title).to_numpy())[0])

    # Over-fetch candidates for re-ranking, but never ask for more neighbours
    # than there are rows (kneighbors raises in that case).
    distances, indices = model_knn.kneighbors(
        [features[target_pos]], n_neighbors=min(30, len(features))
    )

    target_row = netflix_data.iloc[target_pos]
    target_genres = set(target_row['genre_list'])
    target_year = target_row['release_year']
    target_type = target_row['type']
    target_country = target_row['country']

    scored_neighbors = []

    for i in indices[0]:
        # Drop the query itself by position. It is usually the first hit, but
        # rows with identical features tie at distance 0 and may precede it.
        if i == target_pos:
            continue

        row = netflix_data.iloc[i]
        score = 0

        # Genre match: at least one genre in common.
        if target_genres.intersection(row['genre_list']):
            score += 5

        # Type match (Movie vs TV Show).
        if row['type'] == target_type:
            score += 3

        # Year proximity: released within two years of the target.
        if abs(row['release_year'] - target_year) <= 2:
            score += 1

        # Country match. Missing countries were filled with '', so an empty
        # value must not count as a match.
        if target_country and row['country'] == target_country:
            score += 1

        scored_neighbors.append((score, int(i)))

    # Highest score first; ties are broken by the higher row position.
    scored_neighbors.sort(reverse=True)
    print(f"Smarter Recommendations for '{title}':")
    for score, idx in scored_neighbors[:10]:
        print(f"{netflix_data['title'].iloc[idx]} (Score: {score})")
else:
    print(f"❌ Title '{title}' not found in the dataset.")

Smarter Recommendations for 'Kota Factory':
Little Things (Score: 10)
Taj Mahal 1989 (Score: 10)
Mismatched (Score: 10)
Bhaag Beanie Bhaag (Score: 10)
Yeh Meri Family (Score: 9)
Rishta.com (Score: 9)
Bh Se Bhade (Score: 9)
Engineering Girls (Score: 9)
College Romance (Score: 9)
My Dear Warrior (Score: 9)
