# Step-by-Step Plan

**PHASE 1: Content-Based Filtering**
- **Step 1:** Load and Prepare Data
- **Step 2:** Feature Selection for Recommendation
- **Step 3:** Text Vectorization (TF-IDF)
- **Step 4:** Compute Similarity Scores
- **Step 5:** Build a Recommendation Function
- **Step 6:** Test with Sample Inputs

**PHASE 2: Simulate Collaborative Filtering (To build Hybrid)**
- **Step 7:** Simulate Ratings Data 
- **Step 8:** Collaborative Filtering using Surprise
- **Step 9:** Final Hybrid Recommender

# PHASE1: Content-Based Filtering

In [None]:
# conda update --all
# conda install -c conda-forge scikit-surprise

In [4]:
# pip install scikit-surprise

In [None]:
# Step 1: Load and Prepare Data

In [1]:
import pandas as pd
import random

from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split

In [2]:
# Load the cleaned dataset (replace with your actual path)
df = pd.read_csv("clean_netflix_for_powerbi.csv")

In [3]:
# Preview the data
df = df[['title', 'description', 'listed_in', 'type', 'duration', 'rating']]
df.dropna(subset=['description', 'listed_in'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,title,description,listed_in,type,duration,rating
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",Documentaries,Movie,90 min,PG-13
1,Blood & Water,"After crossing paths at a party, a Cape Town t...","International TV Shows, TV Dramas, TV Mysteries",TV Show,2 Seasons,TV-MA
2,Ganglands,To protect his family from a powerful drug lor...,"Crime TV Shows, International TV Shows, TV Act...",TV Show,1 Season,TV-MA
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...","Docuseries, Reality TV",TV Show,1 Season,TV-MA
4,Kota Factory,In a city of coaching centers known to train I...,"International TV Shows, Romantic TV Shows, TV ...",TV Show,2 Seasons,TV-MA


In [4]:
# Step 2: Combine Text Features

In [5]:
# Create a new column for text-based features
df['combined_features'] = df['listed_in'].fillna('') + ' ' + df['description'].fillna('')

In [6]:
# Step 3: Convert Text to Vectors (TF-IDF)(TF-IDF Vectorization)

In [7]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the combined features
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [8]:
# Step 4: Cosine Similarity Calculation

In [9]:
# from sklearn.metrics.pairwise import linear_kernel

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
# Step 5: Build Reverse Index

In [11]:
# Mapping title to index for lookup
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [12]:
# Step 5: Create the Recommendation Function

In [13]:
def recommend(title, num_recommendations=5):
    if title not in indices:
        return "Title not found in dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations + 1]
    show_indices = [i[0] for i in sim_scores]

    return df[['title', 'listed_in', 'description']].iloc[show_indices]

In [14]:
# Step 6: Test Your Recommender

In [15]:
recommend("Blood & Water", num_recommendations=5)

Unnamed: 0,title,listed_in,description
108,Dive Club,"Kids' TV, TV Dramas, Teen TV Shows","On the shores of Cape Mercy, a skillful group ..."
4271,Lion Pride,"International TV Shows, Romantic TV Shows, TV ...","After crossing paths at a crime scene, a renow..."
4022,More to Say,"International TV Shows, TV Dramas, TV Mysteries","After her son dies in an accident, a legal con..."
222,Clickbait,"Crime TV Shows, TV Dramas, TV Mysteries",When family man Nick Brewer is abducted in a c...
4487,Accidentally in Love,"International TV Shows, Romantic TV Shows, TV ...","Rejecting the demands of her wealthy family, a..."


# PHASE 2: Simulate Collaborative Filtering (To build Hybrid)

In [16]:
# Step 7: Simulate Ratings Data 

# Assume df is your existing Netflix metadata DataFrame
titles = df['title'].dropna().unique().tolist()
users = [f'user_{i}' for i in range(1, 11)]  # 10 fake users

# Generate 20 ratings per user
simulated_ratings = []
for user in users:
    rated_titles = random.sample(titles, 20)
    for title in rated_titles:
        rating = random.randint(1, 5)
        simulated_ratings.append([user, title, rating])

ratings_df = pd.DataFrame(simulated_ratings, columns=['user_id', 'title', 'rating'])

In [17]:
ratings_df.head()

Unnamed: 0,user_id,title,rating
0,user_1,Life 2.0,2
1,user_1,P. King Duckling,2
2,user_1,Sherlock,2
3,user_1,Mighty Raju,2
4,user_1,Paathi,2


In [18]:
# Step 8: Collaborative Filtering using Surprise

# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'title', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24bdcf84880>

In [19]:
# Step 9: Final Hybrid Recommender

def recommend_hybrid(title, user_id, n=5, alpha=0.5):
    if title not in indices or user_id not in ratings_df['user_id'].unique():
        return "Invalid title or user ID."

    idx = indices[title]
    content_scores = list(enumerate(cosine_sim[idx]))
    content_scores = sorted(content_scores, key=lambda x: x[1], reverse=True)
    content_scores = [x for x in content_scores if df.iloc[x[0]]['title'] != title]

    hybrid_scores = []
    for i, sim_score in content_scores[:50]:
        candidate_title = df.iloc[i]['title']
        try:
            collab_pred = model.predict(user_id, candidate_title).est
        except:
            collab_pred = 0
        final_score = alpha * sim_score + (1 - alpha) * (collab_pred / 5)  # Normalize
        hybrid_scores.append((candidate_title, final_score))

    top_hybrid = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:n]
    top_titles = [title for title, _ in top_hybrid]
    return df[df['title'].isin(top_titles)][['title', 'type', 'rating', 'listed_in', 'description']]

In [20]:
recommend_hybrid("Narcos", "user_1", n=5)

Unnamed: 0,title,type,rating,listed_in,description
2,Ganglands,TV Show,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
2921,Narcos: Mexico,TV Show,TV-MA,"Crime TV Shows, TV Action & Adventure, TV Dramas",Witness the birth of the Mexican drug war in t...
4750,El Chapo,TV Show,TV-MA,"Crime TV Shows, Spanish-Language TV Shows, TV ...",This drama series chronicles the true story of...
6673,El Cartel 2,TV Show,TV-MA,"Crime TV Shows, International TV Shows, Spanis...",Drug trafficker Pepe Cadena navigates the trea...
7463,Miss Dynamite,TV Show,TV-14,"Crime TV Shows, International TV Shows, Spanis...","Wealthy, beautiful Valentina falls in love, on..."
