In [38]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import pickle
from tqdm import tqdm
from collections import defaultdict
from surprise import Dataset, Reader, SVD, KNNBasic, NMF
from surprise.model_selection import cross_validate
from surprise.accuracy import rmse, mae
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [39]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (6).json


{'kaggle (6).json': b'{"username":"saminrazeghi","key":"d35cdf9ab3da4c583007f59bf86b7e49"}'}

In [40]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [41]:
netflix_dir = "/content/netflix_data"
required_file = os.path.join(netflix_dir, "combined_data_1.txt")  # just check the first file

# Check if data is already downloaded
if not os.path.exists(required_file):
    print("Netflix dataset not found. Downloading from Kaggle...")
    os.makedirs(netflix_dir, exist_ok=True)
    !kaggle datasets download -d netflix-inc/netflix-prize-data -p /content/netflix_data --unzip
    print("Download and extraction complete.")
else:
    print("Netflix dataset already exists. Skipping download.")


Netflix dataset already exists. Skipping download.


In [42]:
print("Step 1: Loading the Netflix dataset...")
# Function to load ratings data into a DataFrame
def load_ratings_to_dataframe(file_path, max_rows=100000):
    """Load Netflix ratings data into a pandas DataFrame"""
    print(f"Loading ratings from {file_path}...")

    # Parse ratings data
    data = []
    current_movie = None
    count = 0

    with open(file_path, 'r') as f:
        for line in tqdm(f, desc="Loading ratings"):
            line = line.strip()

            if line.endswith(':'):
                current_movie = int(line[:-1])
            else:
                user_id, rating, date = line.split(',')
                data.append([current_movie, int(user_id), int(rating), date])

                count += 1
                if count >= max_rows:
                    break

    # Create DataFrame
    ratings_df = pd.DataFrame(data, columns=['movie_id', 'user_id', 'rating', 'date'])

    print(f"\nLoaded {len(ratings_df)} ratings from {ratings_df['movie_id'].nunique()} movies")
    print(ratings_df.head())

    return ratings_df

Step 1: Loading the Netflix dataset...


In [43]:
# Function to load movie titles into a DataFrame
def load_movie_titles_to_dataframe(file_path):
    """Load Netflix movie titles into a pandas DataFrame"""
    print(f"Loading movie titles from {file_path}...")

    # Read file line by line
    data = []
    with open(file_path, 'r', encoding='latin1') as f:
        for line in tqdm(f, desc="Loading titles"):
            parts = line.strip().split(',')

            if len(parts) >= 1:
                movie_id = int(parts[0])

                # Extract year if available
                year = None
                if len(parts) >= 2 and parts[1].strip():
                    try:
                        year = int(parts[1])
                    except:
                        year = None

                # Extract title if available
                title = ""
                if len(parts) >= 3:
                    title = ','.join(parts[2:])

                data.append([movie_id, year, title])

    # Create DataFrame
    titles_df = pd.DataFrame(data, columns=['movie_id', 'year', 'title'])

    print(f"\nLoaded {len(titles_df)} movie titles")
    print(titles_df.head())

    return titles_df

In [44]:
# Load the data
dataset_path = "/content/netflix_data"
ratings_file = os.path.join(dataset_path, 'combined_data_1.txt')
titles_file = os.path.join(dataset_path, 'movie_titles.csv')

In [45]:
# Check if files exist and load them
if os.path.exists(ratings_file):
    ratings_df = load_ratings_to_dataframe(ratings_file)
else:
    print(f"Ratings file not found at {ratings_file}")
    ratings_df = None

if os.path.exists(titles_file):
    titles_df = load_movie_titles_to_dataframe(titles_file)
else:
    print(f"Titles file not found at {titles_file}")
    titles_df = None

Loading ratings from /content/netflix_data/combined_data_1.txt...


Loading ratings: 100029it [00:00, 292623.21it/s]



Loaded 100000 ratings from 30 movies
   movie_id  user_id  rating        date
0         1  1488844       3  2005-09-06
1         1   822109       5  2005-05-13
2         1   885013       4  2005-10-19
3         1    30878       4  2005-12-26
4         1   823519       3  2004-05-03
Loading movie titles from /content/netflix_data/movie_titles.csv...


Loading titles: 17770it [00:00, 643528.11it/s]



Loaded 17770 movie titles
   movie_id    year                         title
0         1  2003.0               Dinosaur Planet
1         2  2004.0    Isle of Man TT 2004 Review
2         3  1997.0                     Character
3         4  1994.0  Paula Abdul's Get Up & Dance
4         5  2004.0      The Rise and Fall of ECW


In [46]:
!ls -lh /content/netflix_data


total 2.0G
drwxr-xr-x 4 root root 4.0K Apr 22 18:51 app
-rw-r--r-- 1 root root 473M Apr 22 18:26 combined_data_1.txt
-rw-r--r-- 1 root root 530M Apr 22 18:26 combined_data_2.txt
-rw-r--r-- 1 root root 444M Apr 22 18:26 combined_data_3.txt
-rw-r--r-- 1 root root 527M Apr 22 18:27 combined_data_4.txt
drwxr-xr-x 2 root root 4.0K Apr 22 18:14 data
drwxr-xr-x 2 root root 4.0K Apr 22 18:17 figures
drwxr-xr-x 2 root root 4.0K Apr 22 18:27 models
-rw-r--r-- 1 root root 565K Apr 22 18:27 movie_titles.csv
-rw-r--r-- 1 root root 2.3M Apr 22 18:14 netflix_sample.csv
-rw-r--r-- 1 root root  11M Apr 22 18:27 probe.txt
-rw-r--r-- 1 root root  51M Apr 22 18:27 qualifying.txt
-rw-r--r-- 1 root root 5.8K Apr 22 18:26 README
-rw-r--r-- 1 root root  900 Apr 22 19:14 sample_recommendations.txt


In [47]:
print("\nStep 2: Creating a sample dataset for analysis...")
# Prepare a sample dataset for the recommendation system
sample_path = os.path.join(dataset_path, 'netflix_sample.csv')

# Create sample ratings file if it doesn't exist
if not os.path.exists(sample_path) and ratings_df is not None:
    print(f"Creating sample ratings file at {sample_path}")

    # Convert date to datetime format
    ratings_df['date'] = pd.to_datetime(ratings_df['date'])

    # Save to CSV
    ratings_df.to_csv(sample_path, index=False)
    print(f"Saved sample ratings to {sample_path}")
else:
    print(f"Sample ratings file already exists at {sample_path}")
    # Load the sample if it exists
    ratings_df = pd.read_csv(sample_path)
    ratings_df['date'] = pd.to_datetime(ratings_df['date'])


Step 2: Creating a sample dataset for analysis...
Sample ratings file already exists at /content/netflix_data/netflix_sample.csv


In [48]:
# Create necessary directories
models_dir = os.path.join(dataset_path, 'models')
data_dir = os.path.join(dataset_path, 'data')
app_dir = os.path.join(dataset_path, 'app')
app_data_dir = os.path.join(app_dir, 'data')
app_models_dir = os.path.join(app_dir, 'models')
fig_dir = os.path.join(dataset_path, 'figures')

for directory in [models_dir, data_dir, app_dir, app_data_dir, app_models_dir, fig_dir]:
    os.makedirs(directory, exist_ok=True)

# Copy sample to app directory
if not os.path.exists(os.path.join(app_data_dir, 'netflix_sample.csv')):
    shutil.copy(sample_path, os.path.join(app_data_dir, 'netflix_sample.csv'))
    print(f"Copied sample data to {os.path.join(app_data_dir, 'netflix_sample.csv')}")

# Copy titles to app directory
if titles_df is not None and not os.path.exists(os.path.join(app_data_dir, 'movie_titles.csv')):
    titles_df.to_csv(os.path.join(app_data_dir, 'movie_titles.csv'), index=False)
    print(f"Saved movie titles to {os.path.join(app_data_dir, 'movie_titles.csv')}")

print("Sample dataset preparation complete.")

Sample dataset preparation complete.


In [49]:
print("\nStep 3: Preprocessing the data...")
# Preprocess the Netflix dataset
def preprocess_netflix_data(ratings_df, titles_df):
    """
    Preprocess the Netflix dataset for modeling.

    Args:
        ratings_df: DataFrame with ratings
        titles_df: DataFrame with movie titles

    Returns:
        Processed DataFrame ready for modeling
    """
    print("Preprocessing data...")

    # Make sure date is datetime
    if not pd.api.types.is_datetime64_dtype(ratings_df['date']):
        ratings_df['date'] = pd.to_datetime(ratings_df['date'])

    # Add temporal features
    ratings_df['year'] = ratings_df['date'].dt.year
    ratings_df['month'] = ratings_df['date'].dt.month
    ratings_df['day'] = ratings_df['date'].dt.day
    ratings_df['day_of_week'] = ratings_df['date'].dt.dayofweek

    # Merge with movie metadata
    processed_df = ratings_df.merge(titles_df, on='movie_id', how='left', suffixes=('', '_movie'))

    # Rename year column from movie titles to avoid conflict
    if 'year_movie' in processed_df.columns:
        processed_df = processed_df.rename(columns={'year_movie': 'release_year'})

    # Create user features
    user_features = ratings_df.groupby('user_id').agg({
        'rating': ['mean', 'std', 'count'],
        'movie_id': 'nunique'
    })

    user_features.columns = ['user_avg_rating', 'user_rating_std', 'user_rating_count', 'user_movie_count']
    user_features = user_features.reset_index()

    # Create movie features
    movie_features = ratings_df.groupby('movie_id').agg({
        'rating': ['mean', 'std', 'count'],
        'user_id': 'nunique'
    })

    movie_features.columns = ['movie_avg_rating', 'movie_rating_std', 'movie_rating_count', 'movie_user_count']
    movie_features = movie_features.reset_index()

    # Merge features back to the main dataframe
    processed_df = processed_df.merge(user_features, on='user_id', how='left')
    processed_df = processed_df.merge(movie_features, on='movie_id', how='left')

    print("Preprocessing complete.")
    print(f"Processed dataframe shape: {processed_df.shape}")
    print(processed_df.head())

    return processed_df


Step 3: Preprocessing the data...


In [50]:
# Process data
if ratings_df is not None and titles_df is not None:
    processed_df = preprocess_netflix_data(ratings_df, titles_df)
else:
    print("Cannot preprocess data - missing ratings or titles data")
    processed_df = None

print("\nStep 4: Performing exploratory data analysis...")

Preprocessing data...
Preprocessing complete.
Processed dataframe shape: (100000, 18)
   movie_id  user_id  rating       date  year  month  day  day_of_week  \
0         1  1488844       3 2005-09-06  2005      9    6            1   
1         1   822109       5 2005-05-13  2005      5   13            4   
2         1   885013       4 2005-10-19  2005     10   19            2   
3         1    30878       4 2005-12-26  2005     12   26            0   
4         1   823519       3 2004-05-03  2004      5    3            0   

   release_year            title  user_avg_rating  user_rating_std  \
0        2003.0  Dinosaur Planet              3.0         0.816497   
1        2003.0  Dinosaur Planet              5.0              NaN   
2        2003.0  Dinosaur Planet              4.5         0.707107   
3        2003.0  Dinosaur Planet              3.0         1.224745   
4        2003.0  Dinosaur Planet              3.4         1.516575   

   user_rating_count  user_movie_count  movie_av

In [51]:
# Explore and visualize the Netflix dataset
def explore_netflix_data(df):
    """
    Perform exploratory data analysis on the Netflix dataset.

    Args:
        df: Preprocessed DataFrame
    """
    print("Performing exploratory data analysis...")

    # Set plot style
    plt.style.use('default')  # Using default style to avoid version issues

    # Basic dataset information
    print("\nDataset Overview:")
    print(f"Total ratings: {len(df)}")
    print(f"Unique users: {df['user_id'].nunique()}")
    print(f"Unique movies: {df['movie_id'].nunique()}")
    print(f"Rating range: {df['rating'].min()} to {df['rating'].max()}")
    if 'date' in df.columns:
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")

    # Rating distribution
    plt.figure(figsize=(10, 6))
    sns.countplot(x='rating', data=df)
    plt.title('Distribution of Ratings')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.savefig(os.path.join(fig_dir, 'rating_distribution.png'))
    plt.close()

    # User activity distribution
    plt.figure(figsize=(10, 6))
    user_activity = df.groupby('user_id')['rating'].count()
    sns.histplot(user_activity, bins=50, kde=True)
    plt.title('Distribution of Ratings per User')
    plt.xlabel('Number of Ratings')
    plt.ylabel('Number of Users')
    plt.xscale('log')
    plt.savefig(os.path.join(fig_dir, 'user_activity.png'))
    plt.close()

    # Movie popularity distribution
    plt.figure(figsize=(10, 6))
    movie_popularity = df.groupby('movie_id')['rating'].count()
    sns.histplot(movie_popularity, bins=50, kde=True)
    plt.title('Distribution of Ratings per Movie')
    plt.xlabel('Number of Ratings')
    plt.ylabel('Number of Movies')
    plt.xscale('log')
    plt.savefig(os.path.join(fig_dir, 'movie_popularity.png'))
    plt.close()

    # Average rating by movie popularity
    plt.figure(figsize=(10, 6))
    movie_stats = df.groupby('movie_id').agg({'rating': ['mean', 'count']})
    movie_stats.columns = ['avg_rating', 'count']
    movie_stats = movie_stats.reset_index()

    plt.scatter(movie_stats['count'], movie_stats['avg_rating'], alpha=0.5)
    plt.title('Average Rating vs. Movie Popularity')
    plt.xlabel('Number of Ratings')
    plt.ylabel('Average Rating')
    plt.xscale('log')
    plt.savefig(os.path.join(fig_dir, 'rating_vs_popularity.png'))
    plt.close()

    print("EDA complete. Figures saved to:", fig_dir)

In [52]:
# Explore the data
if processed_df is not None:
    explore_netflix_data(processed_df)
else:
    print("Cannot perform EDA - processed data is not available")

print("\nStep 5: Creating train/test split...")

Performing exploratory data analysis...

Dataset Overview:
Total ratings: 100000
Unique users: 81490
Unique movies: 30
Rating range: 1 to 5
Date range: 2000-01-06 00:00:00 to 2005-12-31 00:00:00
EDA complete. Figures saved to: /content/netflix_data/figures

Step 5: Creating train/test split...


In [53]:
# Create train/test split
def create_train_test_split(df, method='time', test_size=0.2):
    """
    Split the dataset into training and testing sets.

    Args:
        df: Preprocessed DataFrame
        method: 'random' or 'time' based splitting
        test_size: Proportion of data for testing

    Returns:
        train_df, test_df: Training and testing DataFrames
    """
    print(f"Creating {method}-based train/test split with test_size={test_size}...")

    if method == 'time' and 'date' in df.columns:
        # Sort by timestamp
        df = df.sort_values('date')

        # Use oldest data for training, newest for testing
        train_size = int((1 - test_size) * len(df))
        train_df = df.iloc[:train_size]
        test_df = df.iloc[train_size:]
    else:
        # Random split
        train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

    print(f"Train set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    return train_df, test_df

In [54]:
# Split the data
if processed_df is not None:
    train_df, test_df = create_train_test_split(processed_df, method='time')
else:
    print("Cannot create train/test split - processed data is not available")
    train_df, test_df = None, None

Creating time-based train/test split with test_size=0.2...
Train set: 80000 samples
Test set: 20000 samples


In [55]:
def save_and_copy_model(model_obj, name, models_dir, app_models_dir):
    # Save to models_dir
    model_path = os.path.join(models_dir, f"{name}_model.pkl")
    with open(model_path, 'wb') as f:
        pickle.dump(model_obj, f)
    print(f"{name} model saved to {model_path}")

    # Copy to app_models_dir
    os.makedirs(app_models_dir, exist_ok=True)
    app_model_path = os.path.join(app_models_dir, f"{name}_model.pkl")
    shutil.copy(model_path, app_model_path)
    print(f"Copied to app directory at {app_model_path}")

In [56]:
print("\nStep 6: Building recommendation models...")
# Build recommendation models
def build_recommendation_models(train_df, test_df):
    """
    Build and evaluate multiple recommendation models.

    Args:
        train_df: Training DataFrame
        test_df: Testing DataFrame

    Returns:
        Dictionary of trained models and their performance metrics
    """
    print("Building recommendation models...")

    # Convert dataframes to Surprise format
    reader = Reader(rating_scale=(1, 5))

    train_data = Dataset.load_from_df(train_df[['user_id', 'movie_id', 'rating']], reader)
    train_set = train_data.build_full_trainset()

    # Prepare test data
    test_set = [(row.user_id, row.movie_id, row.rating)
                for _, row in test_df[['user_id', 'movie_id', 'rating']].iterrows()]

    # Initialize models
    models = {
        'SVD': SVD(n_factors=50, n_epochs=10, lr_all=0.005, reg_all=0.02),
        'KNN': KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False}),
        'NMF': NMF(n_factors=50, n_epochs=20)
    }

    # Train and evaluate each model
    results = {}

    for name, model in models.items():
        print(f"\nTraining {name} model...")
        model.fit(train_set)

        # Make predictions on test set
        print(f"Evaluating {name} model...")
        predictions = model.test(test_set)

        # Calculate RMSE
        rmse_score = rmse(predictions)
        mae_score = mae(predictions)

        results[name] = {
            'model': model,
            'rmse': rmse_score,
            'mae': mae_score,
            'predictions': predictions
        }

        print(f"{name} Results: RMSE = {rmse_score:.4f}, MAE = {mae_score:.4f}")

    # Train a simple baseline model (global mean)
    global_mean = train_df['rating'].mean()
    baseline_rmse = np.sqrt(np.mean((test_df['rating'] - global_mean) ** 2))
    baseline_mae = np.mean(np.abs(test_df['rating'] - global_mean))

    results['Baseline'] = {
        'model': None,
        'rmse': baseline_rmse,
        'mae': baseline_mae,
        'predictions': None
    }

    print(f"\nBaseline Results: RMSE = {baseline_rmse:.4f}, MAE = {baseline_mae:.4f}")

    for name, model in models.items():
        print(f"\nTraining {name} model...")
        model.fit(train_set)

        print(f"Evaluating {name} model...")
        predictions = model.test(test_set)

        rmse_score = rmse(predictions)
        mae_score = mae(predictions)

        results[name] = {
            'model': model,
            'rmse': rmse_score,
            'mae': mae_score,
            'predictions': predictions
        }

        print(f"{name} Results: RMSE = {rmse_score:.4f}, MAE = {mae_score:.4f}")

        # Save and copy model to both locations
        save_and_copy_model(model, name.lower(), models_dir, app_models_dir)



    return results


Step 6: Building recommendation models...


In [57]:
# Build recommendation models
if train_df is not None and test_df is not None:
    model_results = build_recommendation_models(train_df, test_df)
else:
    print("Cannot build models - train/test data is not available")
    model_results = {}

Building recommendation models...

Training SVD model...
Evaluating SVD model...
RMSE: 1.0665
MAE:  0.8359
SVD Results: RMSE = 1.0665, MAE = 0.8359

Training KNN model...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating KNN model...
RMSE: 1.1896
MAE:  0.9735
KNN Results: RMSE = 1.1896, MAE = 0.9735

Training NMF model...
Evaluating NMF model...
RMSE: 1.2651
MAE:  1.0339
NMF Results: RMSE = 1.2651, MAE = 1.0339

Baseline Results: RMSE = 1.1451, MAE = 0.9578

Training SVD model...
Evaluating SVD model...
RMSE: 1.0673
MAE:  0.8364
SVD Results: RMSE = 1.0673, MAE = 0.8364
svd model saved to /content/netflix_data/models/svd_model.pkl
Copied to app directory at /content/netflix_data/app/models/svd_model.pkl

Training KNN model...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating KNN model...
RMSE: 1.1896
MAE:  0.9735
KNN Results: RMSE = 

In [58]:
# First, define all classes at the module level
class ContentBasedRecommender:
    def __init__(self, similarity_matrix, indices, movie_data):
        self.similarity_matrix = similarity_matrix
        self.indices = indices
        self.movie_data = movie_data

    def get_recommendations(self, movie_id, top_n=10):
        """Get content-based recommendations for a movie"""
        # Get the index of the movie
        if movie_id not in self.indices:
            return []

        idx = self.indices[movie_id]

        # Get similarity scores
        sim_scores = list(enumerate(self.similarity_matrix[idx]))

        # Sort movies based on similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get top N most similar movies
        sim_scores = sim_scores[1:top_n+1]

        # Get movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return movie IDs and similarity scores
        return [(self.movie_data.iloc[i]['movie_id'], sim_scores[j][1])
                for j, i in enumerate(movie_indices)]

class HybridRecommender:
    def __init__(self, cf_model, content_recommender, cf_weight=0.7):
        self.cf_model = cf_model
        self.content_recommender = content_recommender
        self.cf_weight = cf_weight

    def predict(self, user_id, movie_id):
        """Predict rating for a user-movie pair using hybrid approach"""
        try:
            # Get collaborative filtering prediction
            cf_pred = self.cf_model.predict(user_id, movie_id).est
            # Normalize to 0-1 scale
            cf_pred_norm = (cf_pred - 1) / 4
            # Default content score
            content_score = 0.5
            # Return weighted prediction
            return self.cf_weight * cf_pred_norm + (1 - self.cf_weight) * content_score
        except:
            # Return default score if prediction fails
            return 0.5

# Now use the globally defined class in your building function
def build_content_based_model(titles_df):
    """
    Build a content-based recommendation model using movie metadata.

    Args:
        titles_df: Movie titles DataFrame

    Returns:
        Content-based model dictionary
    """
    print("Building content-based filtering model...")

    # Create a text representation of movie features
    titles_df['text_features'] = titles_df['title'].fillna('')

    if 'year' in titles_df.columns:
        # Convert year to string and combine with title
        titles_df['text_features'] = titles_df['text_features'] + ' ' + titles_df['year'].fillna('').astype(str)

    # Create TF-IDF vectors
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(titles_df['text_features'])

    # Compute similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Create a mapping from movie IDs to indices
    indices = pd.Series(titles_df.index, index=titles_df['movie_id']).drop_duplicates()

    # Create the recommender object
    recommender = ContentBasedRecommender(cosine_sim, indices, titles_df)

    print("Content-based model built successfully.")

    return {
        'model': recommender,
        'similarity_matrix': cosine_sim,
        'indices': indices,
        'tfidf': tfidf
    }

# Build content-based model
if titles_df is not None:
    content_model = build_content_based_model(titles_df)
else:
    print("Cannot build content-based model - movie titles data is not available")
    content_model = None

print("\nStep 8: Building hybrid recommendation system...")

# Build hybrid model
if 'SVD' in model_results and content_model is not None:
    try:
        print("Building hybrid recommendation model...")
        # Extract the SVD model
        svd_model = model_results['SVD']['model']

        # Extract the content-based model
        content_recommender = content_model['model']

        # Create the hybrid recommender instance
        hybrid_recommender = HybridRecommender(svd_model, content_recommender)

        # Create model for pickling
        pickle_model = {
            'svd_model': svd_model,
            'content_model': content_recommender,
            'cf_weight': 0.7
        }

        # Save the model
        os.makedirs(models_dir, exist_ok=True)
        model_path = os.path.join(models_dir, 'hybrid_model.pkl')
        with open(model_path, 'wb') as f:
            pickle.dump(pickle_model, f)

        # Copy to app directory
        os.makedirs(app_models_dir, exist_ok=True)
        app_model_path = os.path.join(app_models_dir, 'hybrid_model.pkl')
        shutil.copy(model_path, app_model_path)

        print(f"Hybrid model saved to {model_path}")
        print(f"Copied to app directory at {app_model_path}")

        # Store full hybrid model for use in the current session
        hybrid_model = {
            'svd_model': svd_model,
            'content_model': content_recommender,
            'hybrid_recommender': hybrid_recommender,
            'cf_weight': 0.7
        }

    except Exception as e:
        print(f"Error building hybrid model: {str(e)}")
        import traceback
        traceback.print_exc()  # Print the full stack trace
        hybrid_model = None
else:
    print("Cannot build hybrid model - missing SVD model or content model")
    hybrid_model = None

Building content-based filtering model...
Content-based model built successfully.

Step 8: Building hybrid recommendation system...
Building hybrid recommendation model...
Hybrid model saved to /content/netflix_data/models/hybrid_model.pkl
Copied to app directory at /content/netflix_data/app/models/hybrid_model.pkl


In [59]:
print("\nStep 9: Creating evaluation metrics visualization...")
# Visualize model results
def visualize_model_results(model_results):
    """
    Visualize the performance of different recommendation models.
    """
    print("Visualizing model results...")

    # Create figures directory if it doesn't exist
    fig_dir = os.path.join(dataset_path, 'figures')
    os.makedirs(fig_dir, exist_ok=True)

    # RMSE comparison
    plt.figure(figsize=(12, 6))
    rmse_values = {name: result['rmse'] for name, result in model_results.items() if 'rmse' in result}
    if rmse_values:
        plt.bar(rmse_values.keys(), rmse_values.values())
        plt.title('RMSE Comparison of Different Models')
        plt.ylabel('RMSE (lower is better)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, 'rmse_comparison.png'))
        plt.close()
        print(f"RMSE comparison chart saved to {os.path.join(fig_dir, 'rmse_comparison.png')}")

    # MAE comparison
    plt.figure(figsize=(12, 6))
    mae_values = {name: result['mae'] for name, result in model_results.items() if 'mae' in result}
    if mae_values:
        plt.bar(mae_values.keys(), mae_values.values())
        plt.title('MAE Comparison of Different Models')
        plt.ylabel('MAE (lower is better)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, 'mae_comparison.png'))
        plt.close()
        print(f"MAE comparison chart saved to {os.path.join(fig_dir, 'mae_comparison.png')}")




Step 9: Creating evaluation metrics visualization...


In [60]:
# Visualize model results
if model_results:
    visualize_model_results(model_results)
else:
    print("Cannot visualize results - no model results available")

Visualizing model results...
RMSE comparison chart saved to /content/netflix_data/figures/rmse_comparison.png
MAE comparison chart saved to /content/netflix_data/figures/mae_comparison.png


In [61]:
# Create sample recommendations
def create_sample_recommendations(model_results, processed_df, titles_df):
    """
    Create sample recommendations for demo purposes.
    """
    if not model_results or processed_df is None or titles_df is None:
        print("Cannot create sample recommendations - missing required data")
        return

    print("Creating sample recommendations...")

    # Choose a random user
    sample_user_id = processed_df['user_id'].sample(1).iloc[0]

    # Get movies rated by this user
    user_movies = processed_df[processed_df['user_id'] == sample_user_id]
    print(f"Sample recommendations for user {sample_user_id}")
    print(f"User has rated {len(user_movies)} movies")

    # Show user's top-rated movies
    top_rated = user_movies.sort_values('rating', ascending=False).head(5)
    print("\nUser's top-rated movies:")
    for _, row in top_rated.iterrows():
        movie_info = titles_df[titles_df['movie_id'] == row['movie_id']]
        if not movie_info.empty:
            movie = movie_info.iloc[0]
            print(f"  - {movie['title']} ({movie.get('year', 'N/A')}): {row['rating']}/5")

    # Get recommendations from each model
    print("\nRecommendations from different models:")

    for model_name, result in model_results.items():
        if model_name == 'Baseline' or 'model' not in result or result['model'] is None:
            continue

        print(f"\n{model_name} Recommendations:")

        # Get all movies not rated by this user
        rated_movies = set(user_movies['movie_id'])
        all_movies = set(processed_df['movie_id'].unique())
        unrated_movies = list(all_movies - rated_movies)

        # Limit to 100 random unrated movies for efficiency
        if len(unrated_movies) > 100:
            unrated_movies = np.random.choice(unrated_movies, 100, replace=False)

        # Make predictions for unrated movies
        predictions = []
        model = result['model']

        for movie_id in unrated_movies:
            try:
                pred = model.predict(sample_user_id, movie_id)
                predictions.append((movie_id, pred.est))
            except:
                continue

        # Sort by predicted rating and get top 5
        predictions.sort(key=lambda x: x[1], reverse=True)
        top_recs = predictions[:5]

        # Display recommendations
        for movie_id, score in top_recs:
            movie_info = titles_df[titles_df['movie_id'] == movie_id]
            if not movie_info.empty:
                movie = movie_info.iloc[0]
                print(f"  - {movie['title']} ({movie.get('year', 'N/A')}): {score:.2f}/5")

    # Save sample recommendations to file
    recommendations_file = os.path.join(dataset_path, 'sample_recommendations.txt')
    with open(recommendations_file, 'w') as f:
        f.write(f"Sample recommendations for user {sample_user_id}\n")
        f.write(f"User has rated {len(user_movies)} movies\n\n")

        f.write("User's top-rated movies:\n")
        for _, row in top_rated.iterrows():
            movie_info = titles_df[titles_df['movie_id'] == row['movie_id']]
            if not movie_info.empty:
                movie = movie_info.iloc[0]
                f.write(f"  - {movie['title']} ({movie.get('year', 'N/A')}): {row['rating']}/5\n")

        for model_name, result in model_results.items():
            if model_name == 'Baseline' or 'model' not in result or result['model'] is None:
                continue

            f.write(f"\n{model_name} Recommendations:\n")

            for movie_id, score in top_recs:
                movie_info = titles_df[titles_df['movie_id'] == movie_id]
                if not movie_info.empty:
                    movie = movie_info.iloc[0]
                    f.write(f"  - {movie['title']} ({movie.get('year', 'N/A')}): {score:.2f}/5\n")

    print(f"\nSample recommendations saved to {recommendations_file}")

In [62]:
# Create sample recommendations
if model_results and processed_df is not None and titles_df is not None:
    create_sample_recommendations(model_results, processed_df, titles_df)
else:
    print("Cannot create sample recommendations - missing required data")

Creating sample recommendations...
Sample recommendations for user 449866
User has rated 1 movies

User's top-rated movies:
  - Immortal Beloved (1994.0): 3/5

Recommendations from different models:

SVD Recommendations:
  - Lord of the Rings: The Return of the King: Extended Edition: Bonus Material (2003.0): 4.39/5
  - Inspector Morse 31: Death Is Now My Neighbour (1997.0): 4.03/5
  - The Rise and Fall of ECW (2004.0): 4.00/5
  - Lilo and Stitch (2002.0): 3.84/5
  - Isle of Man TT 2004 Review (2004.0): 3.82/5

KNN Recommendations:
  - Paula Abdul's Get Up & Dance (1994.0): 3.55/5
  - Class of Nuke 'Em High 2 (1991.0): 3.55/5
  - Chump Change (2000.0): 3.55/5
  - Dinosaur Planet (2003.0): 3.00/5
  - Isle of Man TT 2004 Review (2004.0): 3.00/5

NMF Recommendations:
  - Lord of the Rings: The Return of the King: Extended Edition: Bonus Material (2003.0): 4.73/5
  - Inspector Morse 31: Death Is Now My Neighbour (1997.0): 4.71/5
  - The Rise and Fall of ECW (2004.0): 4.39/5
  - Something's

In [63]:
print("\nStep 11: Creating Streamlit demo application...")
# Create Streamlit demo application
# Update the create_streamlit_demo function to work with the modified models
def create_streamlit_demo():
    """
    Create a Streamlit demo application script.
    """
    demo_path = os.path.join(dataset_path, 'app', 'app.py')
    os.makedirs(os.path.dirname(demo_path), exist_ok=True)

    with open(demo_path, 'w') as f:
        f.write('''

# First, define all classes at the module level
class ContentBasedRecommender:
    def __init__(self, similarity_matrix, indices, movie_data):
        self.similarity_matrix = similarity_matrix
        self.indices = indices
        self.movie_data = movie_data

    def get_recommendations(self, movie_id, top_n=10):
        """Get content-based recommendations for a movie"""
        # Get the index of the movie
        if movie_id not in self.indices:
            return []

        idx = self.indices[movie_id]

        # Get similarity scores
        sim_scores = list(enumerate(self.similarity_matrix[idx]))

        # Sort movies based on similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get top N most similar movies
        sim_scores = sim_scores[1:top_n+1]

        # Get movie indices
        movie_indices = [i[0] for i in sim_scores]

        # Return movie IDs and similarity scores
        return [(self.movie_data.iloc[i]['movie_id'], sim_scores[j][1])
                for j, i in enumerate(movie_indices)]

class HybridRecommender:
    def __init__(self, cf_model, content_recommender, cf_weight=0.7):
        self.cf_model = cf_model
        self.content_recommender = content_recommender
        self.cf_weight = cf_weight

    def predict(self, user_id, movie_id):
        """Predict rating for a user-movie pair using hybrid approach"""
        try:
            # Get collaborative filtering prediction
            cf_pred = self.cf_model.predict(user_id, movie_id).est
            # Normalize to 0-1 scale
            cf_pred_norm = (cf_pred - 1) / 4
            # Default content score
            content_score = 0.5
            # Return weighted prediction
            return self.cf_weight * cf_pred_norm + (1 - self.cf_weight) * content_score
        except:
            # Return default score if prediction fails
            return 0.5


import streamlit as st
import pandas as pd
import numpy as np
import dill as pickle
import os



# Set page title and configuration
st.set_page_config(
    page_title="Netflix Recommendation System",
    page_icon="🎬",
    layout="wide"
)

# Title and introduction
st.title("🎬 Netflix Recommendation System")
st.markdown("""
This application demonstrates a hybrid recommendation system built using the Netflix Prize dataset.
You can select a user and see personalized movie recommendations based on different algorithms.
""")




def load_model_and_data():
    # Load model from relative path
    model_path = './app/models/hybrid_model.pkl'

    if not os.path.exists(model_path):
        st.error("Model file not found. Please ensure the model has been trained and saved correctly.")
        return None, None, None

    # Load model
    with open(model_path, 'rb') as f:
        st.write("📦 Loading model...")
        model_data = pickle.load(f)

    # Load sample ratings data
    ratings_df = pd.read_csv('./data/netflix_sample.csv')

    # Load movie titles
    titles_df = pd.read_csv('./data/movie_titles.csv',
                           header=None,
                           names=['movie_id', 'year', 'title'],
                           encoding='latin1')

    return model_data, ratings_df, titles_df




# Load data
model_data, ratings_df, titles_df = load_model_and_data()

# Check if data loaded successfully
if model_data is None or ratings_df is None or titles_df is None:
    st.warning("Please upload the required data files or check file paths.")
    st.stop()

# Create sidebar
st.sidebar.header("Settings")

# User selection
sample_users = ratings_df['user_id'].value_counts().head(100).index.tolist()
selected_user = st.sidebar.selectbox("Select a user ID", sample_users)

# Algorithm selection
algorithm = st.sidebar.radio(
    "Select recommendation algorithm",
    ["Collaborative Filtering (SVD)", "Content-Based", "Hybrid"]
)

# Number of recommendations
num_recs = st.sidebar.slider("Number of recommendations", 5, 20, 10)

# Function to get SVD recommendations
def get_svd_recommendations(user_id, n=10):
    # Get the SVD model
    svd_model = model_data['svd_model']

    # Get movies already rated by this user
    rated_movies = set(ratings_df[ratings_df['user_id'] == user_id]['movie_id'])

    # Get all movies
    all_movies = set(ratings_df['movie_id'].unique())

    # Movies to predict
    movies_to_predict = list(all_movies - rated_movies)

    # Predict ratings
    predictions = []
    for movie_id in movies_to_predict:
        try:
            pred = svd_model.predict(user_id, movie_id)
            predictions.append((movie_id, pred.est))
        except:
            continue

    # Sort by predicted rating and get top n
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:n]

# Function to get content-based recommendations
def get_content_recommendations(user_id, n=10):
    # Get the content model
    content_recommender = model_data['content_model']['model']

    # Get user's top-rated movies
    user_ratings = ratings_df[ratings_df['user_id'] == user_id].sort_values('rating', ascending=False)

    # Get already rated movies
    rated_movies = set(user_ratings['movie_id'])

    # Get recommendations based on top movies
    all_recs = []
    for _, row in user_ratings.head(3).iterrows():
        movie_id = row['movie_id']
        recs = content_recommender.get_recommendations(movie_id)
        all_recs.extend(recs)

    # Remove already rated movies and sort by similarity
    filtered_recs = [(movie_id, score) for movie_id, score in all_recs if movie_id not in rated_movies]

    # Remove duplicates and sort
    unique_recs = {}
    for movie_id, score in filtered_recs:
        if movie_id not in unique_recs:
            unique_recs[movie_id] = score
        else:
            unique_recs[movie_id] = max(unique_recs[movie_id], score)

    sorted_recs = sorted(unique_recs.items(), key=lambda x: x[1], reverse=True)
    return sorted_recs[:n]

# Function to get hybrid recommendations
def get_hybrid_recommendations(user_id, n=10):
    # Get collaborative filtering recommendations
    cf_recs = get_svd_recommendations(user_id, n=n*2)

    # Get content-based recommendations
    content_recs = get_content_recommendations(user_id, n=n*2)

    # Get the hybrid recommender if available
    if 'hybrid_recommender' in model_data:
        hybrid_recommender = model_data['hybrid_recommender']
        cf_weight = model_data.get('cf_weight', 0.7)
    else:
        # Create a simple hybrid approach if no saved hybrid recommender
        cf_weight = 0.7

    # Combine recommendations
    cf_dict = dict(cf_recs)
    content_dict = dict(content_recs)

    # Normalize CF scores (they're on a 1-5 scale)
    cf_dict = {k: (v-1)/4 for k, v in cf_dict.items()}

    # Get all movie IDs
    all_movies = set(list(cf_dict.keys()) + list(content_dict.keys()))

    # Compute hybrid scores
    hybrid_scores = {}
    for movie_id in all_movies:
        cf_score = cf_dict.get(movie_id, 0.5)  # Default to middle score
        content_score = content_dict.get(movie_id, 0.5)  # Default to middle score

        # Weighted average
        hybrid_score = cf_weight * cf_score + (1 - cf_weight) * content_score
        hybrid_scores[movie_id] = hybrid_score

    # Sort by hybrid score
    sorted_hybrid = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_hybrid[:n]

# Function to display recommendations
def display_recommendations(recommendations):
    # Create columns for movie display
    cols = st.columns(5)

    for i, (movie_id, score) in enumerate(recommendations):
        col_idx = i % 5

        # Get movie info
        movie_info = titles_df[titles_df['movie_id'] == movie_id]

        if len(movie_info) > 0:
            movie = movie_info.iloc[0]
            title = movie['title']
            year = movie.get('year', 'N/A')

            # Display in appropriate column
            with cols[col_idx]:
                st.markdown(f"**{title}** ({year})")
                st.progress(min(score, 1.0))  # Normalize score for progress bar

                st.caption(f"Score: {score:.2f}")


# Main section
if st.sidebar.button("Generate Recommendations"):
    # Display user information
    user_ratings = ratings_df[ratings_df['user_id'] == selected_user]

    st.subheader(f"User {selected_user} Profile")

    col1, col2 = st.columns(2)

    with col1:
        st.metric("Total Movies Rated", len(user_ratings))
        st.metric("Average Rating", f"{user_ratings['rating'].mean():.2f}/5")

    with col2:
        # Rating distribution
        rating_counts = user_ratings['rating'].value_counts().sort_index()
        st.bar_chart(rating_counts)

    # Show user's top rated movies
    st.subheader("User's Top-Rated Movies")
    top_movies = user_ratings.sort_values('rating', ascending=False).head(5)

    for _, row in top_movies.iterrows():
        movie_info = titles_df[titles_df['movie_id'] == row['movie_id']]
        if len(movie_info) > 0:
            movie = movie_info.iloc[0]
            st.markdown(f"**{movie['title']}** ({movie.get('year', 'N/A')}) - Rating: {row['rating']}/5")

    # Generate recommendations based on selected algorithm
    st.header(f"Recommendations using {algorithm}")

    if algorithm == "Collaborative Filtering (SVD)":
        recommendations = get_svd_recommendations(selected_user, n=num_recs)
    elif algorithm == "Content-Based":
        recommendations = get_content_recommendations(selected_user, n=num_recs)
    else:  # Hybrid
        recommendations = get_hybrid_recommendations(selected_user, n=num_recs)

    display_recommendations(recommendations)

    # Show explanation
    st.subheader("How it works")

    if algorithm == "Collaborative Filtering (SVD)":
        st.markdown("""
        **Collaborative Filtering** works by finding patterns in how users rate movies.
        It identifies users with similar tastes to provide recommendations.

        The SVD (Singular Value Decomposition) algorithm creates latent factors that represent both users and movies
        in a shared mathematical space, allowing the system to predict how a user would rate a movie they haven't seen yet.
        """)
    elif algorithm == "Content-Based":
        st.markdown("""
        **Content-Based Filtering** recommends movies similar to ones the user has highly rated in the past.

        This approach analyzes movie attributes (like titles, genres, etc.) to find similar movies,
        without relying on ratings from other users.
        """)
    else:  # Hybrid
        st.markdown("""
        **Hybrid Recommendation** combines both collaborative filtering and content-based approaches.

        This method leverages both the patterns in user ratings (collaborative filtering) and the
        similarities between movies (content-based) to provide more robust recommendations.
        """)

else:
    # Show instructions when first loading
    st.info("👈 Select a user and click 'Generate Recommendations' to get personalized movie recommendations.")

    # Show dataset information
    st.subheader("About the Dataset")
    st.markdown("""
    The **Netflix Prize Dataset** contains over 100 million ratings from Netflix customers.
    This demo uses a sample of that dataset to demonstrate different recommendation algorithms.

    The full dataset includes:
    - Ratings from approximately 480,000 users
    - Ratings for around 17,700 movies
    - Ratings on a scale from 1 to 5 stars
    - Dates of when the ratings were made
    """)

# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("**Netflix Recommendation System Demo**")
st.sidebar.markdown("Created as a portfolio project")
''')


    print(f"Streamlit demo application created at {demo_path}")


Step 11: Creating Streamlit demo application...


In [64]:
create_streamlit_demo()

Streamlit demo application created at /content/netflix_data/app/app.py


In [65]:
!ls -l /content/drive/MyDrive/Netflix/netflix_dataset/app


total 16
-rw-r--r-- 1 root root 9970 Apr 22 17:50 app.py
drwxr-xr-x 2 root root 4096 Apr 22 17:31 models


In [66]:
from pyngrok import ngrok
ngrok.set_auth_token("***")

In [67]:
import os
import threading
import time

# Kill any existing tunnels
ngrok.kill()

# Path to your Streamlit app
app_path = "/content/drive/MyDrive/Netflix/netflix_dataset/app/app.py"

# Function to run Streamlit
def run_streamlit():
    os.system(f"cd '{os.path.dirname(app_path)}' && streamlit run app.py")

# Start Streamlit in background thread
thread = threading.Thread(target=run_streamlit)
thread.start()

# Wait for Streamlit to start
time.sleep(5)  # Give Streamlit time to launch

# Open ngrok tunnel to port 8501
public_url = ngrok.connect(addr="8501")
print(f"✅ Streamlit app is live at: {public_url}")


✅ Streamlit app is live at: NgrokTunnel: "https://ef04-34-106-9-163.ngrok-free.app" -> "http://localhost:8501"
