In [None]:

import os
import random
import joblib
import numpy as np
import pandas as pd
import re
from collections import defaultdict
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy import sparse
# NOTE(review): this binds the name `final_data` to the pandas module itself and
# nothing visible in this notebook reads it; it looks like an unfinished
# statement. Left in place in case something out of view depends on the name.
final_data =   pd

# Load the raw table and keep one row per distinct (movie_id, title, genre).
movies = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\MRS_DS&ML\\data\\final_data.csv")
# drop_duplicates() keeps the surviving rows' original labels, which leaves a
# gapped index. Reset it so index labels coincide with row positions: the
# content-based lookup below feeds movies.index values into a position-indexed
# similarity matrix.
movies = (
    movies[['movie_id', 'title', 'genre']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the de-duplicated movie table.
print(movies.head())

   movie_id                       title                                genre
0       242                Kolya (1996)                               Comedy
1       302    L.A. Confidential (1997)  Crime, Film-Noir, Mystery, Thriller
2       377         Heavyweights (1994)                   Children's, Comedy
3        51  Legends of the Fall (1994)         Drama, Romance, War, Western
4       346         Jackie Brown (1997)                         Crime, Drama


In [None]:
movies = movies.fillna('')  # replace NaN with empty string

# Clean up the genre text.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and both patterns here are meant as literal characters.
movies['genre'] = movies['genre'].str.replace(',', ' ', regex=False)  # replace commas with spaces
movies['genre'] = movies['genre'].str.replace('-', ' ', regex=False)  # split hyphenated genres into words


In [None]:
# Normalise the genre strings: commas become spaces, surrounding whitespace is
# trimmed, runs of whitespace collapse to a single space, apostrophes are removed.
movies = movies.fillna('')
movies['genre'] = (
    movies['genre']
    .str.replace(',', ' ', regex=False)
    .map(lambda raw: re.sub(r'\s+', ' ', raw.strip()))
    .str.replace("'", "", regex=False)
)

# Title and cleaned genre joined into the single text field fed to TF-IDF.
movies['content'] = movies['title'] + " " + movies['genre']

In [None]:
# Preview the table again, now with the cleaned genre text and the 'content' column.
print(movies.head())

   movie_id                       title                             genre  \
0       242                Kolya (1996)                            Comedy   
1       302    L.A. Confidential (1997)  Crime Film Noir Mystery Thriller   
2       377         Heavyweights (1994)                  Childrens Comedy   
3        51  Legends of the Fall (1994)         Drama Romance War Western   
4       346         Jackie Brown (1997)                       Crime Drama   

                                             content  
0                                Kolya (1996) Comedy  
1  L.A. Confidential (1997) Crime Film Noir Myste...  
2               Heavyweights (1994) Childrens Comedy  
3  Legends of the Fall (1994) Drama Romance War W...  
4                    Jackie Brown (1997) Crime Drama  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorise the combined title + genre text, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content'])

# Pairwise cosine similarity of every movie against every other movie
# (called with one argument, the matrix is compared against itself).
cosine_sim = cosine_similarity(tfidf_matrix)


In [None]:
# Map each movie_id to its row *position* in `movies`. cosine_sim is indexed by
# position, so the frame's index labels must not be used for the lookup (they
# need not be 0..n-1 after de-duplication). If a movie_id occurs on several
# rows, only its first row is kept so a lookup always yields a single integer.
indices = pd.Series(np.arange(len(movies)), index=movies['movie_id'])
indices = indices[~indices.index.duplicated(keep='first')]


def recommend_by_id(movie_id, n=5):
    """Return the n movies whose content is most similar to `movie_id`.

    Parameters
    ----------
    movie_id : value looked up in the 'movie_id' column of `movies`.
    n : int, default 5
        Maximum number of recommendations; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame with columns movie_id, title, genre, ordered from most to
    least similar; or a str message when `movie_id` is not in the dataset
    (kept for compatibility with existing callers).
    """
    if movie_id not in indices:
        return f"Movie ID {movie_id} not found in dataset."

    # Row position of the query movie inside `movies` / `cosine_sim`.
    idx = int(indices[movie_id])

    # Similarity of the query movie to every movie, by row position.
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Descending order; the stable sort keeps tied movies in row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly rather than assuming it ranks first:
    # another movie with identical content ties with it at the top.
    order = order[order != idx][:max(n, 0)]

    return movies[['movie_id', 'title', 'genre']].iloc[order]


In [None]:
# Demo: content-based neighbours for movie_id 125, using the default n=5.
recommend_by_id(125)


Unnamed: 0,movie_id,title,genre
927,831,Escape from L.A. (1996),Action Adventure Sci Fi Thriller
181,298,Face/Off (1997),Action Sci Fi Thriller
3146,1025,Fire Down Below (1997),Action Drama Thriller
10,257,Men in Black (1997),Action Adventure Comedy Sci Fi
178,164,"Abyss, The (1989)",Action Adventure Sci Fi Thriller


In [None]:
# Collaborative Filtering (SVD)
from surprise import Dataset, Reader
from surprise import SVD
import pandas as pd
# Load the ratings table and wrap the (user, movie, rating) triples for Surprise.
ratings = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\MRS_DS&ML\\data\\final_data.csv")
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings.loc[:, ['user_id', 'movie_id', 'rating']], reader)

# Train on every available rating (no hold-out split).
trainset = data.build_full_trainset()

# A fixed random_state makes the factorisation repeatable between runs.
svd_model = SVD(n_factors=100, n_epochs=20, random_state=42, verbose=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x13d319401f0>

In [None]:
# One row per movie_id (the first occurrence wins), descriptive columns only,
# re-indexed 0..n-1 so row labels equal row positions.
movies_df = (
    ratings[['movie_id', 'title', 'genre']]
    .drop_duplicates(subset='movie_id')
    .reset_index(drop=True)
)

# Rebuild TF-IDF matrix using unique movies
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import pandas as pd

# TF-IDF over "title genre" for the unique movies; missing text counts as empty.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(
    movies_df['title'].fillna('') + ' ' + movies_df['genre'].fillna('')
)

# movie_id -> row position in movies_df, which is also the row of tfidf_matrix.
movie_idx_map = dict(zip(movies_df['movie_id'], range(len(movies_df))))

def recommend_content(movie_id, top_n=10):
    """Recommend the top_n movies most similar in content to `movie_id`.

    Parameters
    ----------
    movie_id : key of `movie_idx_map`.
    top_n : int, default 10
        Maximum number of recommendations; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame with columns movie_id, title, similarity, ordered from
    most to least similar. When `movie_id` is unknown a warning is printed and
    an empty frame with the same columns is returned.
    """
    if movie_id not in movie_idx_map:
        print(f"⚠️ Movie ID {movie_id} not found.")
        return pd.DataFrame(columns=['movie_id','title','similarity'])

    idx = movie_idx_map[movie_id]

    # Similarity of the query row against every row of the TF-IDF matrix.
    sim_scores = linear_kernel(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Descending order; the stable sort keeps tied movies in row order.
    order = np.argsort(-sim_scores, kind='stable')

    # Remove the query movie by position instead of assuming it is the single
    # best match: a movie with identical text ties with it, and slicing off
    # "the top one" could then return the query itself.
    order = order[order != idx][:max(top_n, 0)]

    recs = movies_df.iloc[order][['movie_id','title']].reset_index(drop=True)
    recs['similarity'] = sim_scores[order]
    return recs


In [None]:
# Demo: the ten closest content-based neighbours of movie_id 1
# (the header line labels that movie 'Toy Story (1995)').
print("Content-Based Recommendations for 'Toy Story (1995)':")
print(recommend_content(1, top_n=10))

Content-Based Recommendations for 'Toy Story (1995)':
   movie_id                              title  similarity
0      1072  Pyromaniac's Love Story, A (1995)    0.367618
1      1066                       Balto (1995)    0.339499
2       548  NeverEnding Story III, The (1994)    0.318385
3       478     Philadelphia Story, The (1940)    0.315246
4      1344       Story of Xinghua, The (1993)    0.313717
5       308     FairyTale: A True Story (1997)    0.312273
6      1219              Goofy Movie, A (1995)    0.307325
7       542                  Pocahontas (1995)    0.301593
8      1470            Gumby: The Movie (1995)    0.287708
9        95                     Aladdin (1992)    0.270548


In [None]:
# ✅ Hybrid Recommendation (memory-safe + column names fixed)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import pandas as pd

# 1️⃣ TF-IDF on title + genre (sparse & memory-efficient)
# Fit on one row per movie, not on every row of `ratings`: repeated rows for
# the same movie would inflate the matrix and let frequently rated movies
# distort the IDF weights. It also keeps tfidf_matrix / movie_idx_map aligned
# with a one-row-per-movie table, which recommend_content() relies on when it
# indexes movies_df by these positions.
unique_movies = ratings.drop_duplicates(subset=['movie_id']).reset_index(drop=True)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(
    unique_movies['title'].fillna('') + ' ' + unique_movies['genre'].fillna('')
)

# 2️⃣ Map movie_id → TF-IDF row index
movie_idx_map = {mid: idx for idx, mid in enumerate(unique_movies['movie_id'])}

# 3️⃣ Hybrid Recommender
def hybrid_recommend(user_id, top_n=10, alpha=0.7, top_k_cf=50):
    """
    Rank movies the user has not rated by a blend of collaborative and content scores.

    score = alpha * CF estimate + (1 - alpha) * 5 * mean content similarity
    where the similarity is taken between the candidate and every movie the
    user has rated (multiplied by 5 before blending with the CF estimate).

    Parameters
    ----------
    user_id : value matched against ratings['user_id'].
    top_n : int, default 10
        Number of recommendations to return.
    alpha : float, default 0.7
        Weight of the collaborative-filtering estimate; (1 - alpha) goes to content.
    top_k_cf : int, default 50
        Only the top_k_cf candidates by CF estimate get a content score.

    Returns
    -------
    pandas.DataFrame with columns movie_id, title, year, ordered from best to
    worst hybrid score. Empty (same columns) when the user has rated everything.
    """
    result_columns = ['movie_id', 'title', 'year']

    # Movies rated by this user
    rated_movies = ratings.loc[ratings['user_id'] == user_id, 'movie_id'].tolist()

    # Candidate movies (unrated)
    candidate_ids = np.setdiff1d(ratings['movie_id'].unique(), rated_movies)
    if len(candidate_ids) == 0:
        return pd.DataFrame(columns=result_columns)

    # Collaborative Filtering predictions
    cf_preds = [svd_model.predict(user_id, int(mid)).est for mid in candidate_ids]

    # Top CF candidates (ties broken by ascending movie id)
    shortlist = sorted(
        zip(candidate_ids, cf_preds), key=lambda pair: (-pair[1], int(pair[0]))
    )[:top_k_cf]
    top_ids = np.array([m for m, _ in shortlist], dtype=int)
    top_cf_scores = np.array([s for _, s in shortlist], dtype=float)

    # 4️⃣ Content similarity (on demand, one batched product for the shortlist)
    # Candidates missing from the TF-IDF map, or a user with no usable rated
    # movies, keep a similarity of 0 instead of producing NaN.
    sim_scores = np.zeros(len(top_ids), dtype=float)
    rated_idx = [movie_idx_map[r] for r in rated_movies if r in movie_idx_map]
    known = [
        (pos, movie_idx_map[mid])
        for pos, mid in enumerate(top_ids)
        if mid in movie_idx_map
    ]
    if rated_idx and known:
        positions = [pos for pos, _ in known]
        rows = [row for _, row in known]
        sims = linear_kernel(tfidf_matrix[rows], tfidf_matrix[rated_idx])
        sim_scores[positions] = np.asarray(sims).mean(axis=1)

    # 5️⃣ Combine CF + Content similarity
    sim_scores_scaled = sim_scores * 5.0
    hybrid_scores = alpha * top_cf_scores + (1 - alpha) * sim_scores_scaled

    # Sort final hybrid results (ties broken by ascending movie id)
    ranked = sorted(zip(top_ids, hybrid_scores), key=lambda pair: (-pair[1], pair[0]))
    top_movies = [int(m) for m, _ in ranked[:max(top_n, 0)]]

    # Look the titles up *in ranked order*; a boolean isin() filter would hand
    # the rows back in file order and lose the ranking.
    catalog = ratings.drop_duplicates(subset=['movie_id']).set_index('movie_id')
    return catalog.loc[top_movies, ['title', 'year']].reset_index()


In [None]:
# Demo: top-5 hybrid recommendations for one example user.
user_id_example = 20
# The emoji in this label had been stored mis-encoded; restored to the intended character.
print(f"🤝 Hybrid Recommendations for User {user_id_example}:")
print(hybrid_recommend(user_id=user_id_example, top_n=5, alpha=0.7))


ü§ù Hybrid Recommendations for User 20:
   movie_id                  title    year
0       258         Contact (1997)  1997.0
1       483      Casablanca (1942)  1942.0
2        28       Apollo 13 (1995)  1995.0
3       513  Third Man, The (1949)  1949.0
4       313         Titanic (1997)  1997.0


In [None]:
# Persist the fitted artefacts under ../models (created if it does not exist).
os.makedirs("../models", exist_ok=True)

# The sparse TF-IDF matrix is written with scipy's .npz writer.
sparse.save_npz(file="../models/tfidf_matrix.npz", matrix=tfidf_matrix)

# The remaining objects are serialised with joblib.
joblib.dump(value=svd_model, filename="../models/svd_model.pkl")
joblib.dump(value=tfidf, filename="../models/tfidf_vectorizer.pkl")
joblib.dump(value=movie_idx_map, filename="../models/movie_idx_map.pkl")

print("Models saved successfully in '../models' folder.")

Models saved successfully in '../models' folder.
