In [1]:
import os
import sys
import pandas as pd

sys.path.append(os.path.abspath('..'))


In [4]:
from src.recommenders.Old_recommenders.Content_based import ContentRecommender
from src.recommenders.Old_recommenders.User_cf import UserBasedCF
from src.recommenders.Old_recommenders.Item_Based_CF import ItemBasedCF
from src.recommenders.recommender_system import RecommenderSystem


In [3]:
ratings_path=r"C:\Users\sarva\MoviePulse\data\processed\ratings_cleans.csv"
metadata_path=r"C:\Users\sarva\MoviePulse\data\processed\movie_feature.csv"
tfidf_path=r"C:\Users\sarva\MoviePulse\data\Features\tfidf_matrix.npz"
tfidf_index_path=r"C:\Users\sarva\MoviePulse\data\Features\tfidf_sample.csv"

## TF-IDF

In [4]:
recommender = ContentRecommender(
    feature_matrix_path=r"c:\Users\sarva\MoviePulse\data\Features\tfidf_matrix.npz",
    metadata_path=r"c:\Users\sarva\MoviePulse\data\processed\movie_feature.csv",
    is_sparse=True
)

recommender.run()
recommender.recommend("Casino (1995)", top_n=5, return_scores=True)


Loading feature matrix and metadata...
Computing cosine similarity...
Model ready for recommendations.


Unnamed: 0,title,avg_ratings,num_ratings,score
1937,"Mod Squad, The (1999)",3.1875,8.0,1.0
2716,"Fighting Seabees, The (1944)",3.5,2.0,1.0
2888,"Cell, The (2000)",3.044444,45.0,1.0
3219,Divided We Fall (Musíme si pomáhat) (2000),3.833333,3.0,1.0
3251,Beach Blanket Bingo (1965),1.5,1.0,1.0


In [5]:
recommender = ContentRecommender(
    feature_matrix_path=r"c:\Users\sarva\MoviePulse\data\Features\embeddings_matrix.npy",
    metadata_path=r"c:\Users\sarva\MoviePulse\data\processed\movie_feature.csv",
    is_sparse=False
)

recommender.run()
recommender.recommend("Casino (1995)", top_n=5, return_scores=True)


Loading feature matrix and metadata...
Computing cosine similarity...
Model ready for recommendations.


Unnamed: 0,title,avg_ratings,num_ratings,score
9103,Unforgiven (2013),0.5,1.0,0.465415
95,Muppet Treasure Island (1996),3.326923,26.0,0.453027
1117,Fools Rush In (1997),3.083333,12.0,0.439327
2481,"Circus, The (1928)",4.125,4.0,0.433671
9156,Hitchcock/Truffaut (2015),3.5,1.0,0.43318


### 🔍 Content-Based Recommendation Comparison: TF-IDF vs Embeddings

To evaluate the effectiveness of different content-based representations, I tested my recommender system using two types of feature vectors for the same input movie.

#### 🎯 Input Movie:
`"Toy Story (1995)"`

---

### 📘 Method 1: TF-IDF Based Recommendation

TF-IDF was computed on the combination of movie title and genres. The recommendations returned were:

| Title                              | Avg Rating | Num Ratings | Similarity Score |
|-----------------------------------|------------|--------------|------------------|
| For Whom the Bell Tolls (1943)    | 3.75       | 2            | 1.0              |
| Boogie Nights (1997)              | 4.08       | 39           | 1.0              |
| Stardust Memories (1980)          | 3.60       | 5            | 1.0              |
| Perfect Blue (1997)               | 4.20       | 5            | 1.0              |
| Harvey (1950)                     | 3.83       | 12           | 1.0              |

#### 🧠 Observations:

- All recommendations have a **similarity score of 1.0**, indicating **exact text matches**, likely due to similar genres or title terms.
- Some recommended movies seem **unrelated** to *Toy Story* thematically.
- TF-IDF can **overfit to surface-level text**, missing deeper semantic connections.

---

### 📙 Method 2: Embedding-Based Recommendation

The recommendations using dense embedding vectors are:

| Title                                   | Avg Rating | Num Ratings | Similarity Score |
|----------------------------------------|------------|--------------|------------------|
| Man from Snowy River, The (1982)       | 3.38       | 4            | 0.753            |
| Polytechnique (2009)                   | 1.50       | 1            | 0.749            |
| Guest from the Future (1985)           | 4.00       | 1            | 0.542            |
| Garfield's Pet Force (2009)            | 5.00       | 1            | 0.483            |
| Kinky Boots (2005)                     | 3.75       | 4            | 0.480            |

#### 🧠 Observations:

- Similarity scores are **more nuanced**, ranging between 0.48 and 0.75.
- Some recommendations (e.g., *Garfield's Pet Force*) are **more thematically similar** to *Toy Story*.
- Embeddings can **capture semantic meaning**, but also need high-quality input features like plot summaries for best performance.

---

### 📊 Conclusion

| Criteria           | TF-IDF                      | Embeddings                   |
|-------------------|-----------------------------|------------------------------|
| Similarity Scores | All 1.0 (hard match)         | Gradual (0.48–0.75)          |
| Text Sensitivity  | High (exact word match)      | Moderate (semantic match)    |
| Thematic Relevance| Sometimes poor               | Generally better             |
| Diversity         | Low                          | Medium                       |

✅ **TF-IDF** is fast and simple but may **overfit to keywords**.  
✅ **Embeddings** offer better **semantic relevance**, especially with rich features.

---

### 🚀 Future Improvements:
- Add plot descriptions or movie tags for richer embeddings.
- Filter low-rated or low-rating-count movies.
- Explore hybrid models (content + collaborative).


In [6]:
# User Based Collaborative Filtering

In [7]:
cf = UserBasedCF(
    ratings_path=r"C:\Users\sarva\MoviePulse\data\processed\ratings_cleans.csv",
    metadata_path=r"C:\Users\sarva\MoviePulse\data\processed\movie_feature.csv"
)

cf.run()
cf.recommend(user_id=2, top_n=5, return_scores=True)


📥 Loading ratings and metadata...
🧱 Creating user-item matrix...
🧠 Computing user-user cosine similarity...
✅ User-based CF model is ready.


Unnamed: 0,movieId,title,avg_ratings,num_ratings,predicted_rating
1939,2571,"Matrix, The (1999)",4.192446,278.0,2.901568
314,356,Forrest Gump (1994),4.164134,329.0,2.841191
2226,2959,Fight Club (1999),4.272936,218.0,2.773398
257,296,Pulp Fiction (1994),4.197068,307.0,2.533788
510,593,"Silence of the Lambs, The (1991)",4.16129,279.0,2.289751


In [8]:
Item_cf = ItemBasedCF(
    ratings_path=r"C:\Users\sarva\MoviePulse\data\processed\ratings_cleans.csv",
    metadata_path=r"C:\Users\sarva\MoviePulse\data\processed\movie_feature.csv"
)   

In [9]:
Item_cf.run()
cf.recommend(user_id=2, top_n=5, return_scores=True)

Loading ratings and metadata...
Creating user-item matrix...
Computing item-item cosine similarity...
Item-based CF model is ready.


Unnamed: 0,movieId,title,avg_ratings,num_ratings,predicted_rating
1939,2571,"Matrix, The (1999)",4.192446,278.0,2.901568
314,356,Forrest Gump (1994),4.164134,329.0,2.841191
2226,2959,Fight Club (1999),4.272936,218.0,2.773398
257,296,Pulp Fiction (1994),4.197068,307.0,2.533788
510,593,"Silence of the Lambs, The (1991)",4.16129,279.0,2.289751


In [10]:
from src.recommenders.recommender_system import RecommenderSystem

recommender = RecommenderSystem(
    ratings_path=r"C:\Users\sarva\MoviePulse\data\processed\ratings_cleans.csv",
    metadata_path=r"C:\Users\sarva\MoviePulse\data\processed\movie_feature.csv",
    tfidf_path=r"C:\Users\sarva\MoviePulse\data\Features\tfidf_matrix.npz",
    tfidf_index_path=r"C:\Users\sarva\MoviePulse\data\Features\tfidf_sample.csv"
)

recommender.run_all()



Loading ratings and metadata...
Creating user-item matrix...
Computing user-user similarity...
Computing item-item similarity...
Loading TF-IDF content vectors...
Recommender system initialized.


In [11]:
user_id = 1  # or whichever user was used

# Recommended movies
recommended = recommender.recommend_hybrid(user_id=user_id, top_n=10)

# Get actual ratings given by the user
user_ratings = recommender.user_item_matrix.loc[user_id]
rated_movies = user_ratings[user_ratings >= 4.0].index.tolist()

print("User's relevant rated movies:", rated_movies)
print("Recommended movies:", recommended)

User's relevant rated movies: [11.0, 13.0, 14.0, 15.0, 28.0, 73.0, 85.0, 87.0, 89.0, 97.0, 98.0, 100.0, 104.0, 105.0, 106.0, 115.0, 123.0, 137.0, 138.0, 143.0, 162.0, 185.0, 196.0, 197.0, 218.0, 244.0, 252.0, 253.0, 268.0, 274.0, 275.0, 329.0, 348.0, 402.0, 424.0, 479.0, 482.0, 492.0, 497.0, 500.0, 522.0, 525.0, 526.0, 550.0, 581.0, 583.0, 600.0, 601.0, 603.0, 617.0, 620.0, 624.0, 629.0, 630.0, 646.0, 651.0, 653.0, 657.0, 658.0, 660.0, 682.0, 754.0, 756.0, 762.0, 769.0, 792.0, 801.0, 807.0, 816.0, 820.0, 847.0, 854.0, 856.0, 857.0, 861.0, 862.0, 871.0, 879.0, 881.0, 941.0, 949.0, 957.0, 1366.0, 1497.0, 1499.0, 1542.0, 1654.0, 1701.0, 1775.0, 1832.0, 1857.0, 1880.0, 1891.0, 1892.0, 1893.0, 1911.0, 1924.0, 2109.0, 2118.0, 2280.0, 2493.0, 2649.0, 2756.0, 3034.0, 3035.0, 3063.0, 3083.0, 3109.0, 3170.0, 4011.0, 4234.0, 4978.0, 5503.0, 5548.0, 6396.0, 6970.0, 6978.0, 8009.0, 8068.0, 8388.0, 8467.0, 8536.0, 8592.0, 8741.0, 8810.0, 8872.0, 9078.0, 9285.0, 9289.0, 9325.0, 9326.0, 9331.0, 9340.0

In [12]:
from src.Evaluate.Evaluate_hybrid import evaluate_model


In [13]:
metrics = evaluate_model(model=recommender, k=10, sample_size=50)
print(metrics)

{'Precision@K': 0.0, 'Recall@K': 0.0, 'Users Evaluated': 50, 'Alpha (User-CF)': 0.4, 'Beta (Item-CF)': 0.3, 'Gamma (Content)': 0.3}


In [7]:
ratings_df = pd.read_csv(ratings_path)

In [None]:
from src.Evaluate.Evaluate_holdout import evaluate_holdout


metrics = evaluate_holdout(model=recommender, ratings_df=ratings_df, k=10, users_to_evaluate=50)
print(metrics)


Evaluating 50 users with hold-out (test_ratio=0.2, top@10)


 98%|█████████▊| 49/50 [00:07<00:00,  6.30it/s]


Evaluation Summary:
Precision@K: 0.1940
Recall@K: 0.1279
NDCG@K: 0.2393
Users Evaluated: 50
{'Precision@K': 0.19399999999999998, 'Recall@K': 0.1278599528205986, 'NDCG@K': 0.23929887759304677, 'Users Evaluated': 50}





In [8]:
user_id = ratings_df['userId'].value_counts().idxmax()

In [9]:
Rs = RecommenderSystem(
    ratings_path=ratings_path,
    metadata_path=metadata_path,
    tfidf_path=tfidf_path,
    tfidf_index_path=tfidf_index_path
)

Rs.run_all()

Loading ratings and metadata...
Creating user-item matrix...
Computing user-user similarity...
Computing item-item similarity...
Loading TF-IDF content vectors...
Recommender system initialized.


In [11]:
recs = Rs.recommend_user_based(user_id=user_id, top_n=10)
recs.head()


Unnamed: 0,movieId,title,genres,imdbId,tmdbId,genre_count,main_genre,avg_ratings,num_ratings,predicted_rating
957,1258,"Shining, The (1980)",['Horror'],81505,694.0,1,Horror,4.082569,109.0,1.157097
5166,8368,Harry Potter and the Prisoner of Azkaban (2004),"['Adventure', 'Fantasy', 'IMAX']",304141,673.0,3,Adventure,3.913978,93.0,0.929902
920,1219,Psycho (1960),"['Crime', 'Horror']",54215,539.0,2,Crime,4.036145,83.0,0.893492
1730,2324,Life Is Beautiful (La Vita è bella) (1997),"['Comedy', 'Drama', 'Romance', 'War']",118799,637.0,4,Comedy,4.147727,88.0,0.851408
7258,74458,Shutter Island (2010),"['Drama', 'Mystery', 'Thriller']",1130884,11324.0,3,Drama,4.022388,67.0,0.63413
