In [1]:
# ========================
# Load CSV and preprocess
# ========================
import pandas as pd
import numpy as np

# Load the raw table from disk; later cells read the userId, movieId, rating,
# timestamp and text columns from it.
data = pd.read_csv("finaldataset.csv")

# Parse timestamps written as day-month-year hour:minute:second.
# errors="coerce" turns anything unparseable into NaT instead of raising.
data["timestamp"] = pd.to_datetime(
    data["timestamp"],
    format="%d-%m-%Y %H:%M:%S",
    errors="coerce",
)

# Surface coerced/missing timestamps instead of hiding them: the per-user
# split sorts on this column, so NaT rows would otherwise pass unnoticed.
n_bad_timestamps = int(data["timestamp"].isna().sum())
if n_bad_timestamps:
    print(f"Warning: {n_bad_timestamps} rows have a missing or unparseable timestamp (NaT)")

# Text columns must be plain strings for the later concatenation / TF-IDF step.
# Fill missing values first: astype(str) alone would turn NaN into the literal
# text "nan", which would then be treated as a real word.
for text_col in ("overview", "genres", "keywords", "title"):
    data[text_col] = data[text_col].fillna("").astype(str)

print("Data loaded:", data.shape)


Data loaded: (507418, 12)


In [2]:
# Train/test split per user: for every user with more than N_HOLDOUT rows,
# the last N_HOLDOUT rows by timestamp go to the test set and the earlier rows
# to the training set. Users with N_HOLDOUT rows or fewer are skipped entirely.
N_HOLDOUT = 3

train_parts = []
test_parts = []

# groupby hands over each user's rows in one pass instead of re-filtering the
# whole frame once per user; sort=False keeps users in order of first appearance.
for _, user_df in data.groupby("userId", sort=False):
    if len(user_df) <= N_HOLDOUT:
        continue
    user_df = user_df.sort_values("timestamp")
    train_parts.append(user_df.iloc[:-N_HOLDOUT])
    test_parts.append(user_df.iloc[-N_HOLDOUT:])

if not train_parts:
    # pd.concat([]) would fail with a much less helpful message.
    raise ValueError(f"No user has more than {N_HOLDOUT} rows; cannot build a train/test split")

train_data = pd.concat(train_parts)
test_data = pd.concat(test_parts)

print("Train data:", train_data.shape)
print("Test data:", test_data.shape)

Train data: (492295, 12)
Test data: (15120, 12)


In [3]:
# ========================
# Extract aspects from overview, keywords, genres
# ========================
import re

def extract_aspects(row):
    """Build the aspect string for one row.

    The ``overview``, ``keywords`` and ``genres`` fields are joined with single
    spaces and lower-cased; every non-overlapping "word<whitespace>word" match
    is collected, duplicates are removed, and the remaining pairs are joined
    with spaces.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with string values
            under the keys ``"overview"``, ``"keywords"`` and ``"genres"``.

    Returns:
        str: The de-duplicated word pairs in order of first occurrence, or an
        empty string when no pair is found.
    """
    # Combine textual features
    text = " ".join((row["overview"], row["keywords"], row["genres"])).lower()
    pairs = re.findall(r"\b\w+\s\w+\b", text)
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the output order vary between interpreter runs.
    return " ".join(dict.fromkeys(pairs))

# Run extract_aspects on every training row and keep the result as a new column.
aspect_column = train_data.apply(extract_aspects, axis=1)
train_data["aspect_text"] = aspect_column


In [4]:
# ========================
# Convert aspects to TF-IDF vectors
# ========================
from sklearn.feature_extraction.text import TfidfVectorizer

# One row per movie: keep the first aspect_text seen for each movieId.
movie_text = train_data[["movieId", "aspect_text"]].drop_duplicates("movieId")

# Fit TF-IDF on the per-movie text, capped at 1000 vocabulary terms.
tfidf = TfidfVectorizer(max_features=1000)
movie_vectors = tfidf.fit_transform(movie_text["aspect_text"])

# Dense movie x term table, indexed by movieId so later cells can look rows up by id.
dense_vectors = movie_vectors.toarray()
vocabulary = tfidf.get_feature_names_out()
movie_vector_df = pd.DataFrame(
    dense_vectors,
    index=movie_text["movieId"],
    columns=vocabulary,
)

print("Movie aspect matrix shape:", movie_vector_df.shape)


Movie aspect matrix shape: (3822, 1000)


In [5]:
# ========================
# Trainable Tiny Neural Network for Aspect Scores
# ========================
import torch
import torch.nn as nn
import torch.optim as optim

class TinyAspectNN(nn.Module):
    """Two-layer perceptron mapping a feature vector to one scalar score.

    Architecture: Linear(input_dim -> 32) -> ReLU -> Linear(32 -> 1).

    Args:
        input_dim: Number of input features per example.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)  # predict user preference score

    def forward(self, x):
        """Score a batch of feature vectors.

        Args:
            x: Float tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the trailing size-1 output dimension removed, e.g.
            shape ``(batch,)`` for input of shape ``(batch, input_dim)``.
        """
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        # Squeeze only the output dimension. A bare squeeze() would also drop
        # the batch dimension when the batch holds a single example, returning
        # a 0-d tensor that no longer lines up with a length-1 target or index.
        return x.squeeze(-1)

# Seed torch's RNG before building the network so the randomly initialised
# layer weights are identical on every fresh run of the notebook
# (the SVD cell pins random_state=42 for the same reason).
torch.manual_seed(42)

# Initialize model: one input feature per column of the movie aspect matrix
input_dim = movie_vector_df.shape[1]
tiny_nn = TinyAspectNN(input_dim)

# Loss + optimizer: mean-squared error, optimised with Adam
criterion = nn.MSELoss()
optimizer = optim.Adam(tiny_nn.parameters(), lr=0.01)


In [6]:
# Create training dataset for NN: one example per row of train_data, where the
# features are that movie's aspect vector and the target is the row's rating.

# Map every training row's movieId to its row position in movie_vector_df in
# one vectorised call instead of a Python-level iterrows loop.
movie_rows = movie_vector_df.index.get_indexer(train_data["movieId"].to_numpy())
if (movie_rows < 0).any():
    # get_indexer marks unknown labels with -1, which would otherwise silently
    # select the last row of the matrix.
    raise KeyError("train_data contains movieId values that have no row in movie_vector_df")

# Cast the small per-movie matrix to float32 once, then gather rows, so the
# large per-rating feature array is never held in float64.
movie_matrix = movie_vector_df.to_numpy(dtype=np.float32)
train_X = torch.from_numpy(movie_matrix[movie_rows])
train_y = torch.tensor(train_data["rating"].to_numpy(dtype=np.float32))


In [7]:
# Tiny training loop (fast, CPU-friendly): each epoch is one forward/backward
# pass over the whole of train_X followed by a single optimizer step.
epochs = 5  # very small number to keep it fast
tiny_nn.train()

for epoch in range(epochs):
    # Clear gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass and loss against the ratings
    outputs = tiny_nn(train_X)
    loss = criterion(outputs, train_y)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/5, Loss: 13.6786
Epoch 2/5, Loss: 12.8778
Epoch 3/5, Loss: 12.0039
Epoch 4/5, Loss: 11.0193
Epoch 5/5, Loss: 9.9505


In [None]:
# ======================== 
# Build user aspect vector
# ========================
def build_user_aspect_vector(user_id, alpha=0.7):
    """Fold a user's training movies into one exponentially weighted aspect vector.

    The user's rows in ``train_data`` are visited in timestamp order; at each
    step the running vector is scaled by ``alpha`` and the current movie's row
    of ``movie_vector_df`` is added with weight ``1 - alpha``.

    Args:
        user_id: Value matched against ``train_data["userId"]``.
        alpha: Weight kept from the running vector at every step.

    Returns:
        numpy.ndarray with one entry per column of ``movie_vector_df``; all
        zeros when the user has no rows in ``train_data``.
    """
    history = train_data.loc[train_data["userId"] == user_id].sort_values("timestamp")
    profile = np.zeros(movie_vector_df.shape[1])
    keep, blend = alpha, 1 - alpha
    for movie_id in history["movieId"]:
        profile = keep * profile + blend * movie_vector_df.loc[movie_id].values
    return profile


In [9]:
# ========================
# Compute aspect scores (predicted by the tiny NN; no similarity is computed here)
# ========================
from sklearn.metrics.pairwise import cosine_similarity

def aspect_score(user_id):
    """Score every movie in ``movie_vector_df`` with the trained ``tiny_nn``.

    Args:
        user_id: Accepted for interface symmetry but not read by the body, so
            the returned scores are the same for every user.

    Returns:
        pandas.Series of network outputs indexed like ``movie_vector_df``.
    """
    features = torch.tensor(movie_vector_df.values, dtype=torch.float32)
    tiny_nn.eval()
    # Inference only: no gradient tracking needed
    with torch.no_grad():
        predicted = tiny_nn(features)
    return pd.Series(predicted.numpy(), index=movie_vector_df.index)



In [10]:
# ========================
# Build user-item matrix & SVD
# ========================
from sklearn.decomposition import TruncatedSVD

# userId x movieId table of ratings (pivot_table's default aggregation is the
# mean); user/movie pairs with no rating are filled with 0.
rating_table = train_data.pivot_table(values="rating", index="userId", columns="movieId")
user_item = rating_table.fillna(0)

# Rank-50 truncated SVD: rows of user_factors line up with user_item.index,
# columns of item_factors line up with user_item.columns.
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item)
item_factors = svd.components_

def cf_predict(user_id, movie_id):
    """Collaborative-filtering score for one (user, movie) pair.

    Args:
        user_id: Label looked up in ``user_item.index``.
        movie_id: Label looked up in ``user_item.columns``.

    Returns:
        Dot product of the user's row of ``user_factors`` with the movie's
        column of ``item_factors``; ``0`` when either label is unknown.
    """
    # Unknown user or movie: nothing to predict from
    if user_id not in user_item.index:
        return 0
    if movie_id not in user_item.columns:
        return 0

    row = user_item.index.get_loc(user_id)
    col = user_item.columns.get_loc(movie_id)
    return np.dot(user_factors[row], item_factors[:, col])


In [11]:
# ========================
# Hybrid recommendation: CF + Aspect
# ========================
def hybrid_recommend(user_id, top_n=5, w_cf=0.4, w_aspect=0.6):
    """Recommend movies by blending collaborative-filtering and aspect scores.

    Args:
        user_id: Value matched against ``train_data["userId"]``.
        top_n: Number of movies to return.
        w_cf: Weight of the SVD-based collaborative-filtering score.
        w_aspect: Weight of the score returned by ``aspect_score``.

    Returns:
        pandas.Series indexed by movieId, sorted by descending score and cut
        to ``top_n`` entries. For a user with no rows in ``train_data`` the
        result is the ``top_n`` most frequently rated movies, each with the
        placeholder score 1.0. Movies the user already has in ``train_data``
        are never returned for known users.
    """
    # One boolean mask serves both the "known user?" check and the watched
    # list, instead of rebuilding unique() and re-filtering on every call.
    user_rows = train_data["userId"] == user_id
    if not user_rows.any():
        top_movies = train_data.groupby("movieId").size().sort_values(ascending=False).head(top_n)
        return pd.Series(1.0, index=top_movies.index)

    aspect_scores = aspect_score(user_id)

    # CF scores for all movies in one matrix-vector product rather than one
    # cf_predict call per movie. Users or movies missing from user_item get 0,
    # which is what cf_predict returns for them.
    if user_id in user_item.index:
        u = user_item.index.get_loc(user_id)
        cf_all = pd.Series(user_factors[u] @ item_factors, index=user_item.columns)
        cf_scores = cf_all.reindex(movie_vector_df.index, fill_value=0.0)
    else:
        cf_scores = pd.Series(0.0, index=movie_vector_df.index)

    final_scores = w_aspect * aspect_scores + w_cf * cf_scores

    # Never recommend something the user already has in the training data
    watched = train_data.loc[user_rows, "movieId"].unique()
    final_scores = final_scores.drop(index=watched, errors="ignore")

    return final_scores.sort_values(ascending=False).head(top_n)


In [13]:
# Persist only the learned parameters (the state_dict), not the module object
weights_path = "tiny_aspect_nn_weights.pth"
learned_state = tiny_nn.state_dict()
torch.save(learned_state, weights_path)
print("✅ Tiny NN weights saved")


✅ Tiny NN weights saved


In [14]:
# Read the saved parameters back from disk
saved_state = torch.load("tiny_aspect_nn_weights.pth")

# Build a fresh network with the same architecture, then copy the parameters in
tiny_nn_loaded = TinyAspectNN(input_dim=movie_vector_df.shape[1])
tiny_nn_loaded.load_state_dict(saved_state)

# Switch to evaluation mode before any inference
tiny_nn_loaded.eval()
print("✅ Tiny NN model loaded with weights")


✅ Tiny NN model loaded with weights


In [15]:
# ========================
# Prepare test movies & ratings dictionaries
# ========================
# userId -> set of held-out movieIds
test_movies = (
    test_data.groupby("userId")["movieId"]
    .apply(set)
    .to_dict()
)

# userId -> {movieId: rating} for the held-out rows.
# Only the two columns the lambda reads are selected before apply(): calling
# DataFrameGroupBy.apply on the full frame also passes the grouping column to
# the function, which newer pandas versions flag with a deprecation warning.
test_ratings = (
    test_data.groupby("userId")[["movieId", "rating"]]
    .apply(lambda x: x.set_index("movieId")["rating"].to_dict())
    .to_dict()
)


  .apply(lambda x: x.set_index("movieId")["rating"].to_dict())


In [17]:
# ========================
# Evaluation using NN-based hybrid
# ========================
def precision_recall_at_k_nn(k=5):
    """Mean precision@k and recall@k of ``hybrid_recommend`` over all test users.

    For every user in ``test_movies`` a hit is a recommended movie that is also
    in that user's held-out set. Precision divides the hit count by ``k``;
    recall divides it by the size of the held-out set.

    Args:
        k: Number of recommendations requested per user.

    Returns:
        Tuple ``(mean_precision, mean_recall)``.
    """
    precision_scores = []
    recall_scores = []

    for user in test_movies:
        held_out = test_movies[user]
        recommended = set(hybrid_recommend(user, top_n=k).index)  # compute dynamically
        n_hits = len(recommended.intersection(held_out))
        precision_scores.append(n_hits / k)
        recall_scores.append(n_hits / len(held_out))

    return np.mean(precision_scores), np.mean(recall_scores)


def ndcg_at_k_nn(k=5):
    """Mean NDCG@k of ``hybrid_recommend`` over the users in ``test_ratings``.

    Gain for a movie rated ``r`` at zero-based rank ``i`` is
    ``(2 ** r - 1) / log2(i + 2)``. DCG sums the gain of recommended movies
    that have a held-out rating; IDCG sums the gain of the user's ``k`` highest
    held-out ratings in descending order. Users whose IDCG is not positive are
    left out of the average.

    Args:
        k: Number of recommendations requested per user.

    Returns:
        Mean of the per-user DCG / IDCG ratios.
    """
    def discounted_gain(rating, position):
        # Exponential gain, discounted logarithmically by rank position
        return (2 ** rating - 1) / np.log2(position + 2)

    per_user_ndcg = []

    for user, rating_dict in test_ratings.items():
        ranked = hybrid_recommend(user, top_n=k).index  # compute dynamically

        dcg = sum(
            (discounted_gain(rating_dict[movie], pos)
             for pos, movie in enumerate(ranked)
             if movie in rating_dict),
            0.0,
        )

        best_ratings = sorted(rating_dict.values(), reverse=True)[:k]
        idcg = sum(discounted_gain(rating, pos) for pos, rating in enumerate(best_ratings))

        if idcg > 0:
            per_user_ndcg.append(dcg / idcg)

    return np.mean(per_user_ndcg)


# Ranking quality of the hybrid recommender on the held-out rows, at k = 5
p, r = precision_recall_at_k_nn(k=5)
print("Precision@5:", p)
print("Recall@5:", r)

ndcg = ndcg_at_k_nn(k=5)
print("NDCG@5:", ndcg)


Precision@5: 0.017063492063492062
Recall@5: 0.028439153439153434
NDCG@5: 0.02354420462661541


In [19]:
# ========================
# Show top-5 recommendations for a user (input-based)
# ========================
user_input = input("Enter User ID to get recommendations: ")

# Fall back to a fixed default when the entry cannot be read as an integer
try:
    user_id = int(user_input)
except ValueError:
    print("⚠️ Invalid input. Using default User ID = 15")
    user_id = 15

# movieId -> title lookup, needed by both branches below
titles = data.drop_duplicates("movieId").set_index("movieId")["title"]

if user_id in train_data["userId"].unique():
    # Known user: hybrid recommendations plus their scores
    recs = hybrid_recommend(user_id)
    print(f"Top-5 Recommended Movies for User {user_id}:")
    print(titles.loc[recs.index])
    print("\nHybrid Scores:\n", recs)
else:
    # Unknown user: show the five movies with the highest mean training rating
    print(f"User ID {user_id} not found. Showing top-rated movies instead.")
    top_movies = (
        train_data.groupby("movieId")["rating"]
        .mean()
        .sort_values(ascending=False)
        .head(5)
    )
    print("Top-5 Rated Movies:")
    print(titles.loc[top_movies.index])


Top-5 Recommended Movies for User 10:
movieId
1291    Indiana Jones and the Last Crusade
1193       One Flew Over the Cuckoo's Nest
1213                            GoodFellas
110                             Braveheart
1270                    Back to the Future
Name: title, dtype: object

Hybrid Scores:
 movieId
1291    1.160771
1193    1.136716
1213    1.073246
110     1.052303
1270    1.046009
dtype: float64
