In [1]:
from google.colab import files
uploaded = files.upload()


Saving sample dataset.txt to sample dataset.txt


In [4]:
!pip uninstall -y keras tensorflow

Found existing installation: keras 2.15.0
Uninstalling keras-2.15.0:
  Successfully uninstalled keras-2.15.0
Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0


In [1]:
!pip install tensorflow==2.16.1



In [2]:
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import tensorflow as tf

#Working Tokenizer and pad_sequences for TF 2.16.1
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from scipy.spatial.distance import cdist
import random

In [3]:
#Reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
df= pd.read_csv('sample dataset.txt')
df.head()

Unnamed: 0,id,title,genres,overview,popularity,vote_average,poster_url
0,27205,Inception,Action | Sci-Fi,A skilled thief who uses dream-sharing to stea...,150.0,8.3,https://image.tmdb.org/t/p/original/qmDpIHrmpJ...
1,597,The Matrix,Action | Sci-Fi,A hacker discovers the world is a simulation a...,140.0,8.7,https://image.tmdb.org/t/p/original/aOIuZAjPa2...
2,155,The Dark Knight,Action | Crime | Drama,Batman faces the Joker in a battle for Gotham’...,200.0,8.5,https://image.tmdb.org/t/p/original/1hRoyzDtpg...
3,13,Forrest Gump,Drama | Romance,"The life journey of Forrest Gump, a man with a...",110.0,8.8,https://image.tmdb.org/t/p/original/saHP97rTPS...
4,497,The Green Mile,Crime | Drama | Fantasy,A death row guard witnesses supernatural event...,80.0,8.5,https://image.tmdb.org/t/p/original/velWPhVMQe...


In [5]:
df['genres']= df['genres'].str.split('\|')
df.head()

Unnamed: 0,id,title,genres,overview,popularity,vote_average,poster_url
0,27205,Inception,"[Action , Sci-Fi]",A skilled thief who uses dream-sharing to stea...,150.0,8.3,https://image.tmdb.org/t/p/original/qmDpIHrmpJ...
1,597,The Matrix,"[Action , Sci-Fi]",A hacker discovers the world is a simulation a...,140.0,8.7,https://image.tmdb.org/t/p/original/aOIuZAjPa2...
2,155,The Dark Knight,"[Action , Crime , Drama]",Batman faces the Joker in a battle for Gotham’...,200.0,8.5,https://image.tmdb.org/t/p/original/1hRoyzDtpg...
3,13,Forrest Gump,"[Drama , Romance]","The life journey of Forrest Gump, a man with a...",110.0,8.8,https://image.tmdb.org/t/p/original/saHP97rTPS...
4,497,The Green Mile,"[Crime , Drama , Fantasy]",A death row guard witnesses supernatural event...,80.0,8.5,https://image.tmdb.org/t/p/original/velWPhVMQe...


In [6]:
df['overview']= df['overview'].fillna('')


In [7]:
max_tfidf_features = 100
tfidf= TfidfVectorizer(max_features= max_tfidf_features, stop_words='english')
tfid_matrix= tfidf.fit_transform(df['overview'])
with open('tfid_vectorizer.pkl','wb') as f:
  pickle.dump(tfidf, f)


In [8]:

mlb= MultiLabelBinarizer()
genre_matrix= mlb.fit_transform(df['genres'])
with open('genre_encoder.pkl', 'wb') as f:
  pickle.dump(mlb, f)

In [9]:
## Build Hybrid Movie Embedding Model
def build_movie_embedding_model(tfidf_dim, n_genres, seq_len, vocab_size, embed_dim=16):
  #TF-TDF branch
  tfidf_input= Input(shape=(tfidf_dim,), name= 'tfidf_input')
  x1= Dense(64, activation='relu')(tfidf_input)
  x1= Dropout(0.2)(x1)

  #Genre Branch
  genre_input= Input(shape=(n_genres,), name= 'genre_input')
  x2= Dense(32, activation='relu')(genre_input)
  x2= Dropout(0.2)(x2)

  #RNN Branch
  seq_input = Input(shape=(seq_len,), name= 'seq_input')
  x3= Embedding(input_dim= vocab_size, output_dim=embed_dim, mask_zero=True)(seq_input)
  x3= LSTM(32)(x3)

  #Merge
  merged= Concatenate()([x1, x2, x3])
  x= Dense(64, activation='relu')(merged)
  x= Dropout(0.2)(x)
  embedding= Dense(32, activation='relu', name='movie_embedding')(x)

  model= Model(inputs=[tfidf_input, genre_input, seq_input], outputs=embedding)
  return model

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

descriptions = df["overview"].astype(str).tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(descriptions)

sequences = tokenizer.texts_to_sequences(descriptions)

max_seq_len = 100
padded_sequences = pad_sequences(sequences, maxlen=max_seq_len, padding='post')

vocab_size = len(tokenizer.word_index) + 1

In [13]:
model = build_movie_embedding_model(
    tfidf_dim=tfid_matrix.shape[1],
    n_genres=genre_matrix.shape[1],
    seq_len=max_seq_len,
    vocab_size=vocab_size
)
model.summary()

In [17]:

reconstructed= Dense(
    tfid_matrix.shape[1],
    activation='sigmoid',
    name='reconstructed'
)(model.output)
autoencoder = Model(inputs=model.input, outputs=reconstructed)
#Compile the model
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

In [21]:
tfidf_array= tfid_matrix.toarray()
X={
    'tfidf_input': tfidf_array,
    'genre_input': genre_matrix,
    'seq_input': padded_sequences
}
y= tfidf_array


In [22]:
from sklearn.model_selection import train_test_split
index_train, index_val= train_test_split(df.index, test_size=0.2, random_state=42)
X_train= {key: value[index_train] for key, value in X.items()}
X_val= {key: value[index_val] for key, value in X.items()}
y_train= y[index_train]
y_val= y[index_val]

In [23]:
ecb= EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
ckpt= ModelCheckpoint(
    'autoencoder.h5',
    save_best_only=True
)

In [24]:
autoencoder.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
    callbacks=[ecb, ckpt]
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - loss: 0.2348



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - loss: 0.2348 - val_loss: 0.2326
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 648ms/step - loss: 0.2338



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 786ms/step - loss: 0.2338 - val_loss: 0.2319
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - loss: 0.2326



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294ms/step - loss: 0.2326 - val_loss: 0.2311
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 0.2320



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step - loss: 0.2320 - val_loss: 0.2302
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step - loss: 0.2312



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311ms/step - loss: 0.2312 - val_loss: 0.2294
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step - loss: 0.2298



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 307ms/step - loss: 0.2298 - val_loss: 0.2284
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step - loss: 0.2288



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 283ms/step - loss: 0.2288 - val_loss: 0.2275
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - loss: 0.2278



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step - loss: 0.2278 - val_loss: 0.2264
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - loss: 0.2266



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step - loss: 0.2266 - val_loss: 0.2252
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step - loss: 0.2253



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - loss: 0.2253 - val_loss: 0.2239


<keras.src.callbacks.history.History at 0x7cd71d320990>

In [27]:
autoencoder.save('autoencoder.h5')
model.save('model.h5')



In [45]:
def recommend_for_user(user_json_path, top_n=5):
  user= json.load(open(user_json_path))
  liked= user.get('liked_movies',[])
  ratings= user.get('ratings',{})
  genres_pref= user.get('liked_genres',[])
  mood= user.get('mood')

  embeddings= model.predict({
      'tfidf_input': tfidf_array,
      'genre_input': genre_matrix,
      'seq_input': padded_sequences
  })

  vecs,wts= [],[]
  for mid in liked:
    index= df.index[df['id']==mid]
    if len(index):
      vecs.append(embeddings[index[0]]*ratings.get(str(mid),1.0))
      wts.append(ratings.get(str(mid),1.0))

  for g in genres_pref:
    index= df.explode('genres').index[df.explode('genres')['genres']==g]
    if len(index):
      vecs.append(np.mean(embeddings[index], axis=0))
      wts.append(1.0)
  mood_map = {'Happy':['Comedy'],'Sad':['Drama'],'Excited':['Action'],'Calm':['Adventure'],'Angry':['Action'],'Fearful':['Thriller']}
  mgen= mood_map.get(mood,[])
  for g in mgen:
    index= df.explode('genres').index[df.explode('genres')['genres']==g]
    if len(index):
      vecs.append(np.mean(embeddings[index], axis=0))
      wts.append(0.5)

  if not vecs:
    user_vec= np.mean(embeddings, axis=0, keepdims=True)
  else:
    user_vec= np.average(vecs, axis=0, weights=wts).reshape(1,-1)

  dists= cdist(user_vec, embeddings, metric='cosine').flatten()
  df['dist']= dists

  candidates= df[~df['id'].isin(liked)]
  top_idx= candidates.nsmallest(top_n,'dist').index

  recs =(
        df.loc[top_idx, ['id','title','overview','poster_url']]
          .rename(columns={'id':'movie_id', 'poster_url':'poster_url'})
          .to_dict(orient='records')
  )

  out={'recommended_movies':recs}
  out_path = user_json_path.replace('.json','_recs.json')
  with open(out_path,'w') as f:
    json.dump(out,f)
  return out_path

In [43]:
import os
os.makedirs("users", exist_ok=True)


In [46]:
import os
import json

os.makedirs("users", exist_ok=True)

with open("users/sample_user.json", "w") as f:
    json.dump({
        "liked_movies": [101, 103],
        "ratings": {
            "101": 4.5,
            "103": 3.8
        },
        "liked_genres": ["Adventure", "Sci-Fi"],
        "mood": "Excited"
    }, f)

recs = recommend_for_user("users/sample_user.json")
print(recs)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
users/sample_user_recs.json


In [47]:
import json

with open("users/sample_user_recs.json", "r") as f:
    recommendations = json.load(f)

print(json.dumps(recommendations, indent=2))

{
  "recommended_movies": [
    {
      "movie_id": 497698,
      "title": "Black Panther",
      "overview": "T\u2019Challa returns home to take the throne and defend Wakanda.",
      "poster_url": "https://image.tmdb.org/t/p/original/uxzzxijgPIY7slzFvMotPv8wjKA.jpg"
    },
    {
      "movie_id": 4971,
      "title": "Gladiator",
      "overview": "A former Roman general seeks vengeance as a gladiator in the arena.",
      "poster_url": "https://image.tmdb.org/t/p/original/ty8TGRuvJLPUmAR1H1nRIsgwvim.jpg"
    },
    {
      "movie_id": 680,
      "title": "Pulp Fiction",
      "overview": "The lives of two hitmen intertwine in a series of violent encounters.",
      "poster_url": "https://image.tmdb.org/t/p/original/dM2w364MScsjFf8pfMbaWUcWrR.jpg"
    },
    {
      "movie_id": 597,
      "title": "The Matrix",
      "overview": "A hacker discovers the world is a simulation and joins a rebellion.",
      "poster_url": "https://image.tmdb.org/t/p/original/aOIuZAjPa2dPZZUvUd6ibgj4VZ1.j