In [None]:
### WatchWise: RL-Based Personalized Movie Suggestions

# Step 1: Install Required Libraries
#!pip install gymnasium transformers pandas numpy scikit-learn torch

import csv
import os
import pickle
import random
import re

import gymnasium as gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from transformers import pipeline

In [None]:
# Step 2: Load IMDb Dataset
from google.colab import drive
drive.mount('/content/drive')

# Check if preprocessed data exists
import os
if os.path.exists("/content/drive/MyDrive/movies_processed.csv"):
    movies_df = pd.read_csv("/content/drive/MyDrive/movies_processed.csv")
else:
    !wget -O title.basics.tsv.gz https://datasets.imdbws.com/title.basics.tsv.gz
    !wget -O title.ratings.tsv.gz https://datasets.imdbws.com/title.ratings.tsv.gz
    !gunzip title.basics.tsv.gz title.ratings.tsv.gz  # Unzip

    movies_df = pd.read_csv("title.basics.tsv", sep="\t", dtype=str, na_values="\\N")
    ratings_df = pd.read_csv("title.ratings.tsv", sep="\t", dtype=str, na_values="\\N")

    # Filter movies and merge with ratings
    movies_df = movies_df[movies_df["titleType"] == "movie"][["tconst", "primaryTitle", "startYear", "genres"]]
    ratings_df = ratings_df[["tconst", "averageRating"]]
    movies_df = movies_df.merge(ratings_df, on="tconst")

    # Convert ratings to float and filter movies with rating >= 7.0
    movies_df["averageRating"] = movies_df["averageRating"].astype(float)
    movies_df = movies_df[movies_df["averageRating"] >= 7.0]

    # Encode movie titles as numerical indices
    label_encoder = LabelEncoder()
    movies_df["movie_encoded"] = label_encoder.fit_transform(movies_df["primaryTitle"])

    # Save preprocessed data
    movies_df.to_csv("/content/drive/MyDrive/movies_processed.csv", index=False)
    pickle.dump(label_encoder, open("/content/drive/MyDrive/label_encoder.pkl", "wb"))

print(movies_df.head())

Mounted at /content/drive
      tconst         primaryTitle  startYear             genres  \
0  tt0070080  Barnen från Jordbro       1996        Documentary   
1  tt0070596    Socialist Realism       2023      Drama,History   
2  tt0093061            Funkytown       1998  Documentary,Music   
3  tt0095102        Fallen Couple       1995           Thriller   
4  tt0097066        Citizen Tania       1997        Crime,Drama   

   averageRating  movie_encoded  
0            7.1           6727  
1            7.1          48877  
2            7.7          20324  
3            7.6          18336  
4            7.7          11703  


In [None]:
    # Convert ratings to float and keep movies rated >= 7.0 that were released after 1994
    movies_df["averageRating"] = movies_df["averageRating"].astype(float)
    keep_rows = (movies_df["averageRating"] >= 7.0) & (movies_df["startYear"].astype(float) > 1994)
    # .copy() so the column assignment below does not write into a slice of the old frame.
    movies_df = movies_df[keep_rows].copy()

    # Encode movie titles as numerical indices (re-fit because the row set changed)
    label_encoder = LabelEncoder()
    movies_df["movie_encoded"] = label_encoder.fit_transform(movies_df["primaryTitle"])

    # Save preprocessed data; the context manager closes the pickle file deterministically
    movies_df.to_csv("/content/drive/MyDrive/movies_processed.csv", index=False)
    with open("/content/drive/MyDrive/label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)

    print(movies_df.head())

      tconst         primaryTitle  startYear             genres  \
0  tt0070080  Barnen från Jordbro       1996        Documentary   
1  tt0070596    Socialist Realism       2023      Drama,History   
2  tt0093061            Funkytown       1998  Documentary,Music   
3  tt0095102        Fallen Couple       1995           Thriller   
4  tt0097066        Citizen Tania       1997        Crime,Drama   

   averageRating  movie_encoded  
0            7.1           6727  
1            7.1          48877  
2            7.7          20324  
3            7.6          18336  
4            7.7          11703  


In [None]:
# Step 3: Mood-Based Genre Mapping
# Lookup table: lower-case mood name -> genre names to search for.
mood_genre_map = dict(
    happy=["Comedy", "Animation", "Adventure"],
    sad=["Drama", "Romance"],
    excited=["Action", "Thriller", "Sci-Fi"],
    calm=["Documentary", "Biography"],
    lonely=["Drama", "Romance", "Slice of Life"],
    stressed=["Comedy", "Feel-Good", "Music"],
    bored=["Mystery", "Sci-Fi", "Fantasy"],
    angry=["Action", "Crime", "Thriller"],
)


def get_genres_from_mood(mood):
    """Return the genre list mapped to ``mood`` (case-insensitive).

    Unknown moods yield an empty list.
    """
    mood_key = mood.lower()
    if mood_key in mood_genre_map:
        return mood_genre_map[mood_key]
    return []

In [None]:
# Demo: list the movies whose genres match the mood detected for a sample sentence.
# NOTE: analyze_mood is defined in the "Step 4" cell further down; that cell must
# have been run before this one.
genres = get_genres_from_mood(analyze_mood("very happy"))
genre_set = set(genres)
# Compare whole comma-separated genre tokens so "Music" does not also match "Musical".
mood_movies = movies_df[movies_df["genres"].apply(lambda x: isinstance(x, str) and not genre_set.isdisjoint(x.split(",")))]
mood_movie_indices = mood_movies["movie_encoded"].tolist()
print(mood_movies.head(10))
# Print a summary rather than the full id list, which can hold tens of thousands of entries.
print(f"{len(mood_movie_indices)} matching movies; first 10 encoded ids: {mood_movie_indices[:10]}")

       tconst                           primaryTitle  startYear  \
10  tt0108549             West from North Goes South       2004   
11  tt0109028                             Cold Fever       1995   
14  tt0109184                           Az igazi Mao       1995   
16  tt0109363                   Seeking the Cafe Bob       1995   
23  tt0109973     Halali oder Der Schuß ins Brötchen       1995   
33  tt0110384                              Looosers!       1995   
37  tt0110639                          Nella mischia       1995   
40  tt0110911                        Pullman paradis       1995   
42  tt0110985  Rhinoskin: The Making of a Movie Star       1995   
45  tt0111355                 Tales from a Hard City       1995   

                  genres  averageRating  movie_encoded  
10        Comedy,Mystery            7.9          64495  
11  Comedy,Drama,Mystery            7.0          12064  
14    Comedy,Documentary            8.0           6037  
16        Comedy,Romance          

In [None]:
# Step 4: Sentiment Analysis for Chatbot
# Pin the checkpoint and revision that the unpinned call resolved to, so results
# stay reproducible if the library's default model changes.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

def analyze_mood(user_input):
    """Map free-text user input to one of the mood names used by the genre map.

    The sentiment classifier provides a coarse default ("happy" for a positive
    label, "sad" for a negative one, "calm" for anything else). A keyword match
    then overrides that default with a more specific mood.

    Args:
        user_input: Text typed by the user.

    Returns:
        One of "happy", "sad", "calm", "excited", "lonely", "stressed",
        "bored" or "angry".
    """
    sentiment = sentiment_analyzer(user_input)[0]
    label = sentiment["label"].lower()

    if label == "positive":
        detected_mood = "happy"
    elif label == "negative":
        detected_mood = "sad"
    else:
        detected_mood = "calm"

    # Keyword-based mood detection
    mood_keywords = {
        "excited": ["excited", "thrilled", "energetic", "pumped"],
        "lonely": ["lonely", "alone", "isolated"],
        "stressed": ["stressed", "overwhelmed", "anxious"],
        "bored": ["bored", "nothing to do", "dull"],
        "angry": ["angry", "frustrated", "mad"]
    }

    lowered_input = user_input.lower()
    for mood, keywords in mood_keywords.items():
        # Match whole words/phrases only: a plain substring test would let
        # "mad" fire on "made" and report an unrelated mood.
        if any(re.search(rf"\b{re.escape(word)}\b", lowered_input) for word in keywords):
            return mood

    return detected_mood



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Quick check of the mood detector on a sample sentence.
user_text = "my day was very dull"
user_mood = analyze_mood(user_text)
print("Detected Mood:", user_mood)

Detected Mood: bored


In [None]:
# Step 5: Save/Load State for Persistent Recommendations and Feedback
# Pickle files on the mounted Drive that hold state between notebook sessions.
recommendations_file = "/content/drive/MyDrive/recommendations.pkl"
feedback_file = "/content/drive/MyDrive/u_feedback.pkl"

# In-memory defaults; the load_* helpers replace them when a saved file exists.
recommended_movies = []
user_feedback = {}

def save_recommendations():
    """Pickle the in-memory ``recommended_movies`` list to ``recommendations_file``."""
    with open(recommendations_file, mode="wb") as sink:
        pickle.dump(recommended_movies, sink)

def load_recommendations():
    """Restore ``recommended_movies`` from ``recommendations_file`` when that file exists.

    The current in-memory value is left untouched if nothing has been saved yet.
    """
    global recommended_movies
    if not os.path.exists(recommendations_file):
        return
    # pickle can execute code while loading; only read files this notebook wrote.
    with open(recommendations_file, mode="rb") as source:
        recommended_movies = pickle.load(source)

def save_feedback():
    """Pickle the in-memory ``user_feedback`` mapping to ``feedback_file``."""
    with open(feedback_file, mode="wb") as sink:
        pickle.dump(user_feedback, sink)

def load_feedback():
    """Restore ``user_feedback`` from ``feedback_file`` when that file exists.

    The current in-memory value is left untouched if nothing has been saved yet.
    """
    global user_feedback
    if not os.path.exists(feedback_file):
        return
    # pickle can execute code while loading; only read files this notebook wrote.
    with open(feedback_file, mode="rb") as source:
        user_feedback = pickle.load(source)

# Pull any previously saved recommendations and feedback into memory.
load_recommendations()
load_feedback()

In [None]:
# Previous
# Step 6: Interactive Chatbot for Personalized Suggestions
def collect_feedback():
    """Ask whether the user liked each recommended movie and persist the answers.

    Accepts "yes"/"y"/"1" as a like and "no"/"n"/"0"/"-1" as a dislike
    (case-insensitive) and asks again on anything else. Each answer is stored in
    ``user_feedback`` as 1 (liked) or -1 (disliked), keyed by movie title; the
    mapping is saved once after the last question.
    """
    liked_answers = {"yes", "y", "1"}
    disliked_answers = {"no", "n", "0", "-1"}
    for movie in recommended_movies:
        while True:
            # The prompt asks for yes/no, so parse words as well as the numeric
            # shorthand instead of calling int(), which crashed on "yes".
            answer = input(f"Did you like {movie}? (yes/no): ").strip().lower()
            if answer in liked_answers or answer in disliked_answers:
                break
            print("Please answer yes or no.")
        user_feedback[movie] = 1 if answer in liked_answers else -1
    save_feedback()

def recommend_movies_based_on_mood():
    """Ask for the user's mood, recommend up to ten matching movies, then collect feedback.

    The detected mood is mapped to a set of genres; the first ten rows of
    ``movies_df`` that carry at least one of those genres become
    ``recommended_movies``, which is saved before the feedback prompts start.
    """
    global recommended_movies
    user_input = input("How are you feeling today?")
    user_mood = analyze_mood(user_input)
    wanted_genres = set(get_genres_from_mood(user_mood))

    # Compare whole comma-separated genre tokens; a substring test would let
    # "Music" match "Musical" as well.
    genre_mask = movies_df["genres"].apply(
        lambda value: isinstance(value, str) and not wanted_genres.isdisjoint(value.split(","))
    )
    recommendations = movies_df[genre_mask]
    recommended_movies = recommendations["primaryTitle"].head(10).tolist()
    save_recommendations()
    print("Here are some movies you might enjoy:")
    print(recommendations[["primaryTitle", "averageRating"]].head(10))

    collect_feedback()

# Reuse a previously loaded recommendation list when there is one;
# otherwise start the interactive flow.
if recommended_movies:
    print("Loaded previous recommendations:", recommended_movies)
else:
    recommend_movies_based_on_mood()


Loaded previous recommendations: ['Het laatste Joegoslavische elftal', 'Bold Heroes', 'Halt', 'Last Woman Standing', 'Definitely, Maybe', 'Enter the Cage', 'El triángulo rosa y la cura nazi para la homosexualidad', 'Hazard', 'Against the Grain', 'American Widow']


In [None]:
# Re-read the persisted recommendations and feedback from disk.
load_recommendations()
load_feedback()

In [None]:
# Show the recommendation list currently held in memory.
print(recommended_movies)

['McConkey', 'Rascal Does Not Dream of a Knapsack Kid', 'Ramen Heads', 'Butak', 'Miraculous', 'Antim Sanskar: The Last Ritual', 'Caixa Preta', 'Rab Dian Rakhan', 'Gharamat AlFaqr', 'Maa']


In [None]:
# Main
# Step 6: Interactive Chatbot for Personalized Suggestions
def collect_feedback():
    """Ask whether the user liked each recommended movie and persist the answers.

    Accepts "yes"/"y"/"1" as a like and "no"/"n"/"0"/"-1" as a dislike
    (case-insensitive) and asks again on anything else. Each answer is stored in
    ``user_feedback`` as 1 (liked) or -1 (disliked), keyed by movie title; the
    mapping is saved once after the last question.
    """
    liked_answers = {"yes", "y", "1"}
    disliked_answers = {"no", "n", "0", "-1"}
    for movie in recommended_movies:
        while True:
            # The prompt asks for yes/no, so parse words as well as the numeric
            # shorthand instead of calling int(), which crashed on "yes".
            answer = input(f"Did you like {movie}? (yes/no): ").strip().lower()
            if answer in liked_answers or answer in disliked_answers:
                break
            print("Please answer yes or no.")
        user_feedback[movie] = 1 if answer in liked_answers else -1
    save_feedback()

def recommend_movies_based_on_mood():
    """Ask for the user's mood and recommend up to ten movies ranked by Q-value.

    Steps:
      1. Detect the mood from the typed text and map it to genres.
      2. Keep the movies that carry at least one of those genres.
      3. Score every action with ``q_network`` on a random state and walk the
         actions from highest to lowest Q-value, keeping those whose index is
         the ``movie_encoded`` id of a mood-matching movie.
      4. Save the resulting titles, print them and collect feedback.

    Updates the global ``recommended_movies``.
    """
    global recommended_movies
    user_input = input("How are you feeling today? ")
    user_mood = analyze_mood(user_input)
    wanted_genres = set(get_genres_from_mood(user_mood))

    # Get the movies that match the mood genres. Whole comma-separated tokens
    # are compared so that "Music" does not also match "Musical".
    genre_mask = movies_df["genres"].apply(
        lambda value: isinstance(value, str) and not wanted_genres.isdisjoint(value.split(","))
    )
    mood_movies = movies_df[genre_mask]

    # movie_encoded id -> title for the mood-matching movies. The Q-network's
    # output index is interpreted as a movie_encoded id, NOT as a row position
    # in movies_df, so titles are resolved through this mapping and not via iloc.
    title_by_code = dict(
        zip(mood_movies["movie_encoded"].tolist(), mood_movies["primaryTitle"].tolist())
    )

    # Convert to tensor and get Q-values
    with torch.no_grad():
        q_values = q_network(torch.tensor(np.random.rand(1, state_size), dtype=torch.float32))

    # Select top movies based on highest Q-values (dict lookup keeps this linear)
    ranked_codes = torch.argsort(q_values, descending=True).squeeze(0).tolist()
    recommended_movies = [title_by_code[code] for code in ranked_codes if code in title_by_code][:10]

    save_recommendations()
    print("Here are some movies you might enjoy based on your mood and feedback:")
    # Iterate over what was actually found; there may be fewer than ten matches.
    for title in recommended_movies:
        print(title)

    collect_feedback()



In [None]:
# Run the interactive flow: mood prompt -> recommendations -> feedback prompts.
recommend_movies_based_on_mood()

How are you feeling today? my day was very dull
Here are some movies you might enjoy based on your mood and feedback:
Kazakh Business in India
Letters from Kurdistan
Fatal Damage
Sathyam
To Kill an Irishman
Moharram - Jugend der ewigen Morgenröte
Circumstances 3
Il segreto nel segreto
The Rebel: Guido Picelli - The Forgotten Hero
Homo Novus
Did you like Kazakh Business in India? (yes/no): 1
Did you like Letters from Kurdistan? (yes/no): 1
Did you like Fatal Damage? (yes/no): 1
Did you like Sathyam? (yes/no): 1
Did you like To Kill an Irishman? (yes/no): 1
Did you like Moharram - Jugend der ewigen Morgenröte? (yes/no): 1
Did you like Circumstances 3? (yes/no): 1
Did you like Il segreto nel segreto? (yes/no): 1
Did you like The Rebel: Guido Picelli - The Forgotten Hero? (yes/no): 1
Did you like Homo Novus? (yes/no): 1


In [None]:
# Step 7: Reinforcement Learning Environment Setup
class MovieRecommendationEnv(gym.Env):
    """Toy recommendation environment whose reward is the stored user feedback.

    Each action is an index into ``movie_list``. Taking it yields
    ``user_feedback.get(movie_list[action], 0)`` as reward, so ``movie_list``
    must hold the same keys that ``user_feedback`` uses. Observations are random
    vectors of length 10 and do not depend on the action taken.

    NOTE(review): ``reset`` returns only the observation and ``step`` returns a
    4-tuple. Gymnasium's documented API is ``(obs, info)`` for ``reset`` and a
    5-tuple for ``step``. The return shapes are left as they were so existing
    callers keep working; confirm before using Gymnasium wrappers.
    """

    def __init__(self, movie_list):
        super().__init__()
        self.movie_list = movie_list
        self.action_space = gym.spaces.Discrete(len(movie_list))
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(10,), dtype=np.float32)
        # Defined up front so the attribute exists even before reset() is called.
        self.current_state = self._random_state()

    def _random_state(self):
        """Draw a random observation with the shape and dtype of ``observation_space``."""
        # np.random.rand yields float64; cast so the observation matches the
        # float32 dtype declared for the observation space.
        return np.random.rand(*self.observation_space.shape).astype(np.float32)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.current_state = self._random_state()
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        ``done`` is always False; episodes never terminate on their own.
        """
        movie_title = self.movie_list[action]
        reward = user_feedback.get(movie_title, 0)  # Reward based on past feedback
        done = False
        next_state = self._random_state()
        self.current_state = next_state
        return next_state, reward, done, {}


# The environment looks rewards up in user_feedback, which is keyed by movie
# title, so it must be given titles. Encoded ids never match a key and would
# make every reward 0.
movie_env = MovieRecommendationEnv(movies_df["primaryTitle"].tolist())

In [None]:
# Step 8: Optimized Q-Learning Model
class QNetwork(nn.Module):
    """Two-layer fully connected network: input -> 64 hidden units (ReLU) -> one output per action."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_units)
        self.fc2 = nn.Linear(in_features=hidden_units, out_features=output_dim)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden)

state_size = 10
action_size = len(movies_df)
q_network = QNetwork(state_size, action_size)
optimizer = optim.Adam(q_network.parameters(), lr=0.01)

# Mini-Batch Q-Learning Training
# Hoist the per-row lookups out of the loop; .iloc inside the loop did one
# DataFrame row access per sample per step.
movie_titles = movies_df["primaryTitle"].to_numpy()
# Action index == movie_encoded id, the convention the recommendation and
# feedback fine-tuning cells use when they index the network's outputs.
movie_codes = movies_df["movie_encoded"].to_numpy()

# Cap at the number of rows: sampling without replacement fails otherwise.
batch_size = min(32, len(movies_df))
for step in range(1000):
    batch_indices = np.random.choice(len(movies_df), batch_size, replace=False)
    batch_states = torch.tensor(np.random.rand(batch_size, state_size), dtype=torch.float32)
    batch_actions = torch.tensor(movie_codes[batch_indices], dtype=torch.int64)

    optimizer.zero_grad()
    output = q_network(batch_states)
    selected_q_values = output.gather(1, batch_actions.unsqueeze(1)).squeeze(1)
    # Target is the stored feedback for the sampled movie (0 when it has none).
    target_values = torch.tensor(
        [user_feedback.get(title, 0) for title in movie_titles[batch_indices]],
        dtype=torch.float32,
    )
    loss = nn.functional.mse_loss(selected_q_values, target_values)
    loss.backward()
    optimizer.step()

print("Colab Notebook Setup Complete! Ready for mid-semester review.")

Colab Notebook Setup Complete! Ready for mid-semester review.


In [None]:
# Step 9: Fine-Tuning Q-Learning Model Based on Feedback
# NOTE(review): this builds a fresh network, so whatever an earlier cell trained
# into `q_network` is discarded. Confirm that is intended for "fine-tuning".
state_size = 10
action_size = len(movies_df)
q_network = QNetwork(state_size, action_size)
optimizer = optim.Adam(q_network.parameters(), lr=0.01)

# Train only on feedback
# Resolve title -> movie_encoded from movies_df itself. This yields the ids the
# label encoder assigned, works when the data came from the cached CSV, and
# lets titles that are no longer in the dataset be skipped instead of raising.
code_by_title = dict(zip(movies_df["primaryTitle"], movies_df["movie_encoded"]))
known_feedback = [(title, reward) for title, reward in user_feedback.items() if title in code_by_title]
# Cap the batch at 32 and slice titles and rewards to the same length, so
# states, actions and rewards always have matching first dimensions.
known_feedback = known_feedback[:32]

batch_size = len(known_feedback)
if batch_size > 0:
    batch_states = torch.tensor(np.random.rand(batch_size, state_size), dtype=torch.float32)
    # Convert movie titles to encoded indices
    batch_action_titles = [title for title, _ in known_feedback]  # Movie titles
    batch_actions = torch.tensor([int(code_by_title[title]) for title in batch_action_titles], dtype=torch.int64)

    batch_rewards = torch.tensor([reward for _, reward in known_feedback], dtype=torch.float32)

    optimizer.zero_grad()
    output = q_network(batch_states)
    selected_q_values = output.gather(1, batch_actions.unsqueeze(1)).squeeze(1)
    loss = nn.functional.mse_loss(selected_q_values, batch_rewards)
    loss.backward()
    optimizer.step()

print("Colab Notebook Setup Complete! Ready for mid-semester review.")

Colab Notebook Setup Complete! Ready for mid-semester review.


In [None]:
# Show the recommendation list currently held in memory.
print(recommended_movies)

['Socialist Realism', 'Citizen Tania', 'Cold Fever', 'White Feast', 'Seeking the Cafe Bob', 'Nude with Oranges', 'Dolores Claiborne', 'Eden Valley', 'Esti Kornél csodálatos utazása', 'Hei shan lu']
