# Integrating NCF as an External Recommenders

In this notebook, the LLM + RAG + NCF model is implemented.

## Environment Initialization

In [3]:
'''
Check Current Working Directory
'''
import os
import sys
from pathlib import Path

# Set working directory
os.chdir("..")

# Add project root to Python path
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("Project Root:", project_root)
print("Current Working Directory:", os.getcwd())

Project Root: C:\Users\91953\Documents\GitHub\RAG-Movie-CRS
Current Working Directory: C:\Users\91953\Documents\GitHub\RAG-Movie-CRS


### Requirements

In [5]:
!pip install -r requirements.txt



### RAG, LLM, Global Settings

In [14]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import csv
from pathlib import Path

# Import LlamaIndex components
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

from scripts.ncf_recommender import NCFModel, INSPIREDDataProcessor

In [15]:
# Initialize the embedding model
embed_model = OllamaEmbedding(
    model_name="nomic-embed-text",
    request_timeout=300.0,
)

# Initialize the LLM with optimized settings
llm = Ollama(
    model="llama3.2:1B",
    request_timeout=300.0,
    temperature=0.1,
    additional_kwargs={"num_gpu": 0}  # Forcing CPU usage
)

# Set global configurations
Settings.embed_model = embed_model
Settings.llm = llm

### Load Dataset

In [16]:
'''
Functions:
    - Load INSPIRED dataset from TSV 
    - Convert to documents

Args:
    - data_path: Path to the TSV file
    - max_rows: Maximum number of rows to load (None = load all rows)
'''

def load_inspired_dataset(data_path, max_rows=None):
    
    if not Path(data_path).exists():
        raise FileNotFoundError(f"INSPIRED dataset not found at '{data_path}'.")
    
    documents = []
    
    # First pass: count total rows for progress bar
    with open(data_path, 'r', encoding='utf-8') as f:
        total_rows = sum(1 for _ in f) - 1  # Subtract header row
    
    if max_rows is not None:
        total_rows = min(total_rows, max_rows)
    
    # Load the TSV data
    with open(data_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        
        for idx, row in enumerate(tqdm(reader, total=total_rows, desc="Loading data")):
            # Stop if we've reached max_rows
            if max_rows is not None and idx >= max_rows:
                break
                
            # Extract conversation information from TSV
            dialog_id = row.get('dialog_id', '')
            turn_id = row.get('turn_id', '')
            utterance = row.get('utterance', '')
            speaker = row.get('speaker', '')
            movie_name = row.get('movies', '')
            
            # Create document text
            doc_text = f"Speaker: {speaker}\nUtterance: {utterance}\n"
            
            if movie_name:
                doc_text += f"Movie mentioned: {movie_name}\n"
            
            # Create metadata
            metadata = {
                "dialog_id": dialog_id,
                "turn_id": turn_id,
                "speaker": speaker,
                "movie_name": movie_name if movie_name else "None"
            }
            
            # Create Document object
            doc = Document(text=doc_text, metadata=metadata)
            documents.append(doc)
    
    print(f"Loaded {len(documents)} turns from INSPIRED dataset")
    return documents

In [17]:
'''
Functions:
    - Load the unprocessed INSPIRED movie database for reference
'''

def load_movie_database(dataset_dir="data"):
    
    movie_db_path = Path(dataset_dir) / "raw" / "movie_database.tsv"
    
    if not movie_db_path.exists():
        print(f"Movie database not found at {movie_db_path}")
        return {}
    
    movies = {}
    with open(movie_db_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            movie_id = row.get('movieId', '')
            movies[movie_id] = row
    
    print(f"Loaded {len(movies)} movies from movie database")
    return movies

## Define Functions

### Create Vector Index

In [18]:
'''
Function:
    - Load INSPIRED dataset
    - Create vector index

Args:
    - dataset_dir: Directory containing the dataset
    - split: Which split to use (train, dev, test)
    - max_rows: Maximum number of rows to load (None for all rows)
    - load_movie_db: Whether to load movie database (needed for external recommenders)

Returns:
    - index: VectorStoreIndex for RAG
    - movie_db: Movie database (if load_movie_db=True), else None
'''

def load_and_index_documents(dataset_dir="data", split="train", max_rows=None, load_movie_db=True):
        
    # Construct path to the data file 
    data_path = Path(dataset_dir) / "processed" / f"{split}.tsv"
    
    if not data_path.exists():
        raise FileNotFoundError(
            f"Data file not found at {data_path}. \nCheck for typos."
        )
    
    # Load INSPIRED documents with max_rows limit
    docs = load_inspired_dataset(data_path, max_rows=max_rows)
    
    if not docs:
        raise ValueError("No documents loaded from INSPIRED dataset")
    
    # load movie database for reference
    # Used by external recommenders
    movie_db = load_movie_database(dataset_dir)
    
    # Load documents, then
    # Build vector index from documents
    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        docs, 
        embed_model=embed_model,
        show_progress=True
    )
    
    return index, movie_db

### Query Engine

In [19]:
'''
Function:
    - Create query engine with specified retrieval parameters
'''

def create_query_engine(index, similarity_top_k=5):
    
    query_engine = index.as_query_engine(
        llm=llm,
        similarity_top_k=similarity_top_k,
        response_mode="compact",
        streaming=True
    )
    
    return query_engine

### Adapt External Recommender

### Interactive Conversation

In [20]:
'''
Interactive conversation with history tracking
'''
def interactive_conversation_with_history():
    
    try:
        # Load documents and create index
        print("\nLoading INSPIRED dataset...")
        index, _ = load_and_index_documents(split="train", max_rows=20)
        
        # Create chat engine instead of query engine
        chat_engine = index.as_chat_engine(
            llm=llm,
            similarity_top_k=5,
            chat_mode="context"  # Use context mode for history
        )
        
        
        print("|| MovieCRS is Ready ||")
        
        print("\nYou can now ask for movie recommendations.")
        print("Type 'quit', 'exit', or 'q' to end the conversation.\n")
        
        while True:
            user_input = input("You: ").strip()
            
            if user_input.lower() in ['quit', 'exit', 'q', 'bye']:
                print("\nMovieCRS: Exiting...")
                break
            
            if not user_input:
                continue
            
            try:
                # Use chat() instead of query() - it handles history automatically
                print("\nMovieCRS: ", end="", flush=True)
                response = chat_engine.chat(user_input)
                
                # Print only the response text
                print(f"{response.response}\n")
                
            except Exception as e:
                print(f"Error: {str(e)}\n")
        
        return True
    
    except Exception as e:
        print(f"System Error: {str(e)}")
        return False

## Main

In [21]:
# Main execution
if __name__ == "__main__":
    
    print("Starting RAG Pipeline with INSPIRED Dataset...")
    
    # Start interactive conversation with history
    success = interactive_conversation_with_history()
    
    if not success:
        print("\nSystem failed to start.")

Starting RAG Pipeline with INSPIRED Dataset...

Loading INSPIRED dataset...
System Error: name 'csv' is not defined

System failed to start.


## Evaluation

### Standard Metrics

In [None]:
from scripts.evaluator import RecommenderEvaluator

print("RUNNING NCF EVALUATION")

# Initialize evaluator
evaluator = RecommenderEvaluator(k_values=[1, 3, 5, 10])

# Load test data
data_processor = INSPIREDDataProcessor(dataset_dir="data")
data_processor.load_movie_database()
data_processor.load_dialogs(split="test", max_dialogs=None)

# Build ground truth
ground_truth = {}
for user_idx, item_idx, rating in data_processor.interactions:
    if user_idx not in ground_truth:
        ground_truth[user_idx] = set()
    ground_truth[user_idx].add(item_idx)

# Prepare predictions list
predictions = []

# Get all movie indices
all_movie_indices = torch.tensor(
    list(ncf_mappings['movie_to_idx'].values()),
    dtype=torch.long
).to(device)

# Evaluate users
test_users = list(ground_truth.keys())[:100]
print(f"\nEvaluating on {len(test_users)} users...\n")

for user_idx in tqdm(test_users, desc="Generating predictions"):
    if user_idx not in ground_truth or not ground_truth[user_idx]:
        continue
    
    # Get recommendations
    recommendations = ncf_model.predict_top_k(
        user_idx=user_idx,
        candidate_items=all_movie_indices,
        k=10
    )
    
    recommended_ids = [rec['item_idx'] for rec in recommendations]
    
    # Add to predictions
    predictions.append({
        'recommended': recommended_ids,
        'ground_truth': ground_truth[user_idx]
    })

# Evaluate batch
results = evaluator.evaluate_batch(predictions)

# Print results
evaluator.print_results(
    dataset_name="NCF Test Set",
    num_samples=len(predictions)
)

In [None]:
# Save results
output_path = "models/ncf_checkpoints/evaluation_results.json"

evaluator.save_results(
    output_path=output_path,
    model_name="NCF+LLM+RAG",
    metadata={
        'num_users': ncf_model.num_users,
        'num_items': ncf_model.num_items,
        'embedding_dim': checkpoint['embedding_dim'],
        'test_samples': len(predictions)
    }
)

# Show results table
results_table = evaluator.get_summary_table()
print("\nResults Summary:")
print(results_table)

### Contextual Metrics

## Conclusion

## DEBUGGING

In [2]:
'''
Check Current Working Directory
'''
import os

# Change Working Directory if needed 
os.chdir("C:/Users/91953/JupyterNotebooks/RAG-2-optimized")

# Check if in correct directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)


Current Working Directory: C:\Users\91953\JupyterNotebooks\RAG-2-optimized


In [3]:

import csv
from pathlib import Path

def check_columns(file_path, file_description):
    """Check and print column names from a TSV file"""
    print(f"\n{'='*60}")
    print(f"{file_description}")
    print(f"File: {file_path}")
    print(f"{'='*60}")
    
    if not Path(file_path).exists():
        print(f"❌ FILE NOT FOUND!")
        return None
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f, delimiter='\t')
            columns = reader.fieldnames
            
            print(f"✓ Found {len(columns)} columns:")
            for i, col in enumerate(columns, 1):
                print(f"  {i}. {col}")
            
            # Print first row as sample
            print(f"\nSample row (first row):")
            first_row = next(reader, None)
            if first_row:
                for col in columns[:5]:  # Show first 5 columns
                    value = first_row.get(col, '')
                    # Truncate long values
                    if len(str(value)) > 50:
                        value = str(value)[:50] + "..."
                    print(f"  {col}: {value}")
            
            return columns
    
    except Exception as e:
        print(f"❌ ERROR: {e}")
        return None


if __name__ == "__main__":
    print("\n" + "="*60)
    print("CHECKING INSPIRED DATASET COLUMN NAMES")
    print("="*60)
    
    # Define file paths - UPDATE THESE TO YOUR ACTUAL PATHS
    dataset_dir = "data"  # Change this if needed
    
    files_to_check = [
        (f"{dataset_dir}/raw/movie_database.tsv", "Movie Database"),
        (f"{dataset_dir}/processed/train.tsv", "Training Dialog Data"),
        (f"{dataset_dir}/processed/test.tsv", "Test Dialog Data"),
        (f"{dataset_dir}/processed/dev.tsv", "Dev Dialog Data"),
    ]
    
    print(f"\nDataset directory: {dataset_dir}")
    print(f"Checking {len(files_to_check)} files...\n")
    
    for file_path, description in files_to_check:
        check_columns(file_path, description)
    
    print("\n" + "="*60)
    print("DONE")
    print("="*60)





CHECKING INSPIRED DATASET COLUMN NAMES

Dataset directory: data
Checking 4 files...


Movie Database
File: data/raw/movie_database.tsv
✓ Found 30 columns:
  1. title
  2. year
  3. trailer_duration
  4. actors
  5. awards
  6. box_office
  7. country
  8. director
  9. dvd_release
  10. genre
  11. imdb_id
  12. imdb_type
  13. imdb_votes
  14. language
  15. long_plot
  16. movie_runtime
  17. poster
  18. production
  19. rated
  20. rating
  21. release_date
  22. short_plot
  23. video_id
  24. writer
  25. youtube_comment
  26. youtube_dislike
  27. youtube_favorite
  28. youtube_like
  29. youtube_link
  30. youtube_view

Sample row (first row):
  title: Antlers
  year: 2020
  trailer_duration: 123.0
  actors: Keri Russell, Jesse Plemons, Jeremy T. Thomas, Gra...
  awards: 

Training Dialog Data
File: data/processed/train.tsv
✓ Found 19 columns:
  1. dialog_id
  2. utt_id
  3. speaker
  4. turn_id
  5. text
  6. text_with_placeholder
  7. movies
  8. genres
  9. people_names
  1