# Integrating a Transformer as an External Recommender

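This notebook implements a conversational movie recommender that combines an LLM, retrieval-augmented generation (RAG) over the INSPIRED dataset, and a Transformer model used as an external recommender.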

## Environment Initialization

In [1]:
# Set and report the working directory used by the rest of the notebook.
import os

# Project root. Override it with the RAG_PROJECT_DIR environment variable;
# the original author's local path is kept as the fallback so existing
# setups behave exactly as before.
project_dir = os.environ.get(
    "RAG_PROJECT_DIR", "C:/Users/91953/JupyterNotebooks/RAG-2-optimized"
)

# Only change directory when the target actually exists, so the notebook
# keeps running from its launch directory on other machines instead of
# failing with FileNotFoundError.
if os.path.isdir(project_dir):
    os.chdir(project_dir)
else:
    print(f"Project directory '{project_dir}' not found; staying in the current directory.")

# Check if in correct directory
current_directory = os.getcwd()
print("Current Working Directory:", current_directory)

Current Working Directory: C:\Users\91953\JupyterNotebooks\RAG-2-optimized


### Requirements

In [2]:
!pip install -r requirements.txt



### RAG, LLM, Global Settings

In [3]:
from pathlib import Path
import csv
import subprocess
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from tqdm import tqdm
import torch
# Import the transformer recommender
from scripts.transformer_recommender import TransformerRecommender, INSPIREDDataProcessor

In [4]:
# Embedding model served through Ollama.
embed_model = OllamaEmbedding(model_name="nomic-embed-text", request_timeout=300.0)

# Chat/completion model served through Ollama.
# NOTE(review): num_gpu=0 presumably keeps inference on the CPU - confirm
# against the Ollama model options.
llm = Ollama(
    model="llama3.2:1b",
    request_timeout=300.0,
    temperature=0.7,
    additional_kwargs={"num_gpu": 0},
)

# Register both models as the llama_index global defaults.
Settings.embed_model, Settings.llm = embed_model, llm

# Module-level handles for the external recommender; they stay None until
# initialize_transformer_model() assigns them.
transformer_model = data_processor = None

### Load Dataset

In [5]:
def load_inspired_dataset(data_path, max_rows=None):
    """
    Load the INSPIRED dataset from a TSV file and convert each row to a document.

    Args:
        data_path: Path to the TSV file.
        max_rows: Maximum number of rows to load (None = load all rows).
            Zero or a negative value loads nothing.

    Returns:
        A list of Document objects, one per loaded row. Each document's text
        holds the speaker, the utterance and (when present) the mentioned
        movie; its metadata holds dialog_id, turn_id, speaker and movie_name.

    Raises:
        FileNotFoundError: If data_path does not exist.
    """
    from itertools import islice

    if not Path(data_path).exists():
        raise FileNotFoundError(f"INSPIRED dataset not found at '{data_path}'.")

    # Normalise the limit once; islice() rejects negative stop values.
    row_limit = None if max_rows is None else max(max_rows, 0)

    # First pass: count lines for the progress bar. The header is subtracted
    # and the result is clamped so an empty file does not give a total of -1.
    # This is only an estimate when quoted fields contain line breaks.
    with open(data_path, 'r', encoding='utf-8') as f:
        total_rows = max(sum(1 for _ in f) - 1, 0)

    if row_limit is not None:
        total_rows = min(total_rows, row_limit)

    documents = []

    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are parsed correctly.
    with open(data_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')

        # islice stops after row_limit rows (None means "no limit").
        limited_rows = islice(reader, row_limit)

        for row in tqdm(limited_rows, total=total_rows, desc="Loading data"):
            # DictReader fills missing trailing columns with None; fall back to
            # '' so the literal text "None" never leaks into the document.
            dialog_id = row.get('dialog_id') or ''
            turn_id = row.get('turn_id') or ''
            utterance = row.get('utterance') or ''
            speaker = row.get('speaker') or ''
            movie_name = row.get('movie_name') or ''

            # Create document text
            doc_text = f"Speaker: {speaker}\nUtterance: {utterance}\n"

            if movie_name:
                doc_text += f"Movie mentioned: {movie_name}\n"

            # Create metadata; a missing movie is stored as the string "None".
            metadata = {
                "dialog_id": dialog_id,
                "turn_id": turn_id,
                "speaker": speaker,
                "movie_name": movie_name if movie_name else "None"
            }

            documents.append(Document(text=doc_text, metadata=metadata))

    print(f"Loaded {len(documents)} turns from INSPIRED dataset")
    return documents

In [6]:
# NOTE: this cell defines nothing. Both triple-quoted blocks below are bare
# string expressions. The first is a description; the second holds a disabled
# copy of load_movie_database() kept for reference only.
'''
Customized Function is created in the external recommender python file

Functions:
    - Load the unprocessed INSPIRED movie database for reference

'''
# Disabled implementation: it is never executed. The notebook instead calls
# load_movie_database() on an INSPIREDDataProcessor instance (see
# initialize_transformer_model). Because this string is the cell's last
# expression, Jupyter echoes it back as the cell output.
'''

def load_movie_database(dataset_dir="data"):
    
    movie_db_path = Path(dataset_dir) / "raw" / "movie_database.tsv"
    
    if not movie_db_path.exists():
        print(f"Movie database not found at {movie_db_path}")
        return {}
    
    movies = {}
    with open(movie_db_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            movie_id = row.get('movieId', '')
            movies[movie_id] = row
    
    print(f"Loaded {len(movies)} movies from movie database")
    return movies
'''

'\n\ndef load_movie_database(dataset_dir="data"):\n\n    movie_db_path = Path(dataset_dir) / "raw" / "movie_database.tsv"\n\n    if not movie_db_path.exists():\n        print(f"Movie database not found at {movie_db_path}")\n        return {}\n\n    movies = {}\n    with open(movie_db_path, \'r\', encoding=\'utf-8\') as f:\n        reader = csv.DictReader(f, delimiter=\'\t\')\n        for row in reader:\n            movie_id = row.get(\'movieId\', \'\')\n            movies[movie_id] = row\n\n    print(f"Loaded {len(movies)} movies from movie database")\n    return movies\n'

## Define Functions

### Create Vector Index

In [7]:
def load_and_index_documents(dataset_dir="data", split="train", max_rows=None):
    """
    Load one split of the INSPIRED dataset and build a vector index over it.

    Args:
        dataset_dir: Directory containing the dataset; the split is read from
            <dataset_dir>/processed/<split>.tsv.
        split: Which split to use (train, dev, test).
        max_rows: Maximum number of rows to load (None for all rows).

    Returns:
        VectorStoreIndex built from the loaded documents, for use in RAG.

    Raises:
        FileNotFoundError: If the split file does not exist.
        ValueError: If no documents were loaded from the file.
    """
    split_file = Path(dataset_dir) / "processed" / f"{split}.tsv"
    if not split_file.exists():
        raise FileNotFoundError(
            f"Data file not found at {split_file}. \nCheck for typos."
        )

    # One document per dialogue turn, capped at max_rows.
    turn_docs = load_inspired_dataset(split_file, max_rows=max_rows)
    if not turn_docs:
        raise ValueError("No documents loaded from INSPIRED dataset")

    # The movie database is not loaded here; initialize_transformer_model()
    # loads it for the external recommender.
    print("Building vector index...")
    return VectorStoreIndex.from_documents(
        turn_docs,
        embed_model=embed_model,
        show_progress=True,
    )

### Query Engine

In [8]:
def create_query_engine(index, similarity_top_k=5):
    """
    Create a query engine with the specified retrieval parameters.

    Args:
        index: Index object exposing as_query_engine().
        similarity_top_k: Value forwarded as the similarity_top_k retrieval
            setting.

    Returns:
        The query engine returned by index.as_query_engine(), configured with
        the notebook's global llm, "compact" response mode and streaming on.
    """
    engine_options = {
        "llm": llm,
        "similarity_top_k": similarity_top_k,
        "response_mode": "compact",
        "streaming": True,
    }
    return index.as_query_engine(**engine_options)

### Adapt External Recommender

In [9]:
def initialize_transformer_model(dataset_dir="data"):
    """
    Create the transformer recommender and its data processor.

    Both objects are stored in the module-level names transformer_model and
    data_processor and are also returned. No training happens here.

    Args:
        dataset_dir: Directory handed to INSPIREDDataProcessor.

    Returns:
        Tuple (transformer_model, data_processor).
    """
    global transformer_model, data_processor

    print("|| INITIALIZING TRANSFORMER RECOMMENDER ||")

    # The processor loads the movie database; only the id map's size is
    # needed here, to size the model.
    data_processor = INSPIREDDataProcessor(dataset_dir)
    id_map, _name_map = data_processor.load_movie_database()
    transformer_model = TransformerRecommender(num_movies=len(id_map))

    # TODO: Add training code here to train from scratch.
    # Until then the model is used for inference with its initial
    # (random) weights.

    print("Transformer model initialized")
    return transformer_model, data_processor

In [10]:
def get_transformer_recommendations(conversation_history, top_k=3):
    """
    Get top-k movie recommendations from the transformer.

    Args:
        conversation_history: List of conversation turns as strings.
        top_k: Number of recommendations requested from the model.

    Returns:
        Whatever transformer_model.predict_top_k() returns for the recent
        conversation text.

    Raises:
        ValueError: If the transformer model or data processor has not been
            initialized.
    """
    ready = transformer_model is not None and data_processor is not None
    if not ready:
        raise ValueError("Transformer model not initialized")

    # Only the last six entries are used (three user/assistant exchanges),
    # flattened into a single space-separated string.
    recent_turns = conversation_history[-6:]
    conversation_text = " ".join(recent_turns)

    return transformer_model.predict_top_k(
        conversation_text,
        data_processor.movie_name_map,
        k=top_k,
    )

### Interactive Conversation

In [11]:
def _build_enhanced_query(user_input, recommendations):
    """Compose the LLM prompt from the user's message and the transformer's picks."""
    rec_text = "\n".join(
        f"- {rec['movie_name']} (score: {rec['score']:.3f})"
        for rec in recommendations
    )
    return f"""
User query: 
{user_input}

Top recommended movies from our recommendation system:
{rec_text}

Based on the user's preferences and these recommendations, provide a natural, 
conversational response suggesting appropriate movies.
"""


def interactive_conversation_with_transformer(max_rows=100, top_k=5, similarity_top_k=5):
    """
    Run an interactive conversation with the transformer + LLM pipeline,
    keeping a running conversation history.

    For every user message the transformer recommender is queried first; its
    recommendations are embedded into the prompt sent to the chat engine.

    Args:
        max_rows: Number of training rows indexed for retrieval.
        top_k: Number of transformer recommendations passed to the LLM.
        similarity_top_k: Value forwarded to the chat engine as its
            similarity_top_k retrieval setting.

    Returns:
        True when the conversation ended normally (quit command, EOF or
        interrupt at the prompt); False when setup or the loop raised an
        unexpected exception.
    """
    recs_status = "[Generating recommendations...]"
    response_status = "[Generating response...]"

    try:
        # Load documents and create index
        print("\nLoading INSPIRED dataset...")
        index = load_and_index_documents(split="train", max_rows=max_rows)

        # Sets the module-level transformer_model / data_processor that
        # get_transformer_recommendations() reads; the return value is not
        # needed here.
        initialize_transformer_model()

        # Create chat engine
        chat_engine = index.as_chat_engine(
            llm=llm,
            similarity_top_k=similarity_top_k,
            chat_mode="context"
        )

        print("|| MovieCRS ready! ||")
        print("(Using Transformer + LLM Pipeline)")

        print("\nYou can now ask for movie recommendations.")
        print("Type 'quit', 'exit', or 'q' to end the conversation.\n")

        conversation_history = []

        while True:
            try:
                user_input = input("You: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or an interrupt at the prompt ends the chat
                # cleanly instead of being reported as a system failure.
                print("\nMovieCRS: Bye.")
                break

            if user_input.lower() in ['quit', 'exit', 'q', 'bye']:
                print("\nMovieCRS: Bye.")
                break

            if not user_input:
                continue

            try:
                # Add user input to history
                conversation_history.append(f"User: {user_input}")

                # STEP 1: Get transformer recommendations
                print(f"\n{recs_status}", end="", flush=True)
                transformer_recs = get_transformer_recommendations(
                    conversation_history, top_k=top_k
                )

                # STEP 2: Create enhanced query with transformer recommendations
                enhanced_query = _build_enhanced_query(
                    user_input, transformer_recs[:top_k]
                )

                # STEP 3: Get LLM response. The new status is padded to the
                # width of the previous one so "\r" fully overwrites it and no
                # trailing characters are left on screen.
                print(
                    f"\r{response_status.ljust(len(recs_status))}",
                    end="",
                    flush=True,
                )
                response = chat_engine.chat(enhanced_query)

                print("\nMovieCRS: ", end="", flush=True)
                print(f"{response.response}\n")

                # Add to history
                conversation_history.append(f"MovieCRS: {response.response}")

            except Exception as e:
                # A failed turn is reported and the conversation continues.
                print(f"\nError: {str(e)}\n")

        return True

    except Exception as e:
        print(f"System Error: {str(e)}")
        return False

## Main

In [None]:
# Entry point: launch the interactive recommender session.
if __name__ == "__main__":
    print("Starting Transformer + LLM Pipeline with INSPIRED Dataset...")

    # The call returns False when setup or the conversation loop failed.
    success = interactive_conversation_with_transformer()
    if not success:
        print("\nSystem failed to start. Check the error messages above.")

Starting Transformer + LLM Pipeline with INSPIRED Dataset...

Loading INSPIRED dataset...


Loading data: 100%|██████████| 100/100 [00:00<00:00, 17050.02it/s]

Loaded 100 turns from INSPIRED dataset
Building vector index...





Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/100 [00:00<?, ?it/s]

|| INITIALIZING TRANSFORMER RECOMMENDER ||
Loading 17869 movies from database...
Loaded 16764 movies
Transformer model initialized
|| MovieCRS ready! ||
(Using Transformer + LLM Pipeline)

You can now ask for movie recommendations.
Type 'quit', 'exit', or 'q' to end the conversation.



You:  i like horror but not today, now i want romance



[Generating response...]    ..]
MovieCRS: Speaker: RECOMMENDER
Utterance:

Hey there! Since you're in the mood for something different from horror movies today, I've got some romance movie suggestions that might interest you. 

Based on your preferences and my previous recommendations, here are a few options to consider:

- Steel (1997) is a romantic thriller starring Bruce Willis and Angelina Jolie.
- Fire and Ice (2016) is a drama film based on the novel "Fire and Ice" by Ayn Rand.
- The Covenant (2003) has a unique storyline that might appeal to horror fans looking for something different.

Give these movies a try, or let me know if you'd like more recommendations!



## Evaluation

### Standard Metrics

### Contextual Metrics

## Conclusion