In [6]:
import os

# Show the kernel's current directory: the relative paths used further down
# ('knowledge_base.pkl', 'data/ml-32m/movies.csv') resolve against it.
print(f"Working Directory: {os.getcwd()}")


Working Directory: d:\KRISPI\Code\poisoning_detection


In [5]:
cd ..

d:\KRISPI\Code\poisoning_detection


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [7]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

class RAGRecommender:
    """
    Simple RAG System for Movie Recommendations

    Each movie is turned into a short "title + genres" sentence and embedded
    with a SentenceTransformer model. Queries are embedded with the same model
    and matched against the movies by cosine similarity, optionally followed
    by a heuristic rerank.

    Attributes:
        model: SentenceTransformer used to embed both movies and queries.
        knowledge_base: DataFrame copy of the movies with added 'description'
            and 'embedding' columns; None until built or loaded.
        embeddings: Output of model.encode for all descriptions; row i belongs
            to the i-th row (by position) of knowledge_base. None until built
            or loaded.
        is_built: True once a knowledge base has been built or loaded.
    """

    # Genre keywords the reranking heuristic looks for in queries and in the
    # 'genres' column.
    _BOOST_GENRES = ('action', 'comedy', 'drama', 'horror', 'romance')

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Load the embedding model; the knowledge base starts out empty.

        Args:
            model_name: Name passed to SentenceTransformer.
        """
        self.model = SentenceTransformer(model_name)
        self.knowledge_base = None
        self.embeddings = None
        self.is_built = False

    def build_knowledge_base(self, movies_df, save_path='knowledge_base.pkl'):
        """
        Build vector database from movie data

        Args:
            movies_df: DataFrame with at least 'title' and 'genres' columns.
                It is copied, not modified.
            save_path: File the knowledge base and embeddings are pickled to.

        Returns:
            The knowledge-base DataFrame (movies_df plus 'description' and
            'embedding' columns).
        """
        print("Building RAG Knowledge Base...")

        # Create descriptive texts for each movie
        descriptions = [
            f"Movie: {title}. Genres: {genres}."
            for title, genres in zip(movies_df['title'], movies_df['genres'])
        ]

        # Generate embeddings
        print("Generating embeddings...")
        self.embeddings = self.model.encode(descriptions, show_progress_bar=True)

        # Build knowledge base
        self.knowledge_base = movies_df.copy()
        self.knowledge_base['description'] = descriptions
        self.knowledge_base['embedding'] = list(self.embeddings)

        self.is_built = True
        print(f"Knowledge base built with {len(self.knowledge_base)} items")

        # Save for later use
        with open(save_path, 'wb') as f:
            pickle.dump({
                'knowledge_base': self.knowledge_base,
                'embeddings': self.embeddings
            }, f)
        print(f"Knowledge base saved to {save_path}")

        return self.knowledge_base

    def load_knowledge_base(self, load_path='knowledge_base.pkl'):
        """
        Load pre-built knowledge base

        Args:
            load_path: Pickle file previously written by build_knowledge_base.

        Returns:
            True if the file existed and was loaded, False otherwise.
        """
        if not os.path.exists(load_path):
            print("No saved knowledge base found. Please build first.")
            return False

        # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
        # file. Only load knowledge bases that you created yourself.
        with open(load_path, 'rb') as f:
            data = pickle.load(f)
        self.knowledge_base = data['knowledge_base']
        self.embeddings = data['embeddings']
        self.is_built = True
        print(f"Knowledge base loaded from {load_path}")
        return True

    @staticmethod
    def _top_k_indices(similarities, top_k):
        """
        Return the positions of the top_k largest scores, best first.

        A non-positive top_k yields an empty result. (Slicing with
        ``[-top_k:]`` would turn top_k=0 into ``[-0:]``, i.e. *everything*.)
        """
        if top_k <= 0:
            return np.empty(0, dtype=np.intp)
        return np.argsort(similarities)[::-1][:top_k]

    def retrieve(self, query, top_k=10):
        """
        Retrieve top-k most relevant movies based on query

        Args:
            query: Free-text query string.
            top_k: Maximum number of movies to return; values <= 0 return an
                empty result.

        Returns:
            Copy of the matching knowledge-base rows, most similar first, with
            an added 'similarity_score' column.

        Raises:
            ValueError: If no knowledge base has been built or loaded.
        """
        if not self.is_built:
            raise ValueError("Knowledge base not built. Call build_knowledge_base() first.")

        # Encode query
        query_embedding = self.model.encode([query])

        # Calculate similarities (single query -> take the only row)
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Get top-k indices
        top_indices = self._top_k_indices(similarities, top_k)

        # Prepare results; iloc relies on embeddings and knowledge_base being
        # positionally aligned
        results = self.knowledge_base.iloc[top_indices].copy()
        results['similarity_score'] = similarities[top_indices]

        return results

    def recommend(self, query, top_k=10, rerank=True):
        """
        Full recommendation pipeline with optional reranking simulation

        Args:
            query: Free-text query string.
            top_k: Number of recommendations to return.
            rerank: When True, apply the heuristic reranking step to the
                retrieved candidates.

        Returns:
            DataFrame with at most top_k rows and a 'similarity_score' column.
        """
        # Step 1: Retrieval
        candidates = self.retrieve(query, top_k=top_k * 2)  # Get more for reranking

        if rerank:
            # Step 2: Simple reranking simulation (replace with actual LLM in future)
            candidates = self._simulate_llm_reranking(candidates, query, top_k)

        return candidates.head(top_k)

    def _simulate_llm_reranking(self, candidates, query, top_k):
        """
        Simulate LLM reranking with simple heuristics
        In real scenario, this would call an actual LLM API

        A candidate's score is multiplied by 1.2 when its genres contain a
        genre keyword that the query itself mentions, and by 1.1 when any
        whitespace-separated query word occurs (as a substring) in its title.
        Because the boosts multiply, resulting scores can exceed 1.0.

        Args:
            candidates: DataFrame with 'title', 'genres' and
                'similarity_score' columns. It is not modified.
            query: Free-text query string.
            top_k: Number of rows to keep after reranking.

        Returns:
            New DataFrame sorted by boosted score, limited to top_k rows.
        """
        query_lower = query.lower()
        query_words = query_lower.split()
        # Only the genres the query actually names may trigger the genre boost
        query_genres = [g for g in self._BOOST_GENRES if g in query_lower]

        boosts = []
        for title, genres in zip(candidates['title'], candidates['genres']):
            boost = 1.0

            # Boost if the movie has one of the genres named in the query
            genres_lower = str(genres).lower()
            if any(g in genres_lower for g in query_genres):
                boost *= 1.2

            # Boost if a query word appears in the title
            title_lower = str(title).lower()
            if any(word in title_lower for word in query_words):
                boost *= 1.1

            boosts.append(boost)

        # Work on a copy so the caller's frame keeps its original scores
        reranked = candidates.copy()
        reranked['similarity_score'] = (
            reranked['similarity_score'] * np.asarray(boosts, dtype=float)
        )

        # Rerank based on boosted scores
        reranked = reranked.sort_values('similarity_score', ascending=False)
        return reranked.head(top_k)

    def get_movie_by_id(self, movie_id):
        """
        Get movie details by ID

        Args:
            movie_id: Value compared against the 'movieId' column.

        Returns:
            DataFrame of matching rows (possibly empty), or None when no
            knowledge base has been built or loaded.
        """
        if not self.is_built:
            return None
        return self.knowledge_base[self.knowledge_base['movieId'] == movie_id]

# Smoke-test the RAG system end to end
if __name__ == "__main__":
    # Movie metadata; the 'title' and 'genres' columns feed the descriptions
    movies = pd.read_csv('data/ml-32m/movies.csv')

    rag = RAGRecommender()

    # Prefer the cached knowledge base; fall back to embedding from scratch
    if rag.load_knowledge_base() is False:
        rag.build_knowledge_base(movies)

    # A few free-text queries covering the genres the reranker knows about
    test_queries = [
        "action movies with comedy",
        "emotional drama films",
        "scary horror movies",
        "romantic comedy",
    ]

    for query in test_queries:
        print(f"\nQuery: '{query}'")
        recommendations = rag.recommend(query, top_k=3)

        # One line per recommendation: title | genres | score
        for movie in recommendations.to_dict('records'):
            print(f"  {movie['title']} | {movie['genres']} | Score: {movie['similarity_score']:.3f}")

Knowledge base loaded from knowledge_base.pkl

🎬 Query: 'action movies with comedy'
   📽 Action Point (2018) | Comedy | Score: 1.023
   📽 Project Space 13 (2021) | Comedy | Score: 0.840
   📽 Before Next Spring (2021) | Comedy | Score: 0.827

🎬 Query: 'emotional drama films'
   📽 Emotional Arithmetic (2007) | Drama | Score: 0.952
   📽 Teenage Emotions (2021) | Drama | Score: 0.918
   📽 7 Emotions (2018) | Comedy|Drama | Score: 0.897

🎬 Query: 'scary horror movies'
   📽 Scary Movie 4 (2006) | Comedy|Horror | Score: 1.035
   📽 American Scary (2006) | Comedy|Documentary|Horror | Score: 1.032
   📽 Scary Stories to Tell in the Dark (2019) | Horror | Score: 1.027

🎬 Query: 'romantic comedy'
   📽 The New Romantic (2018) | Comedy|Drama | Score: 1.039
   📽 Romantic Comedy (1983) | Comedy | Score: 0.987
   📽 Romantics, The (2010) | Comedy|Drama|Romance | Score: 0.963
