In [1]:
!pip install -q langchain langchain_community google-generativeai sentence-transformers faiss-cpu

In [None]:
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import google.generativeai as genai
from google.api_core import exceptions
import shutil
from config.api_keys import GEMINI_API_KEY

In [None]:
# Get your Gemini API key from: https://makersuite.google.com/app/apikey
# NOTE: the key is already imported from config.api_keys in the import cell;
# this self-assignment is a no-op placeholder for where a key would be set.
GEMINI_API_KEY = GEMINI_API_KEY

# Configure Gemini
genai.configure(api_key=GEMINI_API_KEY)

# Test Gemini connection
# Smoke test: sends one real prompt to the API. Any failure is printed rather
# than raised, so the remaining cells still run even if the key is rejected.
try:
    model = genai.GenerativeModel('gemini-2.5-flash')
    response = model.generate_content("Hello, are you working?")
    print("‚úÖ Gemini API connected successfully!")
    print(f"Test response: {response.text}")
except Exception as e:
    print(f"‚ùå Gemini API connection failed: {e}")
    print("Please check your API key and make sure it's valid")

‚úÖ Gemini API connected successfully!
Test response: Hello! Yes, you could say I am. As an AI, I'm always "on" and ready to assist by processing information and generating responses.

How can I help you today?


In [4]:
# Load the conversation dataset. If the CSV cannot be read for any reason
# (missing file, parse error, ...), fall back to a five-row sample so the
# rest of the notebook can still be exercised end to end.
#
# The previous version wrapped the read in a nested try with a bare `except:`
# that retried the exact same path; that retry could never succeed and the
# bare except also swallowed KeyboardInterrupt, so it has been removed.
try:
    df = pd.read_csv('/kaggle/input/chatbot-dataset/cleaned_conversations.csv')
    print(f"‚úÖ Dataset loaded! Size: {len(df)} conversations")
    print(f"Dataset columns: {df.columns.tolist()}")
except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")
    print("Creating sample dataset for testing...")
    # Create sample data if file not found
    df = pd.DataFrame({
        'input': [
            'What is artificial intelligence?',
            'Explain machine learning',
            'What is deep learning?',
            'How does neural network work?',
            'What is natural language processing?'
        ],
        'response': [
            'Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems.',
            'Machine learning is a subset of AI that enables computers to learn and make decisions from data without being explicitly programmed.',
            'Deep learning is a type of machine learning that uses neural networks with multiple layers to analyze various factors in data.',
            'Neural networks are computing systems inspired by biological neural networks that learn to perform tasks by considering examples.',
            'Natural language processing is a branch of AI that helps computers understand, interpret and manipulate human language.'
        ]
    })
    print("‚úÖ Sample dataset created for testing")

# Combine input and response for chunking.
# Missing cells are treated as empty strings: a NaN on either side would make
# the whole concatenation NaN, which is not valid text for the document and
# embedding steps that consume `combined_text`.
df['combined_text'] = (
    df['input'].fillna('').astype(str) + " " + df['response'].fillna('').astype(str)
)
print("Sample combined text:")
print(df['combined_text'].iloc[0][:200] + "...")

‚úÖ Dataset loaded! Size: 121838 conversations
Dataset columns: ['input', 'response']
Sample combined text:
hi getting ready cheetah chasing stay shape must fast hunting one favorite hobby...


In [5]:
# Wrap every conversation row in a LangChain Document: the combined text is
# the searchable content, while the original input/response pair and the row
# index travel along as metadata.
documents = [
    Document(
        page_content=row['combined_text'],
        metadata={
            'input': row['input'],
            'response': row['response'],
            'source': 'conversation_data',
            'id': idx,
        },
    )
    for idx, row in df.iterrows()
]

print(f"‚úÖ Created {len(documents)} documents")

# Character-based recursive splitter: tries paragraph breaks first, then line
# breaks, then spaces, and finally splits between characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
    chunk_size=1000,  # Larger chunks since we're not fine-tuning
    chunk_overlap=100,
)

# Break the documents into chunks and report the resulting counts
chunks = text_splitter.split_documents(documents)
print(f"‚úÖ Created {len(chunks)} chunks from {len(documents)} documents")
first_chunk_preview = chunks[0].page_content[:200]
print(f"Sample chunk: {first_chunk_preview}...")

‚úÖ Created 121838 documents
‚úÖ Created 121838 chunks from 121838 documents
Sample chunk: hi getting ready cheetah chasing stay shape must fast hunting one favorite hobby...


In [6]:
# Sentence-embedding model used to vectorise every chunk (pinned to CPU)
print("üîÑ Loading embedding model...")
embedding_model = HuggingFaceEmbeddings(
    model_kwargs={'device': 'cpu'},
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed all chunks and index them in an in-memory FAISS store
print("üîÑ Creating FAISS vector database...")
vector_db = FAISS.from_documents(chunks, embedding_model)

# Persist the index to disk so it can be reloaded without re-embedding
vector_db.save_local("vector_db/gemini_rag")
print("‚úÖ Vector database created and saved!")

# Quick retrieval sanity check: show the two nearest chunks for a sample query
query = "What is artificial intelligence?"
similar_docs = vector_db.similarity_search(query, k=2)
print(f"\nüîç Retrieval test for: '{query}'")
for rank, hit in enumerate(similar_docs, start=1):
    snippet = hit.page_content[:150]
    print(f"Result {rank}: {snippet}...")

üîÑ Loading embedding model...


  embedding_model = HuggingFaceEmbeddings(
2025-11-07 15:05:46.210581: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762527946.233070     137 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762527946.240054     137 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


üîÑ Creating FAISS vector database...
‚úÖ Vector database created and saved!

üîç Retrieval test for: 'What is artificial intelligence?'
Result 1: ai well time ever tried video game mortal combat...
Result 2: really robotics...


In [7]:
class GeminiRAGSystem:
    """Retrieval-augmented question answering backed by a vector store and Gemini.

    For each question the retriever fetches the most similar chunks from the
    vector store, the chunks (plus up to the last three prior exchanges) are
    placed into a prompt, and the Gemini model generates the answer.
    """

    def __init__(self, vector_db, gemini_api_key, model_name='gemini-2.5-flash', k=4):
        """Wire up the retriever and the Gemini model.

        Args:
            vector_db: Vector store exposing ``as_retriever`` (here a FAISS store).
            gemini_api_key: API key passed to ``genai.configure``. Note that this
                configures the ``genai`` module globally, not just this instance.
            model_name: Gemini model identifier to instantiate.
            k: Number of chunks the retriever returns per question.
        """
        self.vector_db = vector_db
        self.retriever = vector_db.as_retriever(search_kwargs={"k": k})
        genai.configure(api_key=gemini_api_key)
        self.model = genai.GenerativeModel(model_name)

    def get_context(self, question):
        """Retrieve relevant context from vector database.

        Args:
            question: Query text handed to the retriever.

        Returns:
            A ``(context, docs)`` tuple: ``context`` is the retrieved chunks'
            ``page_content`` joined by blank lines, ``docs`` is the list of
            retrieved documents themselves.
        """
        # Prefer `invoke`; `get_relevant_documents` is the older retriever entry
        # point (deprecated in recent LangChain releases) and is only used when
        # the retriever does not offer `invoke`.
        retrieve = getattr(self.retriever, "invoke", None)
        if retrieve is None:
            retrieve = self.retriever.get_relevant_documents
        docs = retrieve(question)
        context = "\n\n".join(doc.page_content for doc in docs)
        return context, docs

    def ask_question(self, question, conversation_history=None):
        """Ask question with RAG context.

        Args:
            question: The user's current question.
            conversation_history: Optional sequence of ``(question, answer)``
                pairs from earlier turns; only the last three are included in
                the prompt. ``None`` (the default) or an empty sequence means
                no history. The sequence is never modified.

        Returns:
            A dict with keys ``'question'``, ``'answer'``, ``'sources'`` (the
            retrieved documents) and ``'context_used'`` (the context, truncated
            to 500 characters plus ``"..."`` when longer). If generation fails,
            ``'answer'`` carries an error message and ``'sources'`` /
            ``'context_used'`` are empty.

        Raises:
            Whatever the retriever raises: retrieval happens outside the
            try/except that guards generation.
        """
        # Get relevant context
        context, source_docs = self.get_context(question)

        # Build conversation history (last 3 exchanges only)
        history_text = ""
        if conversation_history:
            history_text = "\nPrevious conversation:\n" + "".join(
                f"Q: {q}\nA: {a}\n" for q, a in conversation_history[-3:]
            )

        # Create enhanced prompt
        prompt = f"""Based on the following context and conversation history, please answer the question.

Context Information:
{context}
{history_text}
Current Question: {question}

Please provide a helpful and accurate answer based on the context provided. If the context doesn't contain enough information, you can use your general knowledge but please indicate this."""

        try:
            # Generate response using Gemini
            response = self.model.generate_content(prompt)
            answer = response.text

            return {
                'question': question,
                'answer': answer,
                'sources': source_docs,
                'context_used': context[:500] + "..." if len(context) > 500 else context
            }

        except exceptions.InvalidArgument:
            return {
                'question': question,
                'answer': "Error: Invalid API key or configuration. Please check your Gemini API key.",
                'sources': [],
                'context_used': ""
            }
        except Exception as e:
            # Best effort: report the failure in the answer instead of raising,
            # so interactive and batch callers keep running.
            return {
                'question': question,
                'answer': f"Error generating response: {str(e)}",
                'sources': [],
                'context_used': ""
            }

In [8]:
# Build the notebook-wide RAG helper from the FAISS store and API key defined
# in earlier cells; the helper functions below read the `gemini_rag` global.
print("üîÑ Initializing Gemini RAG System...")
gemini_rag = GeminiRAGSystem(vector_db, GEMINI_API_KEY)
print("‚úÖ Gemini RAG System ready!")

üîÑ Initializing Gemini RAG System...
‚úÖ Gemini RAG System ready!


In [9]:
def test_gemini_rag(question, conversation_history=[]):
    print(f"ü§î Question: {question}")
    print("‚è≥ Generating response...")
    
    result = gemini_rag.ask_question(question, conversation_history)
    
    print(f"ü§ñ Gemini Answer: {result['answer']}")
    print(f"\nüìö Context used: {result['context_used']}")
    print(f"\nüîç Sources retrieved ({len(result['sources'])}):")
    for i, doc in enumerate(result['sources']):
        print(f"Source {i+1}: {doc.page_content[:150]}...")
        print(f"   Metadata: {doc.metadata}\n")
    
    return result

# Test with single question
# Runs the helper once without conversation history; `test_question` and
# `result1` are kept as notebook globals and reused by the follow-up cell.
print("üß™ Testing Gemini RAG System...")
print("=" * 80)

test_question = "What is artificial intelligence?"
result1 = test_gemini_rag(test_question)

üß™ Testing Gemini RAG System...
ü§î Question: What is artificial intelligence?
‚è≥ Generating response...


  docs = self.retriever.get_relevant_documents(question)


ü§ñ Gemini Answer: Based on the provided context, "ai" is used as an abbreviation. The conversation also refers to "human like robot" in relation to Isaac Asimov's book series, suggesting that artificial intelligence could involve creating robots that mimic human capabilities or appearance.

However, the context doesn't offer a direct definition of what artificial intelligence is.

**Using general knowledge:** Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. It encompasses various fields like machine learning, natural language processing, computer vision, and robotics, aiming to enable machines to perform tasks that typically require human intellect, such as learning, problem-solving, and understanding.

üìö Context used: ai well time ever tried video game mortal combat

really robotics

meant robot lol hey cheating five word

heard science based yes book series isaac asimov human

In [10]:
print("=" * 80)

# Test with follow-up question (conversational)
# Depends on `test_question` and `result1` from the previous cell: the first
# exchange is passed as history so the follow-up is answered in context.
follow_up = "Can you explain machine learning too?"
conversation_history = [(test_question, result1['answer'])]
result2 = test_gemini_rag(follow_up, conversation_history)

ü§î Question: Can you explain machine learning too?
‚è≥ Generating response...
ü§ñ Gemini Answer: Based on the provided context and conversation history, there isn't enough information to explain "machine learning" directly. While the context mentions "learning curve" and "love learning computer creature," these don't define the technical concept of machine learning. The previous answer to "What is artificial intelligence?" did mention machine learning as a field encompassed by AI, but did not define it.

**Using general knowledge:**
Machine learning (ML) is a subfield of artificial intelligence (AI) that empowers computer systems to learn from data without being explicitly programmed. Instead of following pre-defined instructions for every possible scenario, ML algorithms are designed to analyze vast amounts of data, recognize patterns, and make predictions or decisions based on what they've learned.

In essence, it allows machines to "learn" from experience, much like humans do. Th

In [11]:
def interactive_chat():
    """Run a read-eval-print chat loop against the shared ``gemini_rag`` system.

    Commands (case-insensitive): ``quit`` ends the session, ``history`` lists
    every exchange so far (answers truncated to 100 characters). Blank input
    is ignored. Every other line is sent as a question together with the
    accumulated history, and the (question, answer) pair is then recorded.
    """
    print("\nüí¨ Starting Interactive Chat Mode!")
    print("Type 'quit' to exit, 'history' to see conversation history")
    print("-" * 50)

    exchanges = []

    while True:
        message = input("\nYou: ").strip()
        command = message.lower()

        if command == 'quit':
            print("Goodbye! üëã")
            return

        if command == 'history':
            print("\nüìú Conversation History:")
            for number, (asked, answered) in enumerate(exchanges, start=1):
                print(f"{number}. Q: {asked}")
                print(f"   A: {answered[:100]}...")
            continue

        if not message:
            continue

        print("‚è≥ Thinking...")
        reply = gemini_rag.ask_question(message, exchanges)

        print(f"\nü§ñ Gemini: {reply['answer']}")

        # Remember this turn so later questions are answered in context
        exchanges.append((message, reply['answer']))

        # Only mention sources when the retriever returned any
        retrieved = reply['sources']
        if retrieved:
            print(f"\nüìö Sources used: {len(retrieved)} relevant documents")

print("‚úÖ Interactive chat function ready!")
print("To start chatting, uncomment and run: interactive_chat()")

‚úÖ Interactive chat function ready!
To start chatting, uncomment and run: interactive_chat()


In [12]:
def batch_test_questions(questions):
    """Send each question to the shared ``gemini_rag`` system, one at a time.

    Questions are asked independently (no conversation history). For each one
    the question and the first 150 characters of its answer are printed.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one ``ask_question`` result dict per question, in order.
    """
    print("üß™ Batch Testing Questions...")
    print("=" * 60)

    collected = []
    for number, prompt in enumerate(questions, start=1):
        print(f"\n{number}. Question: {prompt}")
        outcome = gemini_rag.ask_question(prompt)
        preview = outcome['answer'][:150]
        print(f"   Answer: {preview}...")
        collected.append(outcome)

    return collected

# Test multiple questions
# Each question below is sent independently (no shared history), so this cell
# makes one Gemini API call per entry.
test_questions = [
    "What is artificial intelligence?",
    "Explain machine learning in simple terms",
    "How does deep learning work?",
    "What are the applications of AI?"
]

batch_results = batch_test_questions(test_questions)

üß™ Batch Testing Questions...

1. Question: What is artificial intelligence?
   Answer: The provided context doesn't directly define artificial intelligence.

Based on general knowledge, artificial intelligence (AI) is the simulation of h...

2. Question: Explain machine learning in simple terms
   Answer: Based on the context provided, which only indicates a desire to understand ("need help help understand", "want know trying understand understand wrote...

3. Question: How does deep learning work?
   Answer: Based on the context provided, there isn't enough information to explain how deep learning works. The conversation mentions "deep cool" and "really de...

4. Question: What are the applications of AI?
   Answer: Based on the provided context, the information regarding applications of AI is very limited. The conversation mentions "robotics" and gives a single, ...


In [13]:
# Bundle the whole `vector_db` directory into a single zip next to the notebook
print("üì¶ Saving vector database for future use...")
shutil.make_archive(
    base_name='gemini_rag_vector_db',
    format='zip',
    root_dir='vector_db',
)
print("‚úÖ Vector database saved as 'gemini_rag_vector_db.zip'")

üì¶ Saving vector database for future use...
‚úÖ Vector database saved as 'gemini_rag_vector_db.zip'


In [14]:
def load_gemini_rag_system(vector_db_path, gemini_api_key):
    """Load a saved Gemini RAG system.

    Re-creates the embedding model, loads the FAISS index previously written
    with ``save_local`` and wraps it in a ``GeminiRAGSystem``.

    Args:
        vector_db_path: Directory containing the saved FAISS index.
        gemini_api_key: API key forwarded to ``GeminiRAGSystem``.

    Returns:
        A ready-to-use ``GeminiRAGSystem`` instance.
    """
    print("üîÑ Loading saved Gemini RAG system...")

    # Must be the same embedding model the index was built with, otherwise
    # query vectors will not match the stored vectors.
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # SECURITY: the saved FAISS docstore is a pickle file, and unpickling can
    # execute arbitrary code. Current langchain_community versions refuse to
    # load it unless this flag is set explicitly. Only point this function at
    # an index you created yourself (such as the one saved by this notebook),
    # never at a file from an untrusted source.
    vector_db = FAISS.load_local(
        vector_db_path,
        embedding_model,
        allow_dangerous_deserialization=True,
    )

    # Create Gemini RAG system
    gemini_rag = GeminiRAGSystem(vector_db, gemini_api_key)
    print("‚úÖ Gemini RAG system loaded successfully!")
    return gemini_rag

print("‚úÖ Utility function created!")
print("To load system later, use: load_gemini_rag_system('vector_db/gemini_rag', GEMINI_API_KEY)")

‚úÖ Utility function created!
To load system later, use: load_gemini_rag_system('vector_db/gemini_rag', GEMINI_API_KEY)


In [15]:
# Closing summary: advantages, a cost note, the files produced and a usage
# snippet. Everything here is static text; nothing is computed.
print("\nüìä Performance Comparison")
print("=" * 50)
print("‚úÖ Advantages of Gemini API approach:")
advantages = [
    "Instant setup (no training required)",
    "No GPU needed during inference",
    "Much faster responses",
    "Access to Google's latest model",
    "Cost-effective for most use cases",
    "Easy to update knowledge",
    "Built-in safety features",
]
for advantage in advantages:
    print(f"   ‚Ä¢ {advantage}")

# NOTE(review): the figure below names "Gemini Pro", while this notebook calls
# 'gemini-2.5-flash' - confirm against current pricing before relying on it.
print("\nüí° Cost Note: Gemini Pro API costs ~$0.000125 per 1K characters")
print("   (Very affordable for most applications)")

print("\nüéâ Your Gemini RAG System is Ready!")
print("\nüìÅ Files created:")
created_files = [
    "Vector database: 'vector_db/gemini_rag/'",
    "Downloadable zip: 'gemini_rag_vector_db.zip'",
]
for created in created_files:
    print(f"   ‚Ä¢ {created}")

print("\nüîß Quick Usage Example:")
usage_example = """
# Ask a question
result = gemini_rag.ask_question("What is AI?")
print(result['answer'])

# Conversational chat
history = []
result1 = gemini_rag.ask_question("First question", history)
history.append(("First question", result1['answer']))
result2 = gemini_rag.ask_question("Follow-up question", history)
"""
print(usage_example)


üìä Performance Comparison
‚úÖ Advantages of Gemini API approach:
   ‚Ä¢ Instant setup (no training required)
   ‚Ä¢ No GPU needed during inference
   ‚Ä¢ Much faster responses
   ‚Ä¢ Access to Google's latest model
   ‚Ä¢ Cost-effective for most use cases
   ‚Ä¢ Easy to update knowledge
   ‚Ä¢ Built-in safety features

üí° Cost Note: Gemini Pro API costs ~$0.000125 per 1K characters
   (Very affordable for most applications)

üéâ Your Gemini RAG System is Ready!

üìÅ Files created:
   ‚Ä¢ Vector database: 'vector_db/gemini_rag/'
   ‚Ä¢ Downloadable zip: 'gemini_rag_vector_db.zip'

üîß Quick Usage Example:

# Ask a question
result = gemini_rag.ask_question("What is AI?")
print(result['answer'])

# Conversational chat
history = []
result1 = gemini_rag.ask_question("First question", history)
history.append(("First question", result1['answer']))
result2 = gemini_rag.ask_question("Follow-up question", history)

