In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🔍 Overview

The AI Study Buddy is a personalized learning assistant that leverages Google’s Gemini Flash 2.0 API, embeddings, and FAISS vector search to provide evidence-based, multilingual, and context-aware academic support. This assistant extracts knowledge from uploaded PDF study guides and provides intelligent, summarized responses to user queries — making learning interactive and efficient.

# **🧱 Project Architecture**

* 🗂️ Data Ingestion: PDF + web documents + notes
  
* 🧬 Embeddings: Convert content to vector form

* 🔤 Translation Utilities: Translate the input text into a specified target language

* 🧠 LLM Integration: Query + reasoning

* 🌐 Multilingual Support: EN/FR/HI/ES/AR using Gemini Flash 2.0




# 🧪 Setup & Environment

# 1. 🔧 Environment Setup
To ensure smooth execution within the Kaggle Notebook environment:

* Required libraries such as google-generativeai, faiss-cpu, PyPDF2, langdetect, and deep-translator are installed.

* kaggle_secrets is used to securely load the Gemini API key from the notebook environment.

In [None]:
!pip install -q google-generativeai faiss-cpu PyPDF2 langdetect deep-translator



# Import Libraries

In [None]:
import os
import faiss
import PyPDF2
import numpy as np
import google.generativeai as genai
from langdetect import detect
from deep_translator import GoogleTranslator
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import userdata  


# 🔑 API Key Setup

In [None]:
import os
from kaggle_secrets import UserSecretsClient

# Load the Gemini API key through kaggle_secrets instead of hard-coding it.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

# Publish the key in the process environment and give the same value to the SDK.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)

# Generative model used by generate_gemini_response() to answer questions.
model = genai.GenerativeModel("gemini-2.0-flash")


# 2. 📄 PDF Document Parsing

Uploaded study material (e.g., Machine learning mastery) is parsed using PyPDF2:

* The extracted raw text is segmented into overlapping chunks of ~1000 characters (200 characters of overlap by default).

* These chunks act as semantic units for embedding and search operations.

This allows the AI to work with large documents effectively while maintaining response accuracy.

In [None]:
# Path of the PDF study guide to ingest.
# NOTE(review): pdf_path is assigned twice in a row, so the second assignment
# overwrites the first and only the "MachineLearningMasteryforEngineers" PDF is
# read by the cells below. Keep just the path you actually want to use.
pdf_path = "/kaggle/input/machine-learning-algorithms/machine_learning_algorithms_from_scratch_sample.pdf"
pdf_path = "/kaggle/input/machine-learning-mastery-for-engineers/MachineLearningMasteryforEngineers.pdf"

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Page texts are concatenated in page order with no separator between them.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        return "".join(page_texts)

# Read the PDF selected by pdf_path and report how many characters were extracted.
raw_text = extract_text_from_pdf(pdf_path)
print("✅ PDF loaded. Length of text:", len(raw_text))

# 3. 📚 Split Text into Overlapping Chunks

* 🔹 Splits large text into smaller chunks.

* ✂️ Each chunk has a fixed **chunk_size** (default: 1000).

* 🔁 Overlaps chunks by **overlap** characters (default: 200).

* 📤 Returns a list of overlapping text segments.

In [None]:
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split ``text`` into consecutive chunks that share ``overlap`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters each chunk shares with the one before
            it. Must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. The list is empty when
        ``text`` is empty. The last chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a chunk reaches the end of the text. Stepping again would
        # only emit a shorter chunk fully contained in the one just added.
        if end >= text_length:
            break
        start += step
    return chunks

# Split the extracted text using chunk_text's default chunk_size and overlap.
chunks = chunk_text(raw_text)
print("✅ Total Chunks:", len(chunks))


# 🧠 4. Generate Embeddings + Vector Indexing (FAISS)

* 🧠 **embed_text():** Converts text to 768D embeddings using Gemini.

* 📦 **create_faiss_index():** Builds FAISS index and stores all chunk embeddings.

* ⚡ Enables fast semantic search over document chunks.

In [None]:

def embed_text(text, task_type="retrieval_document", model_name="models/embedding-001"):
    """Embed one piece of text with the Gemini embedding API.

    Args:
        text: The text to embed.
        task_type: Value forwarded as ``task_type`` to ``genai.embed_content``.
            The default matches the previous hard-coded behaviour; callers
            embedding a search query can pass ``"retrieval_query"`` instead.
        model_name: Embedding model forwarded as ``model`` to
            ``genai.embed_content``.

    Returns:
        A ``float32`` NumPy array of shape ``(1, dim)``, i.e. a single row
        vector, which is the layout FAISS expects for ``add``/``search``.
    """
    response = genai.embed_content(
        model=model_name,
        content=text,
        task_type=task_type,
    )
    # reshape(1, -1) turns the flat embedding into a one-row 2-D matrix.
    return np.asarray(response["embedding"], dtype=np.float32).reshape(1, -1)

def create_faiss_index(chunks):
    """Embed every chunk and load the vectors into a flat L2 FAISS index.

    Args:
        chunks: Iterable of text chunks. Each one is embedded with
            ``embed_text`` (one call per chunk).

    Returns:
        A tuple ``(index, embeddings)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per chunk, in chunk order,
        and ``embeddings`` is the stacked ``(n_chunks, dim)`` matrix.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to index.
    """
    chunks = list(chunks)
    if not chunks:
        raise ValueError("create_faiss_index() needs at least one text chunk to index")

    embeddings = np.vstack([embed_text(chunk) for chunk in chunks])

    # Size the index from the data instead of hard-coding 768, so a different
    # embedding model does not trigger a dimension mismatch.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    # A single batched add stores the vectors in the same order as chunks.
    index.add(embeddings)
    return index, embeddings

# Build the index over every chunk; this calls embed_text once per chunk.
faiss_index, chunk_embeddings = create_faiss_index(chunks)


# 5. 🌐🔤 Translation Utilities 

This code provides basic language detection and translation features:

* Creates a translator object using `GoogleTranslator` from the deep-translator package.

* Detects the language of a given text using langdetect.

* Translates the input text into a specified target language using Google Translate.



In [None]:
def detect_language(text):
    """Return whatever ``langdetect.detect`` reports for ``text``.

    Later cells compare the result against codes such as ``"en"`` or ``"fr"``.
    """
    detected = detect(text)
    return detected

def translate_text(text, target_lang):
    """Translate ``text`` into ``target_lang`` with deep-translator's GoogleTranslator.

    The source language is left to the translator (``source='auto'``).
    """
    translator = GoogleTranslator(source='auto', target=target_lang)
    return translator.translate(text)


# 6. 🧠🔎 Semantic Search + Answering with Gemini ✨

This code performs semantic search and generates answers using Gemini:

* Finds the top k most relevant text chunks to the user's query using FAISS similarity search.

* Builds a context-aware prompt with the retrieved chunks and sends it to the Gemini model to generate an informed answer.

✅ Enables intelligent Q&A by combining retrieval with LLM reasoning.

In [None]:
def search_similar_chunks(query, chunks, index, chunk_embeddings, k=3):
    """Return the text chunks whose embeddings are nearest to ``query``.

    Args:
        query: The question text; it is embedded with ``embed_text``.
        chunks: The chunk list the index was built from, in the same order.
        index: FAISS index to search.
        chunk_embeddings: Unused; kept so existing callers keep working.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunks, ordered as FAISS returned them. Fewer are
        returned when the index holds fewer than ``k`` vectors, and an empty
        list when ``k`` is not positive or the index is empty.
    """
    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vector = embed_text(query)
    _, neighbour_ids = index.search(query_vector, k)
    # FAISS fills missing neighbours with label -1. Skip those, otherwise
    # chunks[-1] would silently return the last chunk as a "match".
    return [chunks[int(i)] for i in neighbour_ids[0] if i != -1]

def generate_gemini_response(query, context_chunks):
    """Ask the Gemini model to answer ``query`` using ``context_chunks``.

    The prompt is an instruction line, then one numbered ``[Context N]``
    section per chunk, then the question. The text of the model's response
    is returned.
    """
    context_sections = [
        f"[Context {position}]: {chunk}\n\n"
        for position, chunk in enumerate(context_chunks, start=1)
    ]
    prompt = "".join(
        [
            "You are an AI Study Buddy. Answer the question based on the context:\n\n",
            *context_sections,
            f"Question: {query}\nAnswer:",
        ]
    )
    return model.generate_content(prompt).text

# 7. 🗣️🌍 Multilingual Question Answering with Gemini 
This block handles a full question-answering pipeline:

* Step 3: User Input
Takes the user's question (set in the `user_input` variable) in any supported language (English, French, Hindi, Spanish or Arabic).

* Step 4: Language Detection & Translation
Detects the input language and translates it to English if needed for processing.

* Step 5: Semantic Search + Answer Generation
Uses FAISS to find relevant content chunks and Gemini to generate an answer.

* Step 6: Translate Back
If the original question wasn’t in English, the answer is translated back to the user’s language.

✅ Delivers intelligent, context-aware answers in multiple languages.

In [None]:
# Step 3: Ask a question
user_input = "What are the different types of machine learning?"

# To try a Hindi question instead, uncomment the next line:
#user_input = "Machine learning ka konse konse bhag hain?"


# Step 4: Language check and translation
lang = detect_language(user_input)

# Fall back to English when the detected code is not a supported language
if lang not in ("en", "fr", "hi", "es", "ar"):
    lang = "en"

# Retrieval and answering are done in English, so translate other languages first
if lang == "en":
    translated_input = user_input
else:
    translated_input = translate_text(user_input, "en")

# Step 5: Semantic Search and Gemini Answer
top_chunks = search_similar_chunks(translated_input, chunks, faiss_index, chunk_embeddings)
answer = generate_gemini_response(translated_input, top_chunks)

# Step 6: Translate back to original language if needed
if lang == "en":
    final_answer = answer
else:
    final_answer = translate_text(answer, lang)

print("\n💬 AI Study Buddy says:\n", final_answer)


# Summary

**📘 AI Study Buddy – Smart PDF Question Answering**

AI Study Buddy is an intelligent assistant that helps students interactively learn from study guides in PDF format. It uses Google Gemini's language and embedding models to understand questions in English, French, Hindi, Spanish, or Arabic, and answers them contextually based on the uploaded material.

🔍 Core Features

* Multilingual support (English, French, Hindi, Spanish, Arabic)

* PDF ingestion + semantic chunking

* Embedding with embedding-001 + FAISS vector search

* Context-aware responses using Gemini Flash 2.0

* Translation handled seamlessly both ways

📦 How It Works

* PDF is loaded and chunked.

* Each chunk is embedded and stored in FAISS for semantic search.

* User's question is detected for language and translated to English if needed.

* Relevant chunks are retrieved and passed to Gemini Flash for response generation.

* Final answer is translated back to the original input language.

💡 Perfect for:

* Revising from notes

* Getting answers directly from large PDFs

* Non-English speaking students accessing global content

