<a href="https://colab.research.google.com/github/Prashant-Phuyal7/LANGCHAIN-CHAT-PDF/blob/main/AI_RAG_SYSTEM_Simple_Project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
 # ============================================================================
# SCHOOL AI RAG SYSTEM - COMPREHENSIVE IMPLEMENTATION WITH DETAILED COMMENTS
# ============================================================================
# This system demonstrates all 7 components of a School AI RAG System:
# 1. Data Sources  2. OCR+NLP  3. Embeddings  4. Databases
# 5. Features  6. RAG Pipeline  7. Dashboard & Notifications
# ============================================================================


In [2]:
 # INSTALLATION REQUIREMENTS

!pip install sentence-transformers chromadb pymongo pandas numpy scikit-learn transformers torch
!pip install pytesseract pillow nltk spacy
!python -m spacy download en_core_web_sm

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pymongo
  Downloading pymongo-4.15.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# IMPORT LIBRARIES
# ============================================================================
import pandas as pd              # For data manipulation and analysis
import numpy as np               # For numerical operations
from sentence_transformers import SentenceTransformer  # For creating text embeddings
import chromadb                  # Vector database for similarity search
from chromadb.config import Settings  # ChromaDB configuration
import nltk                      # Natural Language Processing toolkit
from nltk.corpus import stopwords      # For removing common words
from nltk.tokenize import word_tokenize # For breaking text into words
import spacy                     # Advanced NLP library for NER
import json                      # For JSON data handling
from datetime import datetime, timedelta  # For date/time operations
import random                    # For generating sample data
from typing import List, Dict    # For type hints
import re                        # For regular expressions

In [4]:
# Fetch the NLTK resources used by this notebook, in a fixed order:
# tokenizer models ('punkt'), stopword lists ('stopwords') and the
# 'punkt_tab' resource that was added because it was reported missing.
for nltk_resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# spaCy English pipeline, used later for Named Entity Recognition
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
# MAIN SCHOOL AI RAG SYSTEM CLASS
# ============================================================================

class SchoolAIRAG:
    """
    Main class that implements all components of School AI RAG System

    Components:
    - Data ingestion and preprocessing
    - Vector embeddings and storage
    - Q&A system with RAG
    - Student behavior tracking
    - Career guidance system
    - Health monitoring and alerts
    - Multi-role dashboards
    """
    def __init__(self):
        # Initialize embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize ChromaDB (Vector Database)
        self.chroma_client = chromadb.Client()
        # Check if collection exists and delete if it does
        try:
            self.chroma_client.get_collection(name="school_knowledge")
            self.chroma_client.delete_collection(name="school_knowledge")
            print("Deleted existing 'school_knowledge' collection.")
        except:
            print("'school_knowledge' collection not found, creating new one.")

        self.collection = self.chroma_client.create_collection(name="school_knowledge")


        # Initialize data storage (simulating MongoDB)
        self.student_db = {}
        self.attendance_db = {}
        self.health_db = {}

        # Stop words for preprocessing
        self.stop_words = set(stopwords.words('english'))

        print("School AI RAG System initialized successfully!")

    # 1. DATA SOURCES - Simulate various school data
    def create_sample_data(self):
        """Create sample school data"""

        # Sample textbook content
        textbook_content = [
            "Mathematics: Algebra involves working with variables and equations. Linear equations have the form y = mx + b.",
            "Science: Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll.",
            "History: king Tribhuwam is known as father of nation.",
            "English: Grammar rules include proper use of nouns, verbs, adjectives, and sentence structure.",
            "Physics: Newton's first law states that an object in motion stays in motion unless acted upon by an external force."
        ]

        # Sample student records
        students = [
            {"id": "S001", "name": "Ram ", "grade": 10, "subjects": ["Math", "Science", "English"]},
            {"id": "S002", "name": "Shyam", "grade": 10, "subjects": ["Math", "History", "Physics"]},
            {"id": "S003", "name": "Hari", "grade": 10, "subjects": ["Science", "English", "History"]}
        ]

        # Sample attendance data
        attendance_data = {}
        for student in students:
            attendance_data[student["id"]] = {
                "total_days": 100,
                "present_days": random.randint(85, 98),
                "last_week": [random.choice([0, 1]) for _ in range(7)]  # 1=present, 0=absent
            }

        # Sample health data
        health_data = {}
        for student in students:
            health_data[student["id"]] = {
                "height": random.randint(140, 180),
                "weight": random.randint(40, 80),
                "allergies": random.choice([[], ["Peanuts"], ["Dairy"], ["Dust"]]),
                "last_checkup": "2024-01-15"
            }

        return textbook_content, students, attendance_data, health_data

    # 2. OCR + NLP PREPROCESSING
    def preprocess_text(self, text: str) -> Dict:
        """OCR + NLP preprocessing pipeline"""

        # Simulate OCR (in real case, use pytesseract)
        # For demo, we assume text is already extracted

        # Tokenization using spaCy
        doc = nlp(text)
        tokens = [token.text.lower() for token in doc]


        # Remove stopwords and non-alphabetic tokens
        filtered_tokens = [word for word in tokens if word not in self.stop_words and word.isalpha()]

        # Named Entity Recognition
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        return {
            "original_text": text,
            "tokens": filtered_tokens,
            "entities": entities,
            "processed_text": " ".join(filtered_tokens)
        }

    # 3. EMBEDDINGS
    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Create embeddings using sentence transformers"""
        return self.embedding_model.encode(texts)

    # 4. DATABASE OPERATIONS
    def store_knowledge(self, documents: List[str]):
        """Store documents in vector database"""

        # Preprocess documents
        processed_docs = []
        for i, doc in enumerate(documents):
            processed = self.preprocess_text(doc)
            processed_docs.append(processed["processed_text"])

        # Create embeddings
        embeddings = self.create_embeddings(processed_docs)

        # Store in ChromaDB
        ids = [f"doc_{i}" for i in range(len(documents))]

        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=documents,
            ids=ids
        )

        print(f"Stored {len(documents)} documents in knowledge base")

    def store_student_data(self, students, attendance, health):
        """Store structured data (simulating MongoDB)"""
        for student in students:
            self.student_db[student["id"]] = student

        self.attendance_db = attendance
        self.health_db = health

        print(f"Stored data for {len(students)} students")

    # 5. FEATURES IMPLEMENTATION
    def qa_system(self, question: str, top_k: int = 3) -> str:
        """Q&A system using RAG"""

        # Create embedding for question
        question_embedding = self.embedding_model.encode([question])

        # Retrieve relevant documents
        results = self.collection.query(
            query_embeddings=question_embedding.tolist(),
            n_results=top_k
        )

        # Simple generation (in real case, use LLM like GPT)
        relevant_docs = results['documents'][0]

        # Generate response based on retrieved documents
        response = f"Based on the curriculum: {' '.join(relevant_docs[:2])}"

        return response

    def behavior_tracking(self, student_id: str) -> Dict:
        """Track student behavior patterns"""
        if student_id not in self.attendance_db:
            return {"error": "Student not found"}

        attendance = self.attendance_db[student_id]
        attendance_rate = (attendance["present_days"] / attendance["total_days"]) * 100

        # Simple behavior analysis
        recent_attendance = sum(attendance["last_week"])

        behavior_score = (attendance_rate * 0.7) + (recent_attendance * 10 * 0.3)

        if behavior_score >= 90:
            status = "Excellent"
        elif behavior_score >= 75:
            status = "Good"
        elif behavior_score >= 60:
            status = "Needs Improvement"
        else:
            status = "At Risk"

        return {
            "student_id": student_id,
            "attendance_rate": round(attendance_rate, 2),
            "recent_performance": recent_attendance,
            "behavior_score": round(behavior_score, 2),
            "status": status
        }

    def career_guidance(self, student_id: str) -> Dict:
        """Provide career guidance based on student data"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        subjects = student["subjects"]

        # Simple career mapping
        career_map = {
            ("Math", "Physics"): ["Engineering", "Data Science", "Physics Research"],
            ("Science", "Math"): ["Medicine", "Biotechnology", "Research"],
            ("English", "History"): ["Literature", "Journalism", "Education"],
            ("Science", "English"): ["Science Communication", "Medical Writing"]
        }

        recommendations = []
        for subject_combo, careers in career_map.items():
            if all(subj in subjects for subj in subject_combo):
                recommendations.extend(careers)

        if not recommendations:
            recommendations = ["Explore interdisciplinary fields", "Consider your interests"]

        return {
            "student_id": student_id,
            "current_subjects": subjects,
            "career_recommendations": recommendations[:3]
        }

    def health_alerts(self, student_id: str) -> Dict:
        """Generate health alerts and recommendations"""
        if student_id not in self.health_db:
            return {"error": "Student not found"}

        health = self.health_db[student_id]
        alerts = []

        # BMI calculation
        height_m = health["height"] / 100
        bmi = health["weight"] / (height_m ** 2)

        if bmi < 18.5:
            alerts.append("BMI indicates underweight - consult nutritionist")
        elif bmi > 25:
            alerts.append("BMI indicates overweight - consider exercise plan")

        # Allergy alerts
        if health["allergies"]:
            alerts.append(f"Allergies: {', '.join(health['allergies'])} - ensure cafeteria awareness")

        # Checkup reminder
        last_checkup = datetime.strptime(health["last_checkup"], "%Y-%m-%d")
        if (datetime.now() - last_checkup).days > 365:
            alerts.append("Annual health checkup overdue")

        return {
            "student_id": student_id,
            "bmi": round(bmi, 2),
            "alerts": alerts if alerts else ["No health alerts"]
        }

    # 6. RAG PIPELINE
    def rag_pipeline(self, query: str) -> Dict:
        """Complete RAG pipeline: Retrieve + Generate"""

        # Retrieve
        question_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=question_embedding.tolist(),
            n_results=3
        )

        retrieved_docs = results['documents'][0]

        # Generate (simplified - in real case, use proper LLM)
        context = " ".join(retrieved_docs)
        response = f"Based on the school curriculum and data:\n\nQuery: {query}\n\nAnswer: {context}\n\nThis information comes from our educational database."

        return {
            "query": query,
            "retrieved_documents": retrieved_docs,
            "generated_response": response
        }

    # 7. DASHBOARD FUNCTIONS
    def teacher_dashboard(self) -> Dict:
        """Generate teacher dashboard data"""
        total_students = len(self.student_db)

        # Calculate class statistics
        attendance_rates = []
        for student_id in self.attendance_db:
            rate = (self.attendance_db[student_id]["present_days"] /
                   self.attendance_db[student_id]["total_days"]) * 100
            attendance_rates.append(rate)

        avg_attendance = sum(attendance_rates) / len(attendance_rates) if attendance_rates else 0

        return {
            "total_students": total_students,
            "average_attendance": round(avg_attendance, 2),
            "students_at_risk": sum(1 for rate in attendance_rates if rate < 75),
            "recent_alerts": ["Student S001 missed 3 days this week", "Health checkup due for S002"]
        }

    def parent_dashboard(self, student_id: str) -> Dict:
        """Generate parent dashboard for specific student"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        behavior = self.behavior_tracking(student_id)
        health = self.health_alerts(student_id)

        return {
            "student_name": student["name"],
            "grade": student["grade"],
            "attendance_rate": behavior.get("attendance_rate", 0),
            "behavior_status": behavior.get("status", "Unknown"),
            "health_alerts": health.get("alerts", []),
            "subjects": student["subjects"]
        }

    def student_dashboard(self, student_id: str) -> Dict:
        """Generate student dashboard"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        career = self.career_guidance(student_id)

        return {
            "welcome_message": f"Hello {student['name']}!",
            "grade": student["grade"],
            "subjects": student["subjects"],
            "career_suggestions": career.get("career_recommendations", []),
            "motivational_tip": "Stay curious and keep learning! 🌟"
        }

# DEMONSTRATION
def main():
    """Run the end-to-end demo: build the system, load sample data, exercise each feature.

    Everything is printed to stdout; nothing is returned.
    """

    def show_json(payload):
        # ensure_ascii=False keeps non-ASCII text (such as the emoji in the
        # student dashboard tip) readable instead of \uXXXX escape sequences.
        print(json.dumps(payload, indent=2, ensure_ascii=False))

    print("=== School AI RAG System Demo ===\n")

    # Initialize system
    school_ai = SchoolAIRAG()

    # Create and load sample data
    print("1. Loading sample data...")
    textbook_content, students, attendance_data, health_data = school_ai.create_sample_data()

    # Store knowledge and student data
    school_ai.store_knowledge(textbook_content)
    school_ai.store_student_data(students, attendance_data, health_data)

    print("\n2. Testing Q&A System:")
    question = "What is photosynthesis?"
    answer = school_ai.qa_system(question)
    print(f"Q: {question}")
    print(f"A: {answer}")

    print("\n3. Testing Behavior Tracking:")
    show_json(school_ai.behavior_tracking("S001"))

    print("\n4. Testing Career Guidance:")
    show_json(school_ai.career_guidance("S002"))

    print("\n5. Testing Health Alerts:")
    show_json(school_ai.health_alerts("S001"))

    print("\n6. Testing RAG Pipeline:")
    rag_result = school_ai.rag_pipeline("Tell me about algebra")
    print(f"Query: {rag_result['query']}")
    print(f"Response: {rag_result['generated_response']}")

    print("\n7. Teacher Dashboard:")
    show_json(school_ai.teacher_dashboard())

    print("\n8. Parent Dashboard:")
    show_json(school_ai.parent_dashboard("S001"))

    print("\n9. Student Dashboard:")
    show_json(school_ai.student_dashboard("S001"))

    print("\n=== Demo completed successfully! ===")

if __name__ == "__main__":
    main()

=== School AI RAG System Demo ===



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

'school_knowledge' collection not found, creating new one.
School AI RAG System initialized successfully!
1. Loading sample data...
Stored 5 documents in knowledge base
Stored data for 3 students

2. Testing Q&A System:
Q: What is photosynthesis?
A: Based on the curriculum: Science: Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll. Physics: Newton's first law states that an object in motion stays in motion unless acted upon by an external force.

3. Testing Behavior Tracking:
{
  "student_id": "S001",
  "attendance_rate": 87.0,
  "recent_performance": 5,
  "behavior_score": 75.9,
  "status": "Good"
}

4. Testing Career Guidance:
{
  "student_id": "S002",
  "current_subjects": [
    "Math",
    "History",
    "Physics"
  ],
  "career_recommendations": [
    "Engineering",
    "Data Science",
    "Physics Research"
  ]
}

5. Testing Health Alerts:
{
  "student_id": "S001",
  "bmi": 18.99,
  "alerts": [
    "Allergies: Dairy - ensure cafeteria a

In [6]:
# School AI RAG System - Compact Implementation for Google Colab
# Install required packages (run this cell first)

!pip install sentence-transformers chromadb pymongo pandas numpy scikit-learn transformers torch
!pip install pytesseract pillow nltk spacy
!python -m spacy download en_core_web_sm


import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import json
from datetime import datetime, timedelta
import random
from typing import List, Dict
import re

# Download NLTK data. 'punkt_tab' is included because word_tokenize (used in
# preprocess_text below) needs it on current NLTK releases; without it this
# cell only works if another cell already downloaded the resource.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

class SchoolAIRAG:
    def __init__(self):
        # Initialize embedding model
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize ChromaDB (Vector Database)
        self.chroma_client = chromadb.Client()
        # Check if collection exists and delete if it does
        try:
            self.chroma_client.get_collection(name="school_knowledge")
            self.chroma_client.delete_collection(name="school_knowledge")
            print("Deleted existing 'school_knowledge' collection.")
        except:
            print("'school_knowledge' collection not found, creating new one.")

        self.collection = self.chroma_client.create_collection(name="school_knowledge")

        # Initialize data storage (simulating MongoDB)
        self.student_db = {}
        self.attendance_db = {}
        self.health_db = {}

        # Stop words for preprocessing
        self.stop_words = set(stopwords.words('english'))

        print("School AI RAG System initialized successfully!")

    # 1. DATA SOURCES - Simulate various school data
    def create_sample_data(self):
        """Create sample school data"""

        # Sample textbook content
        textbook_content = [
            "Mathematics: Algebra involves working with variables and equations. Linear equations have the form y = mx + b.",
            "Science: Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll.",
            "History: The Renaissance was a period of cultural rebirth in Europe from 14th to 17th century.",
            "English: Grammar rules include proper use of nouns, verbs, adjectives, and sentence structure.",
            "Physics: Newton's first law states that an object in motion stays in motion unless acted upon by an external force."
        ]

        # Sample student records
        students = [
            {"id": "S001", "name": "Ram", "grade": 10, "subjects": ["Math", "Science", "English"]},
            {"id": "S002", "name": "Sam", "grade": 10, "subjects": ["Math", "History", "Physics"]},
            {"id": "S003", "name": "Hari", "grade": 10, "subjects": ["Science", "English", "History"]}
        ]

        # Sample attendance data
        attendance_data = {}
        for student in students:
            attendance_data[student["id"]] = {
                "total_days": 100,
                "present_days": random.randint(85, 98),
                "last_week": [random.choice([0, 1]) for _ in range(7)]  # 1=present, 0=absent
            }

        # Sample health data
        health_data = {}
        for student in students:
            health_data[student["id"]] = {
                "height": random.randint(140, 180),
                "weight": random.randint(40, 80),
                "allergies": random.choice([[], ["Peanuts"], ["Dairy"], ["Dust"]]),
                "last_checkup": "2024-01-15"
            }

        return textbook_content, students, attendance_data, health_data

    # 2. OCR + NLP PREPROCESSING
    def preprocess_text(self, text: str) -> Dict:
        """OCR + NLP preprocessing pipeline"""

        # Simulate OCR (in real case, use pytesseract)
        # For demo, we assume text is already extracted

        # Tokenization
        tokens = word_tokenize(text.lower())

        # Remove stopwords
        filtered_tokens = [word for word in tokens if word not in self.stop_words and word.isalpha()]

        # Named Entity Recognition
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        return {
            "original_text": text,
            "tokens": filtered_tokens,
            "entities": entities,
            "processed_text": " ".join(filtered_tokens)
        }

    # 3. EMBEDDINGS
    def create_embeddings(self, texts: List[str]) -> np.ndarray:
        """Create embeddings using sentence transformers"""
        return self.embedding_model.encode(texts)

    # 4. DATABASE OPERATIONS
    def store_knowledge(self, documents: List[str]):
        """Store documents in vector database"""

        # Preprocess documents
        processed_docs = []
        for i, doc in enumerate(documents):
            processed = self.preprocess_text(doc)
            processed_docs.append(processed["processed_text"])

        # Create embeddings
        embeddings = self.create_embeddings(processed_docs)

        # Store in ChromaDB
        ids = [f"doc_{i}" for i in range(len(documents))]

        self.collection.add(
            embeddings=embeddings.tolist(),
            documents=documents,
            ids=ids
        )

        print(f"Stored {len(documents)} documents in knowledge base")

    def store_student_data(self, students, attendance, health):
        """Store structured data (simulating MongoDB)"""
        for student in students:
            self.student_db[student["id"]] = student

        self.attendance_db = attendance
        self.health_db = health

        print(f"Stored data for {len(students)} students")

    # 5. FEATURES IMPLEMENTATION
    def qa_system(self, question: str, top_k: int = 3) -> str:
        """Q&A system using RAG"""

        # Create embedding for question
        question_embedding = self.embedding_model.encode([question])

        # Retrieve relevant documents
        results = self.collection.query(
            query_embeddings=question_embedding.tolist(),
            n_results=top_k
        )

        # Simple generation (in real case, use LLM like GPT)
        relevant_docs = results['documents'][0]

        # Generate response based on retrieved documents
        response = f"Based on the curriculum: {' '.join(relevant_docs[:2])}"

        return response

    def behavior_tracking(self, student_id: str) -> Dict:
        """Track student behavior patterns"""
        if student_id not in self.attendance_db:
            return {"error": "Student not found"}

        attendance = self.attendance_db[student_id]
        attendance_rate = (attendance["present_days"] / attendance["total_days"]) * 100

        # Simple behavior analysis
        recent_attendance = sum(attendance["last_week"])

        behavior_score = (attendance_rate * 0.7) + (recent_attendance * 10 * 0.3)

        if behavior_score >= 90:
            status = "Excellent"
        elif behavior_score >= 75:
            status = "Good"
        elif behavior_score >= 60:
            status = "Needs Improvement"
        else:
            status = "At Risk"

        return {
            "student_id": student_id,
            "attendance_rate": round(attendance_rate, 2),
            "recent_performance": recent_attendance,
            "behavior_score": round(behavior_score, 2),
            "status": status
        }

    def career_guidance(self, student_id: str) -> Dict:
        """Provide career guidance based on student data"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        subjects = student["subjects"]

        # Simple career mapping
        career_map = {
            ("Math", "Physics"): ["Engineering", "Data Science", "Physics Research"],
            ("Science", "Math"): ["Medicine", "Biotechnology", "Research"],
            ("English", "History"): ["Literature", "Journalism", "Education"],
            ("Science", "English"): ["Science Communication", "Medical Writing"]
        }

        recommendations = []
        for subject_combo, careers in career_map.items():
            if all(subj in subjects for subj in subject_combo):
                recommendations.extend(careers)

        if not recommendations:
            recommendations = ["Explore interdisciplinary fields", "Consider your interests"]

        return {
            "student_id": student_id,
            "current_subjects": subjects,
            "career_recommendations": recommendations[:3]
        }

    def health_alerts(self, student_id: str) -> Dict:
        """Generate health alerts and recommendations"""
        if student_id not in self.health_db:
            return {"error": "Student not found"}

        health = self.health_db[student_id]
        alerts = []

        # BMI calculation
        height_m = health["height"] / 100
        bmi = health["weight"] / (height_m ** 2)

        if bmi < 18.5:
            alerts.append("BMI indicates underweight - consult nutritionist")
        elif bmi > 25:
            alerts.append("BMI indicates overweight - consider exercise plan")

        # Allergy alerts
        if health["allergies"]:
            alerts.append(f"Allergies: {', '.join(health['allergies'])} - ensure cafeteria awareness")

        # Checkup reminder
        last_checkup = datetime.strptime(health["last_checkup"], "%Y-%m-%d")
        if (datetime.now() - last_checkup).days > 365:
            alerts.append("Annual health checkup overdue")

        return {
            "student_id": student_id,
            "bmi": round(bmi, 2),
            "alerts": alerts if alerts else ["No health alerts"]
        }

    # 6. RAG PIPELINE
    def rag_pipeline(self, query: str) -> Dict:
        """Complete RAG pipeline: Retrieve + Generate"""

        # Retrieve
        question_embedding = self.embedding_model.encode([query])
        results = self.collection.query(
            query_embeddings=question_embedding.tolist(),
            n_results=3
        )

        retrieved_docs = results['documents'][0]

        # Generate (simplified - in real case, use proper LLM)
        context = " ".join(retrieved_docs)
        response = f"Based on the school curriculum and data:\n\nQuery: {query}\n\nAnswer: {context}\n\nThis information comes from our educational database."

        return {
            "query": query,
            "retrieved_documents": retrieved_docs,
            "generated_response": response
        }

    # 7. DASHBOARD FUNCTIONS
    def teacher_dashboard(self) -> Dict:
        """Generate teacher dashboard data"""
        total_students = len(self.student_db)

        # Calculate class statistics
        attendance_rates = []
        for student_id in self.attendance_db:
            rate = (self.attendance_db[student_id]["present_days"] /
                   self.attendance_db[student_id]["total_days"]) * 100
            attendance_rates.append(rate)

        avg_attendance = sum(attendance_rates) / len(attendance_rates) if attendance_rates else 0

        return {
            "total_students": total_students,
            "average_attendance": round(avg_attendance, 2),
            "students_at_risk": sum(1 for rate in attendance_rates if rate < 75),
            "recent_alerts": ["Student S001 missed 3 days this week", "Health checkup due for S002"]
        }

    def parent_dashboard(self, student_id: str) -> Dict:
        """Generate parent dashboard for specific student"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        behavior = self.behavior_tracking(student_id)
        health = self.health_alerts(student_id)

        return {
            "student_name": student["name"],
            "grade": student["grade"],
            "attendance_rate": behavior.get("attendance_rate", 0),
            "behavior_status": behavior.get("status", "Unknown"),
            "health_alerts": health.get("alerts", []),
            "subjects": student["subjects"]
        }

    def student_dashboard(self, student_id: str) -> Dict:
        """Generate student dashboard"""
        if student_id not in self.student_db:
            return {"error": "Student not found"}

        student = self.student_db[student_id]
        career = self.career_guidance(student_id)

        return {
            "welcome_message": f"Hello {student['name']}!",
            "grade": student["grade"],
            "subjects": student["subjects"],
            "career_suggestions": career.get("career_recommendations", []),
            "motivational_tip": "Stay curious and keep learning! 🌟"
        }

# DEMONSTRATION
def main():
    """Run the end-to-end demo of the School AI RAG system.

    Creates a ``SchoolAIRAG`` instance, passes its sample data to
    ``store_knowledge`` and ``store_student_data``, then walks through the
    numbered steps (Q&A, behavior tracking, career guidance, health alerts,
    the RAG pipeline and the three dashboards), printing every result to
    stdout.
    """

    def show_json(heading, fetch):
        # The heading is printed before ``fetch`` runs so that anything the
        # call itself prints appears underneath its section title.
        print(heading)
        print(json.dumps(fetch(), indent=2))

    print("=== School AI RAG System Demo ===\n")

    # Bring the system up
    rag_system = SchoolAIRAG()

    # Generate the sample data set and hand it to the storage methods
    print("1. Loading sample data...")
    (textbook_content,
     students,
     attendance_data,
     health_data) = rag_system.create_sample_data()
    rag_system.store_knowledge(textbook_content)
    rag_system.store_student_data(students, attendance_data, health_data)

    print("\n2. Testing Q&A System:")
    question = "What is photosynthesis?"
    answer = rag_system.qa_system(question)
    print(f"Q: {question}")
    print(f"A: {answer}")

    show_json("\n3. Testing Behavior Tracking:",
              lambda: rag_system.behavior_tracking("S001"))
    show_json("\n4. Testing Career Guidance:",
              lambda: rag_system.career_guidance("S002"))
    show_json("\n5. Testing Health Alerts:",
              lambda: rag_system.health_alerts("S001"))

    # The RAG step prints two selected fields rather than the whole result
    print("\n6. Testing RAG Pipeline:")
    rag_result = rag_system.rag_pipeline("Tell me about algebra")
    print(f"Query: {rag_result['query']}")
    print(f"Response: {rag_result['generated_response']}")

    show_json("\n7. Teacher Dashboard:",
              lambda: rag_system.teacher_dashboard())
    show_json("\n8. Parent Dashboard:",
              lambda: rag_system.parent_dashboard("S001"))
    show_json("\n9. Student Dashboard:",
              lambda: rag_system.student_dashboard("S001"))

    print("\n=== Demo completed successfully! ===")

# Entry-point guard: start the demo only when this code is executed directly
# rather than imported. A notebook cell runs with __name__ == "__main__", so
# executing this cell runs main() and produces the demo output.
if __name__ == "__main__":
    main()

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== School AI RAG System Demo ===

Deleted existing 'school_knowledge' collection.
School AI RAG System initialized successfully!
1. Loading sample data...
Stored 5 documents in knowledge base
Stored data for 3 students

2. Testing Q&A System:
Q: What is photosynthesis?
A: Based on the curriculum: Science: Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll. Physics: Newton's first law states that an object in motion stays in motion unless acted upon by an external force.

3. Testing Behavior Tracking:
{
  "student_id": "S001",
  "attendance_rate": 89.0,
  "recent_performance": 7,
  "behavior_score": 83.3,
  "status": "Good"
}

4. Testing Career Guidance:
{
  "student_id": "S002",
  "current_subjects": [
    "Math",
    "History",
    "Physics"
  ],
  "career_recommendations": [
    "Engineering",
    "Data Science",
    "Physics Research"
  ]
}

5. Testing Health Alerts:
{
  "student_id": "S001",
  "bmi": 31.61,
  "alerts": [
    "BMI indicates