In [1]:
# --- Imports
import json
import os
import random
import sqlite3
import time
import uuid
from datetime import datetime, timedelta

import pandas as pd
import requests
from bs4 import BeautifulSoup
from faker import Faker

# --- Reproducibility: seed the random sources used by the generators below.
# uuid.uuid4() is not affected by these seeds, so generated ids still differ per run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# --- Initialize Faker with Nigerian locale
fake = Faker("en_NG")

# --- Ensure data folder exists
os.makedirs("../data", exist_ok=True)

# --- Helper
def gen_id():
    """Return a newly generated random UUID (uuid4) in its string form."""
    new_uuid = uuid.uuid4()
    return str(new_uuid)


In [None]:
# -- Generate Synthetic Schools Data
regions = ["North Central", "North East", "North West", "South East", "South South", "South West"]
device_types = ["tablet", "laptop", "desktop"]
connectivity_levels = ["offline", "intermittent", "online"]

# Common Nigerian school name patterns.
# Named placeholders make it explicit which patterns are filled with a person
# name ({name}) and which with a city ({city}); with positional "{}" fields
# the city argument was silently ignored by str.format.
school_patterns = [
    "{name} College",
    "{name} Grammar School",
    "{name} Secondary School",
    "{name} Academy",
    "St. {name} High School",
    "{name} Comprehensive School",
    "Command Secondary School, {city}",
    "Federal Government College, {city}",
    "{name} Memorial School"
]

# Generate synthetic schools
schools = []
for _ in range(372):
    # ~60% of schools are named after a first name, the rest after a last name
    name_base = fake.first_name() if random.random() > 0.4 else fake.last_name()
    region = random.choice(regions)
    city = fake.city()

    # Select a pattern and fill with Nigerian context
    pattern = random.choice(school_patterns)
    school_name = pattern.format(name=name_base, city=city)

    # Adjust connectivity bias (simulate rural/urban).
    # Weights are in the order of connectivity_levels: offline, intermittent, online.
    if region in ["North East", "North West"]:
        connectivity = random.choices(connectivity_levels, weights=[0.6, 0.3, 0.1])[0]
    elif region in ["South East", "South South"]:
        connectivity = random.choices(connectivity_levels, weights=[0.3, 0.4, 0.3])[0]
    else:
        connectivity = random.choices(connectivity_levels, weights=[0.2, 0.3, 0.5])[0]

    schools.append({
        "id": gen_id(),
        "name": school_name,
        "region": region,
        "device_type": random.choice(device_types),
        "connectivity": connectivity
    })

df_schools = pd.DataFrame(schools)
df_schools.to_csv("../data/schools.csv", index=False)

print(f"Generated {len(df_schools)} synthetic schools.")
df_schools.sample(10)

In [None]:
tech_levels = ["low", "medium", "high"]

# helper: generate Nigerian phone number
def gen_nigerian_phone():
    """Build a random phone number string: "+234" followed by ten digits.

    A three-digit prefix is picked from a fixed list, its leading "0" is
    dropped, and eight random digits are appended.
    """
    network_prefix = random.choice(["080", "081", "070", "090", "091"])
    subscriber_digits = []
    for _ in range(8):
        subscriber_digits.append(str(random.randint(0, 9)))
    # The leading "0" of the prefix is not kept after the +234 country code.
    return "+234" + network_prefix[1:] + "".join(subscriber_digits)

# language bias by region
region_lang_map = {
    "North Central": ["English", "Hausa"],
    "North East": ["English", "Hausa"],
    "North West": ["English", "Hausa"],
    "South East": ["English", "Igbo"],
    "South South": ["English", "Pidgin", "Igbo"],
    "South West": ["English", "Yoruba"]
}

# Convert the schools frame to plain dicts once; picking from this list with
# random.choice avoids a DataFrame.sample() call on each of the 40,000 iterations.
school_records = df_schools.to_dict("records")

teachers = []
for _ in range(40000):
    school_row = random.choice(school_records)
    region = school_row["region"]

    name = fake.name()
    name_parts = name.split()
    first = name_parts[0].lower()
    last = name_parts[-1].lower()
    # NOTE(review): built from name tokens only, so two teachers with the same
    # first/last token get the same address.
    email = f"{first}.{last}@eduai.ng"

    # bias tech level based on connectivity
    # (weights follow the order of tech_levels: low, medium, high)
    if school_row["connectivity"] == "online":
        tech_level = random.choices(tech_levels, weights=[0.1, 0.4, 0.5])[0]
    elif school_row["connectivity"] == "intermittent":
        tech_level = random.choices(tech_levels, weights=[0.3, 0.5, 0.2])[0]
    else:
        tech_level = random.choices(tech_levels, weights=[0.6, 0.3, 0.1])[0]

    # Each teacher gets a random non-empty subset of the region's languages.
    region_languages = region_lang_map[region]

    teachers.append({
        "id": gen_id(),
        "name": name,
        "email": email,
        "phone": gen_nigerian_phone(),
        "school_id": school_row["id"],
        "languages": json.dumps(random.sample(region_languages, k=random.randint(1, len(region_languages)))),
        "tech_level": tech_level,
        "created_at": fake.date_time_this_year()
    })

df_teachers = pd.DataFrame(teachers)
df_teachers.to_csv("../data/teachers.csv", index=False)

print(f"Generated {len(df_teachers)} synthetic teachers.")
df_teachers.sample(10)


In [None]:

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# === setup ===
# Folder that receives the downloaded curriculum PDFs.
save_dir = "../data/curriculum_pdfs"
os.makedirs(save_dir, exist_ok=True)

# Chrome command-line flags applied to the browser session.
chrome_options = Options()
for chrome_flag in ("--headless=new", "--no-sandbox", "--disable-gpu", "--window-size=1920,1080"):
    chrome_options.add_argument(chrome_flag)

# webdriver_manager supplies the chromedriver path handed to Selenium.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# Base used to resolve relative links found on the listing pages.
base_url = "https://nerdc.gov.ng/content_manager/"

# Listing pages to crawl, keyed by a short label.
urls = {
    "pri1-3": "https://nerdc.gov.ng/content_manager/pri1-3.html",
    "pri4-6": "https://nerdc.gov.ng/content_manager/pri4-6.html",
    "jss1-3": "https://nerdc.gov.ng/content_manager/jss1-3.html",
    "aep": "https://nerdc.gov.ng/content_manager/aep.html",
}

def download_pdf(url, filename):
    """Download ``url`` and store it as ``filename`` inside ``save_dir``.

    This is best-effort: any failure is reported with a printed message and
    the function returns normally instead of raising.

    Args:
        url: Address of the file to fetch.
        filename: Name of the file to create inside ``save_dir``.

    Returns:
        None.
    """
    path = os.path.join(save_dir, filename)
    # Stream into a temporary sibling file so an interrupted download never
    # leaves a truncated file (or clobbers an earlier good one) at `path`.
    partial_path = path + ".part"
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, path)
        print(f"Downloaded {filename}")
    except Exception as e:
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                # Cleanup is best-effort too; the failure is still reported below.
                pass
        print(f"Failed {filename}: {e}")

# Crawl each listing page, collect PDF links (direct ".pdf" hrefs and PDFs
# embedded in an iframe on "view_" pages), then download them.
# try/finally guarantees the browser is shut down even if a page load fails.
try:
    for key, page in urls.items():
        print(f"\n Visiting {page}")
        driver.get(page)
        time.sleep(3)  # fixed pause before reading the page source

        # Parse a snapshot of the listing page; navigating the driver below
        # does not affect the already-parsed soup.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        links = []

        # Find any embedded/view links
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.lower().endswith(".pdf"):
                links.append(urljoin(base_url, href))
            elif "view_" in href:
                view_page = urljoin(base_url, href)
                driver.get(view_page)
                try:
                    iframe = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "iframe"))
                    )
                    pdf_url = iframe.get_attribute("src")
                    if pdf_url:
                        pdf_full = urljoin(base_url, pdf_url)
                        links.append(pdf_full)
                        print(f"Found embedded PDF → {pdf_full.split('/')[-1]}")
                except Exception:
                    # Best-effort per page; unlike a bare `except:` this lets
                    # KeyboardInterrupt/SystemExit stop the crawl.
                    print(f"Couldn’t read iframe from {view_page}")
                driver.back()
                time.sleep(2)

        print(f"Found {len(links)} PDF(s) on {page}")

        for link in links:
            # Keep only what follows "?pdf=" (if present) and take its base name.
            filename = os.path.basename(link.split("?pdf=")[-1])
            if not filename.lower().endswith(".pdf"):
                filename += ".pdf"
            # Prefix with the page key so files from different pages don't collide.
            download_pdf(link, f"{key}_{filename}")
finally:
    driver.quit()

print("\n Done — check curriculum_pdfs folder.")

In [None]:
import re
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import spacy
from tqdm import tqdm

# Update this path to your Tesseract installation.
# It is only applied when the file exists; otherwise pytesseract keeps its
# default command, so machines with tesseract on PATH still work.
tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.exists(tesseract_exe):
    pytesseract.pytesseract.tesseract_cmd = tesseract_exe

nlp = spacy.load("en_core_web_sm")

# Same folder the download step saves the PDFs into.
pdf_folder = "../data/curriculum_pdfs"
# Accumulates one dict per extracted curriculum topic.
rows = []

def extract_text_from_pdf(pdf_path):
    """OCR every page of the PDF at ``pdf_path`` and return the combined text.

    Pages are rendered to images at 300 dpi and passed to Tesseract with the
    English language model; each page's text is followed by a newline.
    """
    page_images = convert_from_path(pdf_path, dpi=300)
    page_texts = [
        pytesseract.image_to_string(image, lang='eng') + "\n"
        for image in page_images
    ]
    return "".join(page_texts)

def extract_curriculum_info(text, filename, sink=None):
    """Extract level/subject/topic rows from OCR text and append them to a list.

    Args:
        text: Text of one curriculum document.
        filename: Source file name; its stem is used as the subject when no
            known subject name is found in ``text``.
        sink: Optional list that receives the row dicts. When omitted, rows
            are appended to the module-level ``rows`` list.

    Returns:
        None. One dict with keys Level, Subject, Topic, Subtopic and Objective
        is appended per detected topic.
    """
    target = rows if sink is None else sink

    subject_match = re.search(r"(English|Mathematics|Basic Science|Social Studies|French|Igbo|Yoruba|Hausa|Arabic|Islamic|Prevocational|Civic|History)", text, re.I)
    level_match = re.search(r"(Primary\s?\d?[-–]?\d?|JSS\s?\d?[-–]?\d?|AEP)", text, re.I)
    subject = subject_match.group(0) if subject_match else os.path.splitext(filename)[0]
    level = level_match.group(0) if level_match else "Unknown"

    # The lookbehind stops "Topic" from matching inside "Sub-Topic"/"Subtopic"
    # and "Unit" inside words such as "Community"; the lookahead rejects
    # "Units"/"United" while still allowing "Topic1".
    topics = re.findall(r"(?<![\w-])(?:Topic|Unit)(?![A-Za-z])\s*\d*[:\-]?\s*(.+)", text, re.I)
    subtopics = re.findall(r"(?:Sub-Topic|Subtopic)\s*\d*[:\-]?\s*(.+)", text, re.I)
    # The lookbehind keeps "Aim" from matching inside words such as "claim".
    objectives = re.findall(r"(?<![A-Za-z])(?:Objective|Learning Outcome|Aim)\s*[:\-]?\s*(.+)", text, re.I)

    if not topics:
        # Fallback: use the first few sentences longer than three words as topics.
        # The spaCy pipeline is only run here because nothing else needs it.
        doc = nlp(text)
        chunks = [sent.text for sent in doc.sents if len(sent.text.split()) > 3]
        topics = chunks[:5]

    # Subtopics and objectives are paired with topics by position only.
    for i, t in enumerate(topics):
        target.append({
            "Level": level,
            "Subject": subject,
            "Topic": t.strip(),
            "Subtopic": subtopics[i] if i < len(subtopics) else "",
            "Objective": objectives[i] if i < len(objectives) else ""
        })

# OCR every PDF in the folder and collect the extracted rows.
pdf_names = [entry for entry in os.listdir(pdf_folder) if entry.endswith(".pdf")]
for pdf in tqdm(pdf_names):
    pdf_path = os.path.join(pdf_folder, pdf)
    text = extract_text_from_pdf(pdf_path)
    extract_curriculum_info(text, pdf)

# Write all collected rows to a single CSV in the working directory.
df = pd.DataFrame(rows)
df.to_csv("curriculum_units.csv", index=False)
print("Done — curriculum_units.csv created successfully.")

In [None]:
import pandas as pd
import json
import uuid
import re

# --- File path
file_path = "curriculum.txt"

# --- Helper function to generate unique IDs
def gen_id():
    """Produce a new random version-4 UUID rendered as a string."""
    return f"{uuid.uuid4()}"

# --- Read the text file
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

curriculum_units = []

# --- Define levels
# Each regex captures the text between a level's heading and the next
# level's heading (or the end of the file).
levels = {
    "Primary 1-3": r"Primary 1-3 Basic Education Subject List(.*?)(?=Primary 4-6|JSS|SSS|$)",
    "Primary 4-6": r"Primary 4-6(.*?)(?=Junior Secondary School|JSS|SSS|$)",
    "JSS 1-3": r"Junior Secondary School 1(.*?)(?=Senior Secondary School|SSS|$)",
    "SSS": r"Senior Secondary School Subject List(.*)$"
}

# --- Extract subjects for each level
for level_name, pattern in levels.items():
    # DOTALL lets "." span newlines so a section can cover many lines.
    match = re.search(pattern, text, re.DOTALL)
    if not match:
        continue
    content = match.group(1)
    # Split by line numbers or bullet points
    for line in re.split(r"\n\d+\s|•\s|[0-9]+\.\s", content):
        line = line.strip()
        if not line:
            continue
        curriculum_units.append({
            "id": gen_id(),
            "title": line[:100],  # limit title length
            # first word as subject (line is non-empty here, so split() has an element)
            "subject": line.split()[0],
            "grade_level": level_name,
            "source_doc": file_path,
            "canonical_learning_outcomes": json.dumps({"outcomes": []})
        })

# --- Convert to DataFrame
df_curriculum = pd.DataFrame(curriculum_units)

# --- Save CSV
# Relative to the notebook folder, like the other ../data outputs in this notebook.
df_curriculum.to_csv("../data/curriculum_units.csv", index=False)
df_curriculum.head(20)


In [None]:
# Relative to the notebook folder, like the other ../data files in this notebook.
csv_path = "../data/curriculum_units.csv"

# pandas writes CSVs as UTF-8 by default, so try that first; decoding such a
# file as latin-1 would garble any non-ASCII characters.
# Fall back to latin-1 only for files saved in a legacy Windows encoding.
try:
    df = pd.read_csv(csv_path, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding="latin-1")

In [None]:
def gen_id():
    """Return a new random UUID (uuid4) as a string."""
    return str(uuid.uuid4())

# Normalise the id column to strings, with missing values becoming "".
df["id"] = df["id"].fillna("").astype(str)
# Give every blank (empty or whitespace-only) id a fresh UUID.
df["id"] = [gen_id() if current.strip() == "" else current for current in df["id"]]

# Keep the first occurrence of each id and replace every later repeat.
repeat_flags = df["id"].duplicated()
if repeat_flags.any():
    df["id"] = [
        gen_id() if is_repeat else current
        for current, is_repeat in zip(df["id"], repeat_flags)
    ]

df.to_csv(csv_path, index=False, encoding="utf-8")
print("All rows now have ID populated successfully.")

In [None]:

# Paths are relative to the notebook folder, like the other ../data files.
df_curriculum = pd.read_csv("../data/curriculum_units.csv", encoding="utf-8")
df_teachers = pd.read_csv("../data/teachers.csv", encoding="utf-8")

# Function to generate unique IDs
def gen_id():
    """Return a new random UUID (uuid4) as a string."""
    return str(uuid.uuid4())

# Plain lists for random.choice: indexing a pandas Series by integer is a
# label lookup, which only works while the frame has a default 0..n-1 index.
curriculum_unit_ids = df_curriculum["id"].tolist()
teacher_ids = df_teachers["id"].tolist()

# Generate lessons
lessons = []
for _ in range(2800):
    lessons.append({
        "id": gen_id(),
        "curriculum_unit_id": random.choice(curriculum_unit_ids),
        "teacher_id": random.choice(teacher_ids),
        # Nested structures are stored as JSON strings so they fit in a CSV cell.
        "content": json.dumps({"body": fake.paragraph(nb_sentences=4)}),
        "assets": json.dumps({"images": [fake.image_url() for _ in range(2)]}),
        "lesson_metadata": json.dumps({
            "duration": f"{random.choice([30, 45, 60])} mins",
            "objectives": fake.sentences(nb=random.randint(1,3))
        }),
        "generated_at": fake.date_time_this_year().isoformat()
    })

# Convert to DataFrame
df_lessons = pd.DataFrame(lessons)

# Save to CSV
df_lessons.to_csv("../data/lessons.csv", index=False)

# Quick check
df_lessons.head()

In [None]:
# Load question bank (path is relative to the notebook folder)
json_path = "questions_bank.json"
with open(json_path, "r", encoding="utf-8") as f:
    questions_bank = json.load(f)

# Convert once, outside the loop: rebuilding the record list from the frame
# on every iteration repeats the same conversion thousands of times.
lesson_records = df_lessons.to_dict("records")
bank_subjects = list(questions_bank.keys())

# Generate assessments
assessments = []

for _ in range(3896):  # total assessments
    lesson = random.choice(lesson_records)
    # Try to pick a subject from lesson if available, otherwise random subject.
    # NOTE(review): the lesson records built in this notebook have no
    # "subject" field, so this always falls back to a random subject.
    subject = lesson.get("subject") or random.choice(bank_subjects)
    question_pool = questions_bank.get(subject, [])

    if not question_pool:  # fallback in case subject not in bank
        # Skipped iterations mean fewer than 3896 assessments may be produced.
        continue

    num_questions = random.randint(2, 4)
    sampled = random.sample(question_pool, min(num_questions, len(question_pool)))

    assessments.append({
        "id": gen_id(),
        "lesson_id": lesson["id"],
        "items": json.dumps({
            "questions": [q["question"] for q in sampled],
            "answers": [q["answer"] for q in sampled]
        })
    })

# Convert to DataFrame
df_assessments = pd.DataFrame(assessments)

# Save to CSV
df_assessments.to_csv("../data/assessments.csv", index=False)

# Quick check
df_assessments.head()

In [None]:
skills = ["AI basics", "Python for teaching", "Robotics intro", "Lesson design", "STEM integration", "Excel for educators", "Words", "PowerPoint", "Google Classroom", "Digital literacy"]

# Plain list for random.choice: indexing a pandas Series by integer is a
# label lookup, which only works while the frame has a default 0..n-1 index.
teacher_ids = df_teachers["id"].tolist()

# One row per (teacher, skill) practice record; teachers and skills may repeat.
progress = []
for _ in range(13200):
    progress.append({
        "id": gen_id(),
        "teacher_id": random.choice(teacher_ids),
        "skill": random.choice(skills),
        "level": random.choice(["beginner", "intermediate", "advanced"]),
        "last_practiced": fake.date_time_this_year()
    })

df_progress = pd.DataFrame(progress)
# Relative to the notebook folder, like the other ../data outputs.
df_progress.to_csv("../data/teacher_progress.csv", index=False)
df_progress.head()

In [None]:
actions = ["generate_lesson", "refine_prompt", "download_pdf", "login", "logout"]

# Plain list for random.choice: indexing a pandas Series by integer is a
# label lookup, which only works while the frame has a default 0..n-1 index.
teacher_ids = df_teachers["id"].tolist()

# Generate audit logs
audit = []
for _ in range(11780):
    audit.append({
        "id": gen_id(),
        "user_id": random.choice(teacher_ids),
        "action": random.choice(actions),
        "prompt_hash": fake.sha1(),
        "model_used": random.choice(["gpt-4o-mini", "gpt-4", "mistral", "claude"]),
        "output_ref": fake.uri(),
        "timestamp": fake.date_time_this_year()
    })

# Convert to DataFrame
df_audit = pd.DataFrame(audit)

# Save to CSV (relative to the notebook folder, like the other ../data outputs)
df_audit.to_csv("../data/audit_log.csv", index=False)

# Quick check
df_audit.head()

In [4]:
# --- Connect to DB (path is relative to the notebook folder)
db_path = "../db/edu_ai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# 1. Add new columns if they don't exist
# Each column is checked on its own, so a partially migrated table (only one
# of the two columns present) is completed, and real errors are not hidden
# behind a generic "may already exist" message.
# PRAGMA table_info returns one row per column; index 1 is the column name.
existing_columns = {info[1] for info in cursor.execute("PRAGMA table_info(teacher_progress);")}
for column in ("chapter_reached", "total_chapters"):
    if column in existing_columns:
        print(f"Column '{column}' already exists; skipping.")
    else:
        # Column names come from the fixed tuple above, never from user input.
        cursor.execute(f"ALTER TABLE teacher_progress ADD COLUMN {column} INTEGER;")
        print(f"Added '{column}' column.")

# 2. Backfill data with random values
progress_ids = [record[0] for record in cursor.execute("SELECT id FROM teacher_progress").fetchall()]
updates = []
for row_id in progress_ids:
    total = random.randint(5, 15)
    reached = random.randint(1, total)  # never exceeds the chapter count
    updates.append((total, reached, row_id))

# One batched call instead of a separate execute() per row.
cursor.executemany("""
    UPDATE teacher_progress
    SET total_chapters = ?, chapter_reached = ?
    WHERE id = ?;
""", updates)

# The connection stays open because the following cell queries through `conn`.
conn.commit()

✅ Added 'chapter_reached' and 'total_chapters' columns.


In [6]:
# test the update
# Pull a handful of rows through the open connection to eyeball the new columns.
preview_query = "SELECT * FROM teacher_progress LIMIT 5;"
df_test = pd.read_sql_query(preview_query, conn)
df_test

Unnamed: 0,id,teacher_id,skill,level,last_practiced,chapter_reached,total_chapters
0,c7626b77-86be-4a78-8f42-754ec6736e21,e59fc076-f6f8-4b74-b556-9e3ec0d2e336,Lesson design,intermediate,2025-09-22 16:24:59,7,11
1,289dcf8a-d7ac-412a-9b95-cd201e3af877,0e97f849-b145-4580-924e-51dbb636ff84,PowerPoint,beginner,2025-09-08 00:25:56,3,9
2,836765fc-751b-464b-aab6-0ff9645f88fa,51b926ab-0081-4cc9-83c9-f5b08edd4960,Words,intermediate,2025-07-18 17:04:04,9,14
3,f7b98dbc-f14a-4d5b-a142-d8dd0498ba0b,a111e987-763c-460a-9875-767c06987111,Excel for educators,advanced,2025-08-29 22:40:41,5,13
4,3877b5ba-0335-4c1c-8495-3000bb447421,2c4917f6-eb1d-4c4b-98e6-0f8808b862c2,PowerPoint,intermediate,2025-07-03 19:35:49,7,7
