In [7]:
# --- Imports
import pandas as pd
import json
import random
from faker import Faker
from datetime import datetime, timedelta
import uuid
import os
import requests
from bs4 import BeautifulSoup

# --- Initialize Faker with Nigerian locale
fake = Faker("en_NG")

# --- Ensure data folder exists
os.makedirs("../data", exist_ok=True)

# --- Helper
def gen_id():
    return str(uuid.uuid4())


In [2]:
# -- Generate Synthetic Schools Data
regions = ["North Central", "North East", "North West", "South East", "South South", "South West"]
device_types = ["tablet", "laptop", "desktop"]
connectivity_levels = ["offline", "intermittent", "online"]

# Common Nigerian school name patterns
school_patterns = [
    "{} College",
    "{} Grammar School",
    "{} Secondary School",
    "{} Academy",
    "St. {} High School",
    "{} Comprehensive School",
    "Command Secondary School, {}",
    "Federal Government College, {}",
    "{} Memorial School"
]

# Generate synthetic schools
schools = []
for _ in range(372):
    name_base = fake.first_name() if random.random() > 0.4 else fake.last_name()
    region = random.choice(regions)
    city = fake.city()

    # Select a pattern and fill with Nigerian context
    pattern = random.choice(school_patterns)
    school_name = pattern.format(name_base, city)

    # Adjust connectivity bias (simulate rural/urban)
    if region in ["North East", "North West"]:
        connectivity = random.choices(connectivity_levels, weights=[0.6, 0.3, 0.1])[0]
    elif region in ["South East", "South South"]:
        connectivity = random.choices(connectivity_levels, weights=[0.3, 0.4, 0.3])[0]
    else:
        connectivity = random.choices(connectivity_levels, weights=[0.2, 0.3, 0.5])[0]

    schools.append({
        "id": gen_id(),
        "name": school_name,
        "region": region,
        "device_type": random.choice(device_types),
        "connectivity": connectivity
    })

df_schools = pd.DataFrame(schools)
df_schools.to_csv("../data/schools.csv", index=False)

print(f"✅ Generated {len(df_schools)} synthetic schools.")
df_schools.sample(10)

✅ Generated 372 synthetic schools.


Unnamed: 0,id,name,region,device_type,connectivity
60,28ceea98-3c57-4740-bffc-7372bc7d53d4,Chukwu College,South West,tablet,intermittent
197,2db37ab1-c0d1-4f46-ad63-451e73cc9a8a,Mark Academy,North West,tablet,online
151,73fd08a4-14b6-43e3-830c-ffc4c8d794ba,St. Michael High School,North Central,tablet,online
92,fdb79794-63a5-4e42-957b-afcccbbfa092,Nnamani Memorial School,South East,laptop,intermittent
207,0a3d8077-748c-4bbe-9ba9-e25fe9f0c431,"Federal Government College, Obasanjo",North Central,laptop,online
350,9fa8d230-fa47-41a7-9f40-c83983ec2c5d,St. Agnes High School,South East,laptop,online
272,c0596933-7a79-4892-a365-a27d4a3991ac,"Command Secondary School, Obasanjo",North East,desktop,offline
205,c50dfa13-e0c4-4298-aee9-02885f8959ef,St. Benjamin High School,South East,laptop,offline
79,e1c203eb-8222-44b0-9624-59d04cbd19ce,Angela Grammar School,North East,laptop,offline
283,139dc393-1843-4d5e-8e6d-8a4933c01e88,Olawale Academy,North East,desktop,offline


In [3]:
tech_levels = ["low", "medium", "high"]

# helper: generate Nigerian phone number
def gen_nigerian_phone():
    prefix = random.choice(["080", "081", "070", "090", "091"])
    number = ''.join([str(random.randint(0, 9)) for _ in range(8)])
    return f"+234{prefix[1:]}{number}"

# language bias by region
region_lang_map = {
    "North Central": ["English", "Hausa"],
    "North East": ["English", "Hausa"],
    "North West": ["English", "Hausa"],
    "South East": ["English", "Igbo"],
    "South South": ["English", "Pidgin", "Igbo"],
    "South West": ["English", "Yoruba"]
}

teachers = []
for _ in range(40000):
    school_row = df_schools.sample(1).iloc[0]
    region = school_row["region"]
    
    name = fake.name()
    first = name.split()[0].lower()
    last = name.split()[-1].lower()
    email = f"{first}.{last}@eduai.ng"
    
    # bias tech level based on connectivity
    if school_row["connectivity"] == "online":
        tech_level = random.choices(tech_levels, weights=[0.1, 0.4, 0.5])[0]
    elif school_row["connectivity"] == "intermittent":
        tech_level = random.choices(tech_levels, weights=[0.3, 0.5, 0.2])[0]
    else:
        tech_level = random.choices(tech_levels, weights=[0.6, 0.3, 0.1])[0]
    
    teachers.append({
        "id": gen_id(),
        "name": name,
        "email": email,
        "phone": gen_nigerian_phone(),
        "school_id": school_row["id"],
        "languages": json.dumps(random.sample(region_lang_map[region], k=random.randint(1, len(region_lang_map[region])))),
        "tech_level": tech_level,
        "created_at": fake.date_time_this_year()
    })

df_teachers = pd.DataFrame(teachers)
df_teachers.to_csv("../data/teachers.csv", index=False)

print(f"✅ Generated {len(df_teachers)} synthetic teachers.")
df_teachers.sample(10)


✅ Generated 40000 synthetic teachers.


Unnamed: 0,id,name,email,phone,school_id,languages,tech_level,created_at
6914,24dd725f-b5a9-42a2-af79-3ea160bc283a,Joy Nnamani,joy.nnamani@eduai.ng,2349130286149,9b45145c-39ea-456b-a55b-9d3837beefa3,"[""English"", ""Hausa""]",low,2025-02-23 05:57:03
16245,8ee75f2f-1ddf-435d-b8a8-772176abcc36,Paul Uche,paul.uche@eduai.ng,2349167279433,003e92c4-a362-4fd4-9833-49865a18fe98,"[""English"", ""Igbo""]",medium,2025-06-27 01:32:31
25332,b3a68fe7-d3f1-4adf-b731-c75eccfab661,James Eze,james.eze@eduai.ng,2348010783218,20f1f966-1aa9-442a-8314-23b7ec0b5b54,"[""Hausa""]",high,2025-01-09 09:09:11
3464,3bf3a1e4-73c5-40af-a0ae-e399b233aa8d,Blessing Ekwueme,blessing.ekwueme@eduai.ng,2349192575143,ae96c64d-4b29-4534-869b-f9d95e16d874,"[""English"", ""Yoruba""]",low,2025-07-09 23:16:01
22714,1f1a6aba-c7d0-44f9-8339-34e1f12915dc,Faith Obi,faith.obi@eduai.ng,2349052295377,34859581-36f8-4209-b6c1-40ab9998a6de,"[""English"", ""Yoruba""]",high,2025-02-11 02:05:32
25245,9e1e035b-2df1-433c-b0c1-25a62fec9893,Faith Ogunleye,faith.ogunleye@eduai.ng,2348125050513,d8eaed9e-54fd-4336-8059-2cd562e3fbdf,"[""English""]",low,2025-01-11 18:20:26
7122,58443221-9869-48a8-822e-e8d8157efe49,Mary Eze,mary.eze@eduai.ng,2349127929929,3724f184-c052-4c22-bb0b-f016fc6c06db,"[""English"", ""Hausa""]",medium,2025-03-04 15:50:40
32590,e0160c69-daa8-4f56-990d-ddb571acf94a,Michael Uche,michael.uche@eduai.ng,2348036757894,6da08453-1acd-4ae1-9f25-d4c746b43e16,"[""English"", ""Hausa""]",low,2025-01-05 22:51:11
38482,e6e78655-92a0-4af1-a52e-7e4f0b0952eb,Cornelius Eze,cornelius.eze@eduai.ng,2347076650824,2a347a02-c3ea-4d76-9e14-a244245a2edc,"[""Hausa"", ""English""]",low,2025-06-26 02:47:35
10839,e29f9a05-a3c5-43a3-a090-cc47727654c0,Emmanuel Obi,emmanuel.obi@eduai.ng,2349123347184,093b8160-d893-42f6-bb8c-cf4fc762d660,"[""Igbo""]",high,2025-06-17 16:44:52


In [21]:

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# === setup ===
save_dir = "../data/curriculum_pdfs"
os.makedirs(save_dir, exist_ok=True)

chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

urls = {
    "pri1-3": "https://nerdc.gov.ng/content_manager/pri1-3.html",
    "pri4-6": "https://nerdc.gov.ng/content_manager/pri4-6.html",
    "jss1-3": "https://nerdc.gov.ng/content_manager/jss1-3.html",
    "aep": "https://nerdc.gov.ng/content_manager/aep.html",
}
base_url = "https://nerdc.gov.ng/content_manager/"

def download_pdf(url, filename):
    path = os.path.join(save_dir, filename)
    try:
        r = requests.get(url, stream=True, timeout=60)
        r.raise_for_status()
        with open(path, "wb") as f:
            for chunk in r.iter_content(1024):
                f.write(chunk)
        print(f"✅ Downloaded {filename}")
    except Exception as e:
        print(f"⚠️ Failed {filename}: {e}")

for key, page in urls.items():
    print(f"\n📖 Visiting {page}")
    driver.get(page)
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    links = []

    # 🧩 Find any embedded/view links
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.lower().endswith(".pdf"):
            links.append(urljoin(base_url, href))
        elif "view_" in href:
            view_page = urljoin(base_url, href)
            driver.get(view_page)
            try:
                iframe = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "iframe"))
                )
                pdf_url = iframe.get_attribute("src")
                if pdf_url:
                    pdf_full = urljoin(base_url, pdf_url)
                    links.append(pdf_full)
                    print(f"🧩 Found embedded PDF → {pdf_full.split('/')[-1]}")
            except:
                print(f"⚠️ Couldn’t read iframe from {view_page}")
            driver.back()
            time.sleep(2)

    print(f"📚 Found {len(links)} PDF(s) on {page}")

    for link in links:
        filename = os.path.basename(link.split("?pdf=")[-1])
        if not filename.lower().endswith(".pdf"):
            filename += ".pdf"
        download_pdf(link, f"{key}_{filename}")

driver.quit()
print("\n🎯 Done — check curriculum_pdfs folder.")


📖 Visiting https://nerdc.gov.ng/content_manager/pri1-3.html
📚 Found 0 PDF(s) on https://nerdc.gov.ng/content_manager/pri1-3.html

📖 Visiting https://nerdc.gov.ng/content_manager/pri4-6.html
📚 Found 0 PDF(s) on https://nerdc.gov.ng/content_manager/pri4-6.html

📖 Visiting https://nerdc.gov.ng/content_manager/jss1-3.html
📚 Found 0 PDF(s) on https://nerdc.gov.ng/content_manager/jss1-3.html

📖 Visiting https://nerdc.gov.ng/content_manager/aep.html
📚 Found 21 PDF(s) on https://nerdc.gov.ng/content_manager/aep.html
✅ Downloaded aep_AEP Basic Science and Tech Level 1 Version A.pdf
✅ Downloaded aep_AEP Basic Science and Tech Level 2 Version A.pdf
✅ Downloaded aep_AEP Basic Science and Tech Level 3 Version A.pdf
✅ Downloaded aep_AEP English Level 1 Version A.pdf
✅ Downloaded aep_AEP English Level 2 Version A.pdf
✅ Downloaded aep_AEP English Level 3 Version A.pdf
✅ Downloaded aep_AEP YORUBA LEVEL 1 Version A.pdf
✅ Downloaded aep_AEP YORUBA LEVEL 2 Version A.pdf
✅ Downloaded aep_AEP YORUBA LEVEL 

In [30]:
import re
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import spacy
from tqdm import tqdm

# Update this path to your Tesseract installation
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

nlp = spacy.load("en_core_web_sm")

pdf_folder = "curriculum_pdfs"
rows = []

def extract_text_from_pdf(pdf_path):
    pages = convert_from_path(pdf_path, dpi=300)
    text = ""
    for page in pages:
        text += pytesseract.image_to_string(page, lang='eng') + "\n"
    return text

def extract_curriculum_info(text, filename):
    doc = nlp(text)
    subject = re.search(r"(English|Mathematics|Basic Science|Social Studies|French|Igbo|Yoruba|Hausa|Arabic|Islamic|Prevocational|Civic|History)", text, re.I)
    level = re.search(r"(Primary\s?\d?[-–]?\d?|JSS\s?\d?[-–]?\d?|AEP)", text, re.I)
    subject = subject.group(0) if subject else os.path.splitext(filename)[0]
    level = level.group(0) if level else "Unknown"
    topics = re.findall(r"(?:Topic|Unit)\s*\d*[:\-]?\s*(.+)", text, re.I)
    subtopics = re.findall(r"(?:Sub-Topic|Subtopic)\s*\d*[:\-]?\s*(.+)", text, re.I)
    objectives = re.findall(r"(?:Objective|Learning Outcome|Aim)\s*[:\-]?\s*(.+)", text, re.I)

    if not topics:
        chunks = [sent.text for sent in doc.sents if len(sent.text.split()) > 3]
        topics = chunks[:min(len(chunks), 5)]

    for i, t in enumerate(topics):
        rows.append({
            "Level": level,
            "Subject": subject,
            "Topic": t.strip(),
            "Subtopic": subtopics[i] if i < len(subtopics) else "",
            "Objective": objectives[i] if i < len(objectives) else ""
        })

for pdf in tqdm([f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]):
    text = extract_text_from_pdf(os.path.join(pdf_folder, pdf))
    extract_curriculum_info(text, pdf)

df = pd.DataFrame(rows)
df.to_csv("curriculum_units.csv", index=False)
print("✅ Done — curriculum_units.csv created successfully.")

0it [00:00, ?it/s]

✅ Done — curriculum_units.csv created successfully.





In [31]:
import pandas as pd
import json
import uuid
import re

# --- File path
file_path = "curriculum.txt"

# --- Helper function to generate unique IDs
def gen_id():
    return str(uuid.uuid4())

# --- Read the text file
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

curriculum_units = []

# --- Define levels
levels = {
    "Primary 1-3": r"Primary 1-3 Basic Education Subject List(.*?)(?=Primary 4-6|JSS|SSS|$)",
    "Primary 4-6": r"Primary 4-6(.*?)(?=Junior Secondary School|JSS|SSS|$)",
    "JSS 1-3": r"Junior Secondary School 1(.*?)(?=Senior Secondary School|SSS|$)",
    "SSS": r"Senior Secondary School Subject List(.*)$"
}

# --- Extract subjects for each level
for level_name, pattern in levels.items():
    match = re.search(pattern, text, re.DOTALL)
    if match:
        content = match.group(1)
        # Split by line numbers or bullet points
        lines = re.split(r"\n\d+\s|•\s|[0-9]+\.\s", content)
        for line in lines:
            line = line.strip()
            if line:
                curriculum_units.append({
                    "id": gen_id(),
                    "title": line[:100],  # limit title length
                    "subject": line.split()[0] if len(line.split())>0 else line,  # first word as subject
                    "grade_level": level_name,
                    "source_doc": file_path,
                    "canonical_learning_outcomes": json.dumps({"outcomes": []})
                })

# --- Convert to DataFrame
df_curriculum = pd.DataFrame(curriculum_units)

# --- Save CSV
df_curriculum.to_csv(r"C:\Users\HP\Desktop\EduAi\data\curriculum_units.csv", index=False)
df_curriculum.head(20)


Unnamed: 0,id,title,subject,grade_level,source_doc,canonical_learning_outcomes
0,c541f986-ae45-4b46-8a87-94dffd6ee183,9 – 10 subjects 15 – 17 subjects,9,Primary 4-6,curriculum.txt,"{""outcomes"": []}"
1,9e9deda2-5e14-4f72-bf46-8b805311477d,English Studies English Studies English Studie...,English,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
2,f575bebe-0447-45f7-b462-d4ef1154858d,Hausa,Hausa,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
3,c92113a2-dbce-405d-be23-3b1f54ed2bec,Yoruba,Yoruba,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
4,c8be543c-6e6b-4b26-913f-d7384d8f0b09,Igbo (One Language only),Igbo,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
5,1ec8ee07-16fd-4cd2-90a1-cb8f4d97fafd,Hausa,Hausa,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
6,96ad7187-d57c-4e88-b236-c4ee8363c41c,Yoruba,Yoruba,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
7,40f59546-68f4-4d2b-b36f-f2b52b1da775,Igbo (One Language only),Igbo,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
8,d2288c36-e28a-4e4a-9780-26231fdd696d,Hausa,Hausa,JSS 1-3,curriculum.txt,"{""outcomes"": []}"
9,4275b582-0e89-42d1-bd4e-c2dd53c34803,Yoruba,Yoruba,JSS 1-3,curriculum.txt,"{""outcomes"": []}"


In [39]:
csv_path = r"C:\Users\HP\Desktop\EduAi\data\curriculum_units.csv"

# Use latin-1 to handle Windows encoding
df = pd.read_csv(csv_path, encoding="latin-1")

In [40]:
def gen_id():
    return str(uuid.uuid4())

df["id"] = df["id"].fillna("").astype(str)
df["id"] = df["id"].apply(lambda x: x if x.strip() != "" else gen_id())

if df["id"].duplicated().any():
    seen = set()
    new_ids = []
    for val in df["id"]:
        if val in seen:
            new_ids.append(gen_id())
        else:
            new_ids.append(val)
            seen.add(val)
    df["id"] = new_ids

df.to_csv(csv_path, index=False, encoding="utf-8")
print("✅ All rows now have ID populated successfully.")

✅ All rows now have ID populated successfully.


In [41]:

df_curriculum = pd.read_csv(r"C:\Users\HP\Desktop\EduAi\data\curriculum_units.csv", encoding="utf-8")
df_teachers = pd.read_csv(r"C:\Users\HP\Desktop\EduAi\data\teachers.csv", encoding="utf-8")

# Function to generate uniqu IDs
def gen_id():
    return str(uuid.uuid4())

# Generate lessons
lessons = []
for _ in range(2800): 
    lessons.append({
        "id": gen_id(),
        "curriculum_unit_id": random.choice(df_curriculum["id"]),
        "teacher_id": random.choice(df_teachers["id"]),
        "content": json.dumps({"body": fake.paragraph(nb_sentences=4)}),
        "assets": json.dumps({"images": [fake.image_url() for _ in range(2)]}),
        "lesson_metadata": json.dumps({
            "duration": f"{random.choice([30, 45, 60])} mins",
            "objectives": fake.sentences(nb=random.randint(1,3))
        }),
        "generated_at": fake.date_time_this_year().isoformat()
    })

# Convert to DataFrame
df_lessons = pd.DataFrame(lessons)

# Save to CSV
df_lessons.to_csv(r"C:\Users\HP\Desktop\EduAi\data\lessons.csv", index=False)

# Quick check
df_lessons.head()

Unnamed: 0,id,curriculum_unit_id,teacher_id,content,assets,lesson_metadata,generated_at
0,6a64edb0-35fc-4f22-93a1-0d97d3740b12,10783124-a5d8-4032-9e38-616f5b9d7ba3,d35e0db6-cbea-452b-9789-0f9b2924ddd7,"{""body"": ""Libero distinctio optio vero. Iure q...","{""images"": [""https://picsum.photos/232/576"", ""...","{""duration"": ""45 mins"", ""objectives"": [""Hic an...",2025-09-28T01:02:32
1,1f8aeffa-d9e9-498d-b116-2f57e03771db,abce7c80-c97b-41c0-a86f-109f98920f91,79463430-c1cb-4afd-972d-2867a2370123,"{""body"": ""In dolore natus accusantium. Odit cu...","{""images"": [""https://placekitten.com/49/713"", ...","{""duration"": ""60 mins"", ""objectives"": [""Fugit ...",2025-02-16T05:09:16
2,51d1c088-9979-410d-84f7-811ee0d38c6b,abce7c80-c97b-41c0-a86f-109f98920f91,44c2f51f-7bc5-41fd-89a1-6ea1b56ef005,"{""body"": ""Deleniti commodi ab quod amet. Fugia...","{""images"": [""https://picsum.photos/677/439"", ""...","{""duration"": ""45 mins"", ""objectives"": [""Tenetu...",2025-04-14T08:22:20
3,c84e5c5c-d6bf-430d-88ae-6c7102f333dc,10783124-a5d8-4032-9e38-616f5b9d7ba3,f119061c-d353-4283-a077-d8232ebb26bb,"{""body"": ""Sequi enim nisi qui. Impedit sed dis...","{""images"": [""https://picsum.photos/142/761"", ""...","{""duration"": ""30 mins"", ""objectives"": [""Conseq...",2025-05-19T19:38:46
4,8fa3d92c-9140-464e-ae61-23792cf5c9b1,20b9de58-9a2d-4338-a65e-0ced7e7eefbb,e580c391-3417-4f67-8d4f-0529025d5f76,"{""body"": ""Non dignissimos animi inventore temp...","{""images"": [""https://dummyimage.com/991x187"", ...","{""duration"": ""30 mins"", ""objectives"": [""Beatae...",2025-06-21T16:00:40


In [43]:
# Load question bank
json_path = r"C:\Users\HP\Desktop\EduAi\notebooks\questions_bank.json"
with open(json_path, "r", encoding="utf-8") as f:
    questions_bank = json.load(f)

# Generate assessments
assessments = []

for _ in range(3896):  # total assessments
    lesson = random.choice(df_lessons.to_dict("records"))
    # Try to pick a subject from lesson if available, otherwise random subject
    subject = lesson.get("subject") or random.choice(list(questions_bank.keys()))
    question_pool = questions_bank.get(subject, [])

    if not question_pool:  # fallback in case subject not in bank
        continue

    num_questions = random.randint(2, 4)
    sampled = random.sample(question_pool, min(num_questions, len(question_pool)))

    assessments.append({
        "id": gen_id(),
        "lesson_id": lesson["id"],
        "items": json.dumps({
            "questions": [q["question"] for q in sampled],
            "answers": [q["answer"] for q in sampled]
        })
    })

# Convert to DataFrame
df_assessments = pd.DataFrame(assessments)

# Save to CSV
df_assessments.to_csv(r"C:\Users\HP\Desktop\EduAi\data\assessments.csv", index=False)

# Quick check
df_assessments.head()

Unnamed: 0,id,lesson_id,items
0,961f96d3-4516-4b0e-b86c-0ef166e51593,233d4067-f635-4752-a666-a4a8b87f972c,"{""questions"": [""What is sin(90\u00b0)?"", ""What..."
1,d44cc0c8-eab1-45c1-a7da-679e0529340e,77f19a1f-7715-47c8-9d78-7f1889d225c7,"{""questions"": [""What does 'bias' mean in fabri..."
2,4327c967-3504-4bde-868b-a6e4ed45592d,c5f711fb-0b85-4fbb-a48c-02a7f76328e2,"{""questions"": [""What is sin(90\u00b0)?"", ""What..."
3,35124a2a-5b0b-4684-8173-27a5da1615e7,b0faca8d-7aba-492f-987e-c9df8e2d0f98,"{""questions"": [""What is market research?"", ""De..."
4,d2f8d944-61ed-472e-b3ac-587afff9af75,8eeeed20-7cf7-4f45-ad8d-c9591a6b8629,"{""questions"": [""What is punctuation?"", ""Define..."


In [48]:
skills = ["AI basics", "Python for teaching", "Robotics intro", "Lesson design", "STEM integration", "Excel for educators", "Words", "PowerPoint", "Google Classroom", "Digital literacy"]

progress = []
for _ in range(13200):
    progress.append({
        "id": gen_id(),
        "teacher_id": random.choice(df_teachers["id"]),
        "skill": random.choice(skills),
        "level": random.choice(["beginner", "intermediate", "advanced"]),
        "last_practiced": fake.date_time_this_year()
    })

df_progress = pd.DataFrame(progress)
df_progress.to_csv(r"C:\Users\HP\Desktop\EduAi\data\teacher_progress.csv", index=False)
df_progress.head()

Unnamed: 0,id,teacher_id,skill,level,last_practiced
0,c7626b77-86be-4a78-8f42-754ec6736e21,e59fc076-f6f8-4b74-b556-9e3ec0d2e336,Lesson design,intermediate,2025-09-22 16:24:59
1,289dcf8a-d7ac-412a-9b95-cd201e3af877,0e97f849-b145-4580-924e-51dbb636ff84,PowerPoint,beginner,2025-09-08 00:25:56
2,836765fc-751b-464b-aab6-0ff9645f88fa,51b926ab-0081-4cc9-83c9-f5b08edd4960,Words,intermediate,2025-07-18 17:04:04
3,f7b98dbc-f14a-4d5b-a142-d8dd0498ba0b,a111e987-763c-460a-9875-767c06987111,Excel for educators,advanced,2025-08-29 22:40:41
4,3877b5ba-0335-4c1c-8495-3000bb447421,2c4917f6-eb1d-4c4b-98e6-0f8808b862c2,PowerPoint,intermediate,2025-07-03 19:35:49


In [50]:
actions = ["generate_lesson", "refine_prompt", "download_pdf", "login", "logout"]

# Generate audit logs
audit = []
for _ in range(11780):
    audit.append({
        "id": gen_id(),
        "user_id": random.choice(df_teachers["id"]),
        "action": random.choice(actions),
        "prompt_hash": fake.sha1(),
        "model_used": random.choice(["gpt-4o-mini", "gpt-4", "mistral", "claude"]),
        "output_ref": fake.uri(),
        "timestamp": fake.date_time_this_year()
    })

# Convert to DataFrame
df_audit = pd.DataFrame(audit)

# Save to CSV
df_audit.to_csv(r"C:\Users\HP\Desktop\EduAi\data\audit_log.csv", index=False)

# Quick check
df_audit.head()

Unnamed: 0,id,user_id,action,prompt_hash,model_used,output_ref,timestamp
0,816c3a05-21ec-4d8d-98d0-935083de5362,6f6fdc97-aacb-4ef0-912a-e9833e5babbd,login,db85c113a9429c475201d6aac095f3cdfa61c5cd,claude,https://www.oshodi.com/searchindex.htm,2025-04-07 09:16:55
1,9bb9ae06-caa3-4e6e-ba40-4a45b3a1bffe,05b7c4db-b2e1-4738-93a8-1bdcb4ce41e2,logout,53246890b4b5293e677756f28503422097415575,claude,https://nnamani.biz/app/categorylogin.jsp,2025-05-21 12:11:52
2,f6a6a0fa-6b98-43f6-bc61-1621921a87d6,f72b9ee4-4b68-4a87-9a04-73c15cf10dcb,generate_lesson,299c7e0014685639f76064e604bd8299c497749b,claude,http://chukwu.com/category/mainterms.html,2025-04-28 05:53:41
3,eacce60e-e68e-4383-8792-720b7f78db1f,99c8e8b6-4ce5-4c9c-b288-2bde9e0faa6a,login,483071e09748bd9d91fe5afc39a2bfebc38f3d16,gpt-4,https://abiola.com/tag/categories/blogterms.html,2025-10-04 12:56:26
4,75e41f67-fac2-420d-92be-3fa3f48a3125,63eaa4c4-c37a-47c4-af1e-6f83fd102915,logout,e89164cb32b827902de0794435679d4ebd406fed,mistral,http://ibrahim-akinwale.com/app/categoriessear...,2025-06-12 13:34:58
