In [1]:
import pandas as pd
import numpy as np
import random

np.random.seed(42)
random.seed(42)

NUM_RECORDS = 5500


In [2]:
students = pd.DataFrame({
    "student_id": [f"S{100000+i}" for i in range(NUM_RECORDS)],
    "gender": np.random.choice(["Male", "Female"], NUM_RECORDS),
    "date_of_birth": pd.to_datetime(
        np.random.choice(pd.date_range("2000-01-01", "2008-12-31"), NUM_RECORDS)
    ),
    "region": np.random.choice(["North", "South", "East", "West", "Central"], NUM_RECORDS),
    "school_id": np.random.choice([f"SCH_{i}" for i in range(1, 21)], NUM_RECORDS),
    "grade_level": np.random.randint(6, 13, NUM_RECORDS),
    "enrollment_year": np.random.choice([2019, 2020, 2021, 2022, 2023], NUM_RECORDS),
    "program_type": np.random.choice(["Science", "Commerce", "Arts"], NUM_RECORDS)
})

students.to_csv(r"D:\capstone_project\Datasets\students_master.csv", index=False)


In [3]:
academic = pd.DataFrame({
    "student_id": students["student_id"],
    "academic_year": np.random.choice(["2019-20", "2020-21", "2021-22", "2022-23"], NUM_RECORDS),
    "semester": np.random.choice([1, 2], NUM_RECORDS),
    "avg_score": np.round(np.random.normal(65, 15, NUM_RECORDS), 2),
    "failed_subjects": np.random.choice([0, 1, 2, 3], NUM_RECORDS, p=[0.6, 0.2, 0.15, 0.05])
})

academic["avg_score"] = academic["avg_score"].clip(30, 95)
academic["improvement_flag"] = academic["avg_score"] > 60

academic["performance_band"] = pd.cut(
    academic["avg_score"],
    bins=[0, 50, 70, 100],
    labels=["Low", "Medium", "High"]
)

academic.to_csv(r"D:\capstone_project\Datasets\academic_performance.csv", index=False)


In [4]:
attendance = pd.DataFrame({
    "student_id": students["student_id"],
    "academic_year": academic["academic_year"],
    "total_days": 180,
    "days_present": np.random.randint(60, 180, NUM_RECORDS)
})

attendance["attendance_pct"] = np.round(
    (attendance["days_present"] / attendance["total_days"]) * 100, 2
)

attendance["chronic_absentee"] = attendance["attendance_pct"] < 75

attendance["attendance_trend"] = np.random.choice(
    ["Improving", "Stable", "Declining"],
    NUM_RECORDS,
    p=[0.3, 0.4, 0.3]
)

attendance.to_csv(r"D:\capstone_project\Datasets\attendance_records.csv", index=False)


In [5]:
socio = pd.DataFrame({
    "student_id": students["student_id"],
    "family_income_band": np.random.choice(
        ["Low", "Middle", "High"],
        NUM_RECORDS,
        p=[0.4, 0.4, 0.2]
    ),
    "parental_education": np.random.choice(
        ["No Formal Education", "High School", "Graduate", "Post Graduate"],
        NUM_RECORDS
    ),
    "access_to_internet": np.random.choice([True, False], NUM_RECORDS, p=[0.7, 0.3]),
    "working_part_time": np.random.choice([True, False], NUM_RECORDS, p=[0.25, 0.75]),
    "scholarship_status": np.random.choice([True, False], NUM_RECORDS, p=[0.3, 0.7])
})

socio.to_csv(r"D:\capstone_project\Datasets\socio_economic_data.csv", index=False)


In [6]:
retention = pd.DataFrame({
    "student_id": students["student_id"],
    "academic_year": academic["academic_year"]
})

# Dropout probability logic (REALISTIC)
risk_score = (
    (academic["avg_score"] < 50).astype(int) +
    (attendance["attendance_pct"] < 75).astype(int) +
    (socio["family_income_band"] == "Low").astype(int) +
    (socio["working_part_time"]).astype(int)
)

retention["dropout_flag"] = risk_score >= 3

retention["dropout_reason"] = np.where(
    retention["dropout_flag"],
    np.random.choice(
        ["Poor Academic Performance", "Low Attendance", "Financial Issues", "Personal Reasons"],
        NUM_RECORDS
    ),
    "N/A"
)

retention["intervention_required"] = retention["dropout_flag"]

retention["retention_status"] = np.where(
    retention["dropout_flag"],
    "Dropped",
    "Retained"
)

retention.to_csv(r"D:\capstone_project\Datasets\retention_status.csv", index=False)


In [7]:
# Introduce inconsistent regions
idx = students.sample(frac=0.08).index
students.loc[idx, "region"] = students.loc[idx, "region"].str.lower()

# Missing gender
students.loc[students.sample(frac=0.03).index, "gender"] = None

# Invalid grade levels
students.loc[students.sample(frac=0.02).index, "grade_level"] = 15


In [8]:
academic.loc[academic.sample(frac=0.05).index, "avg_score"] = None
academic.loc[academic.sample(frac=0.01).index, "avg_score"] = -10
academic.loc[academic.sample(frac=0.02).index, "failed_subjects"] = 6


In [9]:
attendance.loc[attendance.sample(frac=0.03).index, "days_present"] = 200
attendance.loc[attendance.sample(frac=0.04).index, "days_present"] = None
attendance.loc[attendance.sample(frac=0.05).index, "attendance_pct"] = 150


In [10]:
socio.loc[socio.sample(frac=0.06).index, "family_income_band"] = None
socio.loc[socio.sample(frac=0.08).index, "access_to_internet"] = "Yes"
socio.loc[socio.sample(frac=0.04).index, "parental_education"] = "Unknown Level"


  socio.loc[socio.sample(frac=0.08).index, "access_to_internet"] = "Yes"


In [11]:
retention.loc[retention.sample(frac=0.02).index, "dropout_flag"] = None
retention.loc[retention.sample(frac=0.01).index, "retention_status"] = "Retained"
retention.loc[retention.sample(frac=0.03).index, "dropout_reason"] = None


  retention.loc[retention.sample(frac=0.02).index, "dropout_flag"] = None
