In [1]:
#########################################################################
#Title  : Dummy Data Generation to test Machine Learning Models
#Author : Kevin Ryan Noronha

#Editors: Disha Khurana- Changed the distribution of marks for the test data



##########################################################################

In [1]:
import pandas as pd
import numpy as np
from random import choice, sample, uniform

# Course catalogue: maps each course code to a (course title, credit points) pair.
# Built from flat (code, title, credit points) rows; row order defines key order.
subjects = {
    code: (title, credit_points)
    for code, title, credit_points in (
        ("COMM2747", "Digital Media Studio 3: Experimentation", 24),
        ("OART1013", "Interactive Media 1", 12),
        ("COMM2753", "Digital Media Specialisation 1", 12),
        ("COMM2591", "Digital Media Studio 4: Client Solutions", 24),
        ("COMM2595", "Digital Narrative Theory and Practice", 12),
        ("COMM2751", "Digital Media Specialisation 2", 12),
        ("COMM2589", "Digital Media Studio 5: Specialisation Project", 24),
        ("COMM2587", "Digital Media Professional Practice", 12),
        ("COMM2585", "Digital Media Major Project", 24),
        ("COMM2583", "Digital Media Internship", 24),
        ("COMM2749", "Digital Media Studio 1: Production", 24),
        ("COMM2755", "Digital Video", 12),
        ("COMM2301", "Media Cultures 1", 12),
        ("GRAP2588", "Digital Media Studio 2: Collaboration", 24),
        ("COMM2598", "Sound Design for Digital Media", 12),
        ("COMM2745", "Emerging Digital Cultures", 12),
    )
}

# Courses on offer in semester 1 (a fixed subset of the catalogue).
sem1_subjects = [
    "COMM2747",
    "OART1013",
    "COMM2753",
    "COMM2591",
    "COMM2595",
    "COMM2751",
]
# Semester 2 offers every course in the catalogue, in catalogue order.
sem2_subjects = list(subjects)

# Function to assign marks based on distribution
def assign_mark():
    """Draw one random integer mark from a banded distribution.

    Band probabilities:
        15%  ->  0-49
        35%  -> 50-59
        25%  -> 60-69
        15%  -> 70-79
        10%  -> 80-100

    Both the band and the mark within it are drawn from NumPy's global
    random state, so calling ``np.random.seed(...)`` beforehand is enough to
    make the sequence of marks reproducible.

    Returns:
        int: a mark between 0 and 100 inclusive.
    """
    r = np.random.random()  # uniform draw in [0, 1) that selects the band
    if r < 0.15:      # 15% below 50
        low, high = 0, 50
    elif r < 0.50:    # 35% between 50-59
        low, high = 50, 60
    elif r < 0.75:    # 25% between 60-69
        low, high = 60, 70
    elif r < 0.90:    # 15% between 70-79
        low, high = 70, 80
    else:             # 10% 80+
        low, high = 80, 101
    # randint excludes the upper bound; int() turns the NumPy scalar into a plain Python int.
    return int(np.random.randint(low, high))

# Function to determine grade
def get_grade(mark):
    """Map a numeric mark to its grade code.

    Marks below 50 are "NN", below 60 "PA", below 70 "CR", below 80 "DI",
    and anything else is "HD".
    """
    # (exclusive upper bound, grade code), checked in ascending order.
    bands = ((50, "NN"), (60, "PA"), (70, "CR"), (80, "DI"))
    for upper_bound, code in bands:
        if mark < upper_bound:
            return code
    return "HD"

# Generate data for 100 students: two semesters of enrolment records each
SEMESTER_CREDITS = 48  # credit load every student enrols in per semester


def _pick_courses(pool, n_24, n_12):
    """Return n_24 distinct 24-credit and n_12 distinct 12-credit course codes drawn at random from pool."""
    picked_24 = sample([s for s in pool if subjects[s][1] == 24], n_24)
    picked_12 = sample([s for s in pool if subjects[s][1] == 12], n_12)
    return picked_24 + picked_12


data = []
for student_id in range(10001, 10101):  # Emplid from 10001 to 10100
    name = f"Student {student_id - 10000}"

    # Sem 1 2024: 2-3 subjects totaling 48 credits
    if np.random.random() < 0.5:  # 50% chance for two 24-credit subjects
        sem1 = _pick_courses(sem1_subjects, 2, 0)
    else:  # One 24-credit and two 12-credit subjects
        sem1 = _pick_courses(sem1_subjects, 1, 2)

    # Sem 2 2024: 3-4 subjects totaling 48 credits, none repeated from Sem 1.
    # With only 12- and 24-credit subjects, the mixes that reach exactly 48 in
    # 3 or 4 subjects are 24+12+12 and 12+12+12+12, so the mix is picked
    # explicitly. Picking greedily until 48 is reached can stop after two
    # 24-credit subjects, which would break the 3-4 subject rule.
    available = [s for s in sem2_subjects if s not in sem1]
    if np.random.choice([3, 4]) == 3:
        sem2 = _pick_courses(available, 1, 2)
    else:
        sem2 = _pick_courses(available, 0, 4)

    # Sanity check: each semester must carry exactly the full credit load.
    for term_courses in (sem1, sem2):
        assert sum(subjects[c][1] for c in term_courses) == SEMESTER_CREDITS, term_courses

    # Combine semester subjects
    terms = [("Sem 1 2024", sem1), ("Sem 2 2024", sem2)]
    student_records = []
    for term, courses in terms:
        for course in courses:
            mark = assign_mark()
            grade = get_grade(mark)
            unit_value = subjects[course][1]
            # Credits are only earned when the course is passed (mark of 50 or more).
            credits_earned = unit_value if mark >= 50 else 0
            student_records.append({
                "Emplid": student_id,
                "Name": name,
                "Career": "Undergraduate",
                "Acad Program": "BP309",
                "Program Descr": "Bachelor of Design (Digital Media)",
                "Admit Term": "Sem 1 2024",
                "Acad Plan": "BP309",
                "Plan Descr": "Bachelor of Design (Digital Media)",
                "Term Descr": term,
                "Course": course,
                "Course Descr": subjects[course][0],
                "Mark": mark,
                "Grade": grade,
                "Unit Value": unit_value,
                "Credits": credits_earned
            })

    # Calculate Program Status: "Completed" only when every enrolled credit
    # across both semesters was earned, otherwise "Pending".
    total_credits = sum(r["Credits"] for r in student_records)
    program_status = "Completed" if total_credits == 2 * SEMESTER_CREDITS else "Pending"
    for record in student_records:
        record["Program Status"] = program_status
        data.append(record)

# Create DataFrame and save to Excel
output_path = "student_records_pending.xlsx"
df = pd.DataFrame(data)
df.to_excel(output_path, index=False)
# Build the message from the path and the data actually written so it cannot
# drift from what is on disk.
print(f"Excel sheet '{output_path}' has been generated with records for {df['Emplid'].nunique()} students.")

Excel sheet 'student_records_96_credits.xlsx' has been generated with records for 100 students.
