In [1]:
#########################################################################
#Title  : Dummy Data Generation to train Machine Learning Models
#Author : Kevin Ryan Noronha

#Editors: Disha Khurana - Changed the distribution of marks assigned to each student




##########################################################################

In [2]:
from random import choice, sample, seed, uniform

import numpy as np
import pandas as pd

# Subject dictionary: {code: (title, credits)}
subjects = {
    "COMM2749": ("Digital Media Studio 1: Production", 24),
    "COMM2755": ("Digital Video", 12),
    "COMM2301": ("Media Cultures 1", 12),
    "GRAP2588": ("Digital Media Studio 2: Collaboration", 24),
    "COMM2598": ("Sound Design for Digital Media", 12),
    "COMM2745": ("Emerging Digital Cultures", 12),
    "COMM2747": ("Digital Media Studio 3: Experimentation", 24),
    "OART1013": ("Interactive Media 1", 12),
    "COMM2753": ("Digital Media Specialisation 1", 12),
    "COMM2591": ("Digital Media Studio 4: Client Solutions", 24),
    "COMM2595": ("Digital Narrative Theory and Practice", 12),
    "COMM2751": ("Digital Media Specialisation 2", 12),
    "COMM2589": ("Digital Media Studio 5: Specialisation Project", 24),
    "COMM2587": ("Digital Media Professional Practice", 12),
    "COMM2585": ("Digital Media Major Project", 24),
    "COMM2583": ("Digital Media Internship", 24),
}

# Sem 1 2023 subject list
sem1_subjects = ["COMM2747", "OART1013", "COMM2753", "COMM2591", "COMM2595", "COMM2751"]
# Remaining subjects for Sem 2 2023 and Sem 1 2024
remaining_subjects = ["COMM2749", "COMM2755", "COMM2301", "GRAP2588", "COMM2598", "COMM2745",
                     "COMM2747", "OART1013", "COMM2753", "COMM2591", "COMM2595", "COMM2751"]

# Function to assign marks based on specified distribution
def assign_mark():
    r = uniform(0, 1)
    if r < 0.15:      # ↑ 15% below 50 (was 5%)
        return np.random.randint(0, 50)
    elif r < 0.50:    # ↑ 35% get 50–59 (was 25%)
        return np.random.randint(50, 60)
    elif r < 0.75:    # ↓ 25% get 60–69 (was 45%)
        return np.random.randint(60, 70)
    elif r < 0.90:    # ↓ 15% get 70–79 (was 20%)
        return np.random.randint(70, 80)
    else:             # = 10% get 80–100 (was 5%)
        return np.random.randint(80, 100)
    
# Function to determine grade based on mark
def get_grade(mark):
    if mark < 50:
        return "NN"
    elif mark < 60:
        return "PA"
    elif mark < 70:
        return "CR"
    elif mark < 80:
        return "DI"
    else:
        return "HD"

# Generate data for 3000 students
data = []
for student_id in range(1001, 4001):  # Emplid from 1001 to 4000
    name = f"Student {student_id - 1000}"
    taken_subjects = set()

    # Sem 1 2023: 48 credits, 2-3 subjects
    if np.random.random() < 0.5:  # 50% chance for two 24-credit subjects
        sem1 = ["COMM2747", "COMM2591"]
    else:  # One 24-credit and two 12-credit subjects
        sem1_24 = choice(["COMM2747", "COMM2591"])
        sem1_12 = sample([s for s in sem1_subjects if subjects[s][1] == 12], 2)
        sem1 = [sem1_24] + sem1_12
    taken_subjects.update(sem1)

    # Determine if student is in 95% or 5% group
    is_95_percent = np.random.random() < 0.95

    if is_95_percent:
        # Sem 2 2023: COMM2589, COMM2587, and one 12-credit subject
        sem2 = ["COMM2589", "COMM2587"]
        available = [s for s in remaining_subjects if s not in taken_subjects and subjects[s][1] == 12]
        sem2.append(choice(available) if available else "COMM2755")
        taken_subjects.update(sem2)

        # Sem 1 2024: COMM2585 or COMM2583, and two 12-credit subjects
        sem3 = [choice(["COMM2585", "COMM2583"])]
        available = [s for s in remaining_subjects if s not in taken_subjects and subjects[s][1] == 12]
        sem3.extend(sample(available, 2) if len(available) >= 2 else ["COMM2755", "COMM2301"])
    else:
        # Sem 2 2023: COMM2585 or COMM2583, and two 12-credit subjects
        sem2 = [choice(["COMM2585", "COMM2583"])]
        available = [s for s in remaining_subjects if s not in taken_subjects and subjects[s][1] == 12]
        sem2.extend(sample(available, 2) if len(available) >= 2 else ["COMM2755", "COMM2301"])
        taken_subjects.update(sem2)

        # Sem 1 2024: COMM2589, COMM2587, and one 12-credit subject
        sem3 = ["COMM2589", "COMM2587"]
        available = [s for s in remaining_subjects if s not in taken_subjects and subjects[s][1] == 12]
        sem3.append(choice(available) if available else "COMM2755")

    # Combine all semester subjects
    terms = [("Sem 1 2023", sem1), ("Sem 2 2023", sem2), ("Sem 1 2024", sem3)]
    student_records = []
    for term, courses in terms:
        for course in courses:
            mark = assign_mark()
            grade = get_grade(mark)
            unit_value = subjects[course][1]
            credits = unit_value if mark >= 50 else 0
            student_records.append({
                "Emplid": student_id,
                "Name": name,
                "Career": "Undergraduate",
                "Acad Program": "BP309",
                "Program Descr": "Bachelor of Design (Digital Media)",
                "Admit Term": "Sem 1 2023",
                "Acad Plan": "BP309",
                "Plan Descr": "Bachelor of Design (Digital Media)",
                "Term Descr": term,
                "Course": course,
                "Course Descr": subjects[course][0],
                "Mark": mark,
                "Grade": grade,
                "Unit Value": unit_value,
                "Credits": credits
            })

    # Calculate Program Status
    total_credits = sum(r["Credits"] for r in student_records)
    program_status = "Completed" if total_credits == 144 else "Pending"
    for record in student_records:
        record["Program Status"] = program_status
        data.append(record)

# Create DataFrame and save to Excel
df = pd.DataFrame(data)
df.to_excel("student_records.xlsx", index=False)
print("Excel sheet 'student_records.xlsx' has been generated with records for 3000 students.")

Excel sheet 'student_records.xlsx' has been generated with records for 3000 students.
