#### IMPORTS

In [173]:
from faker import Faker
import pandas as pd
import random
import re
from datetime import date, timedelta, datetime



# Seed both random sources so that Restart Kernel -> Run All regenerates the
# same synthetic data instead of a different dataset on every run.
# (Dates derived from date.today() further down still move with the run date.)
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Shared Faker instance used by every generation cell below.
fake = Faker()

def clean_name(name):
    """Return ``name`` with any leading honorific titles removed.

    Recognised titles (case-insensitive): Dr., Mr., Mrs., Ms., Mx., Prof. and
    the period-less "Miss". Stacked titles such as "Prof. Dr." are removed in
    one pass. A title is only stripped when it is followed by whitespace, so
    first names that merely start with the same letters ("Missy", "Drew") are
    left untouched.

    Args:
        name: Full name string, possibly starting with one or more titles.

    Returns:
        The name without the leading title(s); unchanged if none is present.
    """
    # "Miss" has no trailing period, so it needs its own alternative.
    return re.sub(
        r'^(?:(?:Dr\.|Mr\.|Mrs\.|Ms\.|Mx\.|Prof\.|Miss)\s+)+',
        '',
        name,
        flags=re.IGNORECASE,
    )

### Mentors

In [174]:
# Record lists; each is converted to a DataFrame and written to CSV below.
academic_mentor = []
industry_mentor = []

universities = [
    "University of Dundee",
    "University of Edinburgh",
    "University of Glasgow"
]
# E-mail domain used for mentors of each university.
uni_domain = {"University of Dundee" :"dundee.ac.uk",
    "University of Edinburgh":"edinburgh.ac.uk",
    "University of Glasgow":"glasgow.ac.uk"}
employers = [
    "Tata Consultancy Services",
    "Amazon Development Centre Scotland",
    "JP Morgan Chase & Co.",
    "Barclays Technology Centre",
    "Skyscanner",
    "Deloitte UK",
    "IBM UK Ltd.",
    "BAE Systems Applied Intelligence",
    "Morgan Stanley",
    "Accenture Scotland"
]

# Academic mentors: e-mail is "<first>.<last>@<university domain>".
for i in range(20):
    name = fake.name()
    cleaned = clean_name(name)
    email_name = cleaned.lower().replace(" ",".")
    university = random.choice(universities)
    domain = uni_domain[university]
    academic_mentor.append({
        "academic_mentor_id" : f'AM{i+1:03d}',
        # Store the title-free name, matching industry mentors and students
        # (previously the raw Faker name, title included, was stored here).
        "name": cleaned,
        "email":f'{email_name}@{domain}',
        "university": university
    })

# Industry mentors.
# NOTE(review): the e-mail comes from fake.company_email(), so it is unrelated
# to the mentor's name and employer - confirm this is acceptable.
for i in range(20):
    name = fake.name()
    cleaned = clean_name(name)
    industry_mentor.append({
        "industry_mentor_id" : f'IM{i+1:03d}',
        "name" : cleaned,
        "email" : fake.company_email(),
        "employer" : random.choice(employers)
    })

academic_mentor_df = pd.DataFrame(academic_mentor)
industry_mentor_df = pd.DataFrame(industry_mentor)
print("academic_mentor columns",academic_mentor_df.columns)

academic_mentor_df.to_csv("academic_mentor.csv",index = False)
industry_mentor_df.to_csv("industry_mentor.csv",index = False)

print("CSV file saved: academic_mentor.csv and industry_mentor.csv ")

academic_mentor columns Index(['academic_mentor_id', 'name', 'email', 'university'], dtype='object')
CSV file saved: academic_mentor.csv and industry_mentor.csv 


### Creating Student Profiles

### Students

In [175]:
#students.csv
# Accumulator for student records; filled by the generation cell that follows.
students = []

# Year of study a student can currently be in (1 through 4).
years = list(range(1, 5))

# Course titles a student can be assigned to.
courses = [
    "Applied Data Science", "Software Engineering Principles",
    "Computational Modelling and Programming", "Human-Centred Computing",
    "Cybersecurity and Ethical Hacking", "Professional Project Management",
    "Research Methods in Computing", "Interactive Media Design",
    "Big Data Analysis and Visualisation", "Artificial Intelligence and Machine Learning",
    "Business Intelligence and Strategy", "Information Systems Management",
]

In [176]:
# Fixed term start and end dates (every student shares the same 12-week term)
term_start_date = date(2025, 4, 1)
term_end_date = term_start_date + timedelta(weeks=12)

# Start from an empty list inside this cell so that re-running it replaces the
# students instead of appending another 100 rows with duplicate student IDs.
students = []

for i in range(100):  # no of students 100
    name = fake.name()
    cleaned = clean_name(name)
    # E-mail local part: lower-cased name with spaces replaced by dots.
    students_email = cleaned.lower().replace(" ", ".")

    students.append({
        "student_id": f"S{i+1:03d}",
        "name": cleaned,
        "email": f'{students_email}@gmail.com',
        "course": random.choice(courses),
        "current_year": random.choice(years),
        # Each student is linked to one randomly chosen mentor of each kind.
        "academic_mentor_id": random.choice(academic_mentor)["academic_mentor_id"],
        "industry_mentor_id": random.choice(industry_mentor)["industry_mentor_id"],
        "term_start_date": term_start_date,
        "term_end_date": term_end_date
    })


### Converting to DataFrames

In [177]:
# Turn the list of student record dicts into a table (one row per student).
students_df = pd.DataFrame(data=students)

### Exporting the .csv

In [178]:

# Write the student table to disk, leaving out the DataFrame index column.
students_csv_path = "students.csv"
students_df.to_csv(students_csv_path, index=False)
print("CSV file saved: students.csv")

CSV file saved: students.csv


### Assignment

In [179]:
# One record per student is appended here by the loop that follows.
assignment = []

def get_grade(submission_delays):
    """Map the number of days an assignment was late to a grade label.

    0 days -> "A1", 1 -> "A2", 2 -> "B1", 3 -> "B2", 4 -> "C1", 5 -> "C2";
    any other value yields "Fail".
    """
    # Position in the tuple is the number of days late that earns that grade.
    grade_ladder = ("A1", "A2", "B1", "B2", "C1", "C2")
    for days_late, grade in enumerate(grade_ladder):
        if submission_delays == days_late:
            return grade
    return "Fail"

# Build one assignment record per student: the deadline is one week before the
# end of term, the hand-in is 0-6 days after the deadline, and the grade is
# derived from that delay via get_grade().
for i, student in enumerate(students):
    assignment_id = f'AS{i+1:03d}'
    student_id = student["student_id"]

    # Read the term end straight from the student record instead of filtering
    # students_df on every iteration. A separate local name is used so the
    # module-level `term_end_date` is not overwritten with a Timestamp.
    student_term_end = pd.to_datetime(student["term_end_date"])

    submission_date = student_term_end - pd.Timedelta(weeks=1)

    # Days late: 0 means on time, 6 maps to "Fail" in get_grade().
    submission_delays = random.randint(0, 6)

    submitted_date = submission_date + pd.Timedelta(days=submission_delays)

    assignment.append({
        "assignment_id": assignment_id,
        "student_id": student_id,
        "submission_date": submission_date.date(),   # the deadline
        "submitted_date": submitted_date.date(),     # the actual hand-in date
        "assignment_grades": get_grade(submission_delays)
    })

assignment_df = pd.DataFrame(assignment)
assignment_df.to_csv("assignment.csv", index=False)
print("assignment.csv ")


assignment.csv 


### Academic progress


In [180]:
academic_progress = []

# Not used in this cell; kept so any code that references the name still finds it.
attendance = []

# Grade lookup built once, instead of filtering assignment_df per student.
# drop_duplicates keeps the first row per student, matching the previous
# `.iloc[0]` behaviour.
grade_by_student = (
    assignment_df.drop_duplicates("student_id")
    .set_index("student_id")["assignment_grades"]
    .to_dict()
)

# Iterate over the students that actually exist rather than a hard-coded 100,
# so changing the number of generated students cannot desynchronise this table.
for i, student_id in enumerate(students_df["student_id"]):
    assignment_grade = grade_by_student[student_id]

    no_of_courses = random.randint(1,3)
    classes_per_course = random.randint(8,15)

    total_required_classes = no_of_courses*classes_per_course
    # Skewed attendance: never below 40% of the required classes, most
    # students around 85%.
    classes_attended = round(random.triangular(
        low=total_required_classes * 0.4,
        high=total_required_classes,
        mode=total_required_classes * 0.85,
    ))

    attendance_percentage = (classes_attended/total_required_classes)*100

    academic_progress.append({
        "academic_progress_id" : f'AP{i+1:03d}',
        "student_id": student_id,
        "number_of_courses":no_of_courses,
        # NOTE(review): this column holds the attendance percentage although
        # it is named "academic_percentage" - confirm the intended name.
        "academic_percentage": attendance_percentage,
        "assignment": assignment_grade,
        "total_required_attendance":total_required_classes,
        "classes_attended":classes_attended,
    })

academic_progress_df = pd.DataFrame(academic_progress)
academic_progress_df.to_csv("academic_progress.csv",index=False)
print("CSV file saved: academic_progress.csv")

CSV file saved: academic_progress.csv


### Industry Log

In [181]:
industry_progress = []

# One industry log row per existing student (no hard-coded student count).
for i, student_id in enumerate(students_df["student_id"]):
    industry_progress.append({
         "industry_id": f'IL{i+1:03d}',
         "student_id" : student_id,
         # NOTE(review): 60-150 looks high for hours *per week* (a week has
         # 168 hours) - confirm the intended unit or range.
         "hours_per_week" : random.randint(60,150),
         "no_of_tasks"  : random.randint(5,20)
    })

industry_progress_df = pd.DataFrame(industry_progress)
industry_progress_df.to_csv("industry_progress.csv",index=False)
    

### Meetings

In [182]:
def generate_random_date():
    """Return a random meeting date string, or None about 20% of the time.

    When a date is produced it lies between 180 days ago and today and is
    formatted as YYYY-MM-DD.
    """
    has_date = random.random() >= 0.2
    if not has_date:
        return None
    window_start = date.today() - timedelta(days=180)
    picked = fake.date_between(start_date=window_start, end_date='today')
    return picked.strftime("%Y-%m-%d")


meetings = []

# (column prefix, meeting-id prefix, probability that no meeting is recorded)
meeting_kinds = (
    ("academic", "AMEET", 0.3),
    ("industry", "IMEET", 0.4),
)

# One row per existing student (no hard-coded student count). Both meeting
# kinds share the same logic, so they are driven from `meeting_kinds` instead
# of two copy-pasted branches.
for number, student_id in enumerate(students_df["student_id"], start=1):
    student_meetings = {"student_id": student_id}

    for kind, id_prefix, skip_probability in meeting_kinds:
        if random.random() > skip_probability:
            student_meetings[f"{kind}_meeting_id"] = f'{id_prefix}{number:03d}'
            # May still be None: generate_random_date() itself can return None.
            student_meetings[f"{kind}_meeting_date"] = generate_random_date()
        else:
            student_meetings[f"{kind}_meeting_id"] = None
            student_meetings[f"{kind}_meeting_date"] = None

    meetings.append(student_meetings)

# Replace missing values with explicit placeholders before export.
meetings_df = pd.DataFrame(meetings).fillna({
    "academic_meeting_id": "No Meeting",
    "academic_meeting_date": "NULL",
    "industry_meeting_id": "No Meeting",
    "industry_meeting_date": "NULL"
})

meetings_df.to_csv("meetings.csv", index=False)

### Request Log

In [183]:
requests = []

# Catalogue of mitigation request reasons, grouped by the category they apply to.
# Only "reason" and "category" are copied into the generated records below.
request_types = [
    # Attendance requests
    {"code": "EXC001", "reason": "Medical Emergency", "category": "attendance"},
    {"code": "EXC002", "reason": "Family Emergency", "category": "attendance"},
    {"code": "EXC003", "reason": "Corporate Overwork / High Workload", "category": "attendance"},
    {"code": "EXC004", "reason": "Approved Leave (Vacation, Wedding, etc.)", "category": "attendance"},
    {"code": "EXC007", "reason": "Mental Health Leave / Counselling", "category": "attendance"},
    {"code": "EXC010", "reason": "Visa or Legal Paperwork Delays", "category": "attendance"},

    # Assignment request
    {"code": "EXC001", "reason": "Medical Emergency", "category": "assignment"},
    {"code": "EXC002", "reason": "Family Emergency", "category": "assignment"},
    {"code": "EXC003", "reason": "Corporate Overwork / High Workload", "category": "assignment"},
    {"code": "EXC008", "reason": "Miscommunication or Delay in Assignment Communication", "category": "assignment"},
    {"code": "EXC009", "reason": "Extension Approved by Academic Mentor", "category": "assignment"},

    # Meeting request
    {"code": "EXC006", "reason": "Mentor Unavailability / Meeting Canceled by Mentor", "category": "meeting"},
    {"code": "EXC001", "reason": "Medical Emergency", "category": "meeting"},
    {"code": "EXC002", "reason": "Family Emergency", "category": "meeting"},
    {"code": "EXC003", "reason": "Corporate Overwork / High Workload", "category": "meeting"},
]

# Running number used to build sequential request IDs across all students.
request_id_counter = 1

student_terms = zip(
    students_df["student_id"],
    students_df["term_start_date"],
    students_df["term_end_date"],
)

# Each student files between 0 and 4 requests, each dated within their term.
for student_id, raw_term_start, raw_term_end in student_terms:
    term_start = pd.to_datetime(raw_term_start)
    term_end = pd.to_datetime(raw_term_end)

    num_requests = random.randint(0, 4)
    for _request_number in range(num_requests):
        chosen = random.choice(request_types)
        request_date = fake.date_between(start_date=term_start, end_date=term_end)

        record = {
            "request_id": f"RQ{request_id_counter:03d}",
            "student_id": student_id,
            "reason": chosen["reason"],
            "category": chosen["category"],
            "request_date": request_date
        }
        requests.append(record)
        request_id_counter += 1


request_df = pd.DataFrame(requests)
request_df.to_csv("request.csv", index=False)
print("request.csv generated ")


request.csv generated 


### Merged dataset

In [184]:
# Join the per-student tables onto the student table (one row per student).
merged_df = (
    students_df
    .merge(assignment_df, on="student_id", how="left")
    .merge(academic_progress_df, on="student_id", how="left")
    .merge(industry_progress_df, on="student_id", how="left")
    .merge(meetings_df, on="student_id", how="left")
)

# Collapse the request log to one row per student: comma-joined IDs, reasons
# and categories plus a request count. Named aggregation ties every output
# column to its source explicitly instead of relying on positional renaming.
request_grouped = (
    request_df.groupby("student_id")
    .agg(
        request_id=("request_id", lambda ids: ', '.join(ids.astype(str))),
        num_mitigation_requests=("request_id", "count"),
        reason=("reason", lambda reasons: ', '.join(reasons)),
        category=("category", lambda categories: ', '.join(categories)),
    )
    .reset_index()
)

merged_df = pd.merge(merged_df, request_grouped, on="student_id", how="left")

# Prefix the mentor columns so they do not clash with the student's own
# "name" / "email" columns after the merge.
academic_mentor_df = academic_mentor_df.rename(columns={
    "name": "academic_mentor_name",
    "email": "academic_mentor_email",
    "university": "academic_mentor_university"
})

merged_df = pd.merge(merged_df, academic_mentor_df, on="academic_mentor_id", how="left")

industry_mentor_df = industry_mentor_df.rename(columns={
    "name": "industry_mentor_name",
    "email": "industry_mentor_email",
    "employer": "industry_mentor_employer"
})

merged_df = pd.merge(merged_df, industry_mentor_df, on="industry_mentor_id", how="left")

# Students without requests have NaN after the left merge; fill placeholders.
# The meeting placeholders repeat what the meetings cell already applies and
# are kept as a safety net.
merged_df = merged_df.fillna({
    "request_id": "None",
    "reason": "None",
    "category": "None",
    "num_mitigation_requests": 0,
    "academic_meeting_id": "No Meeting",
    "academic_meeting_date": "NULL",
    "industry_meeting_id": "No Meeting",
    "industry_meeting_date": "NULL"
})

# The left merge turns the count into float (NaN for students with no
# requests); cast back so the CSV contains 2 rather than 2.0.
merged_df = merged_df.astype({"num_mitigation_requests": int})

# Final column selection and order for the master file.
merged_df = merged_df[[
    "student_id", "name", "email", "course", "current_year", 
    "term_start_date", "term_end_date", 
    "academic_mentor_id", "academic_mentor_name", "academic_mentor_email", "academic_mentor_university",
    "industry_mentor_id", "industry_mentor_name", "industry_mentor_email", "industry_mentor_employer",
    "academic_progress_id", "number_of_courses", "total_required_attendance", "classes_attended",
    "assignment_id", "submission_date", "submitted_date" , "assignment_grades",
    "industry_id", "hours_per_week", "no_of_tasks",
    "academic_meeting_id", "academic_meeting_date",
    "industry_meeting_id", "industry_meeting_date",
    "request_id", "num_mitigation_requests", "reason", "category"
]]

merged_df.to_csv("master_mmms.csv", index=False)
print("Final master_mmms.csv saved successfully")


Final master_mmms.csv saved successfully
