Library importing

In [196]:
import pandas as pd
import glob
import os

# Folders
cleaned_folder = "cleaned_datasets"
silver_folder = "transformed_datasets"

# -----------------------------
# Load cleaned datasets
# -----------------------------
files = glob.glob(os.path.join(cleaned_folder, "*.csv"))
students_list, courses_list, assessments_list = [], [], []

for file in files:
    df = pd.read_csv(file)
    dtype = df['Dataset_Type'].iloc[0]
    
    if dtype == "students":
        students_list.append(df)
    elif dtype == "courses":
        courses_list.append(df)
    elif dtype == "assessments":
        assessments_list.append(df)

students_df = pd.concat(students_list, ignore_index=True)
courses_df = pd.concat(courses_list, ignore_index=True)
assessments_df = pd.concat(assessments_list, ignore_index=True)

# Load silver dataset
silver_files = glob.glob(os.path.join(silver_folder, "*.csv"))
silver_df = pd.read_csv(silver_files[0])  # assuming only one silver dataset

print("Datasets loaded for integration:")
print("Students:", students_df.shape)
print("Courses:", courses_df.shape)
print("Assessments:", assessments_df.shape)
print("Silver:", silver_df.shape)

Datasets loaded for integration:
Students: (120, 11)
Courses: (12, 7)
Assessments: (143, 12)
Silver: (143, 19)


clean merge keys

In [197]:
# -----------------------------
# Standardize merge keys
# -----------------------------
for df in [students_df, courses_df, assessments_df, silver_df]:
    if 'Student_ID' in df.columns:
        df['Student_ID'] = df['Student_ID'].astype(str).str.strip().str.upper()
    if 'Campus_ID' in df.columns:
        df['Campus_ID'] = df['Campus_ID'].astype(str).str.strip()
    if 'Course_Code' in df.columns:
        df['Course_Code'] = df['Course_Code'].astype(str).str.strip().str.upper().str.replace("-", "")

print("Merge keys cleaned.")

Merge keys cleaned.


Resolve conflicts

In [198]:
# -----------------------------
# Keep latest Intake_Year and resolve Full_Name/Program conflicts
# -----------------------------
students_df = students_df.sort_values('Intake_Year').drop_duplicates(subset=['Student_ID','Campus_ID'], keep='last')

# Resolve Full_Name conflicts
students_df['Full_Name'] = students_df.groupby(['Student_ID','Campus_ID'])['Full_Name'] \
    .transform(lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0])

# Resolve Program conflicts
students_df['Program'] = students_df.groupby(['Student_ID','Campus_ID'])['Program'] \
    .transform(lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0])

print("Duplicate students resolved. Shape:", students_df.shape)

Duplicate students resolved. Shape: (120, 11)


Merge silver dataset with students and courses

In [199]:
# -----------------------------
# Merge silver dataset with students
# -----------------------------
integrated_df = silver_df.merge(
    students_df,
    on=['Student_ID','Campus_ID'],
    how='left',
    suffixes=('_silver','_student')
)

# Merge courses info
integrated_df = integrated_df.merge(
    courses_df,
    on=['Course_Code','Campus_ID'],
    how='left',
    suffixes=('', '_course')
)

print(" integration complete. Integrated dataset shape:", integrated_df.shape)

 integration complete. Integrated dataset shape: (143, 33)


Remove duplicates

In [200]:
gold_df = gold_df.drop_duplicates(
    subset=['Student_ID', 'Course_Code', 'Academic_Year', 'Semester'],
    keep='last'
).reset_index(drop=True)

Save integrated gold dataset

In [201]:
os.makedirs("integrated_datasets", exist_ok=True)
gold_df.to_csv("integrated_datasets/gold_integrated.csv", index=False)
print("Gold integrated dataset saved successfully!")
print("Gold dataset shape:", gold_df.shape)
gold_df.head()

Gold integrated dataset saved successfully!
Gold dataset shape: (143, 33)


Unnamed: 0,Student_ID,Course_Code,Mark,Assessment_Date,Academic_Year,Semester,Attendance_Rate,Source_Campus_File_x,Campus_ID,Dataset_Type_x,...,Level,Intake_Year,Source_Campus_File_y,Campus_Name_x,Dataset_Type_y,Course_Title,Credits,Source_Campus_File,Campus_Name_y,Dataset_Type
0,HUY-26125,CS104,66.5,2023-06-16,2024/2025,1,0.8,Huyeassessmentscsv,HUY-26,assessments,...,1,2024,Huyestudentscsv,Huye,students,Machine Learning,4.0,Huyecoursescsv,Huye,courses
1,HUY-26118,CS103,62.0,2023-01-08,2024/2025,2,0.6,Huyeassessmentscsv,HUY-26,assessments,...,2,2023,Huyestudentscsv,Huye,students,Cyber security,3.0,Huyecoursescsv,Huye,courses
2,HUY-26115,CS101,69.0,2023-01-08,2024/2025,1,0.6,Huyeassessmentscsv,HUY-26,assessments,...,3,2022,Huyestudentscsv,Huye,students,Programming,3.0,Huyecoursescsv,Huye,courses
3,HUY-26107,CS102,66.5,2023-01-08,2024/2025,2,0.8,Huyeassessmentscsv,HUY-26,assessments,...,1,2022,Huyestudentscsv,Huye,students,Databases,4.0,Huyecoursescsv,Huye,courses
4,HUY-26105,CS101,58.0,2023-01-08,2024/2025,2,1.0,Huyeassessmentscsv,HUY-26,assessments,...,3,2024,Huyestudentscsv,Huye,students,Programming,3.0,Huyecoursescsv,Huye,courses
