Import libraries

In [28]:
import pandas as pd
import glob
import os

Adding metadata and loading datasets

In [29]:
def parse_source_filename(filename):
    """Split a raw-data file name into ``(campus_name, dataset_type)``.

    File names are expected to look like ``<Campus>_<dataset>.csv``
    (e.g. ``Huye_assessments.csv`` -> ``("Huye", "assessments")``).

    Parameters
    ----------
    filename : str
        Base name of the CSV file (no directory part).

    Returns
    -------
    tuple[str, str]
        The text before the first underscore, and the text between the first
        underscore and the next underscore/dot. When the name contains no
        underscore, the extension-less name is returned as the campus and the
        dataset type is ``"Unknown"`` instead of raising ``IndexError``.
    """
    campus_name, separator, remainder = filename.partition("_")
    if not separator:
        return os.path.splitext(filename)[0], "Unknown"
    return campus_name, remainder.split("_")[0].split(".")[0]


# Campus name -> campus identifier. Built once, outside the loop, because it
# does not depend on the file being processed.
campus_map = {
    "Kigali": "KIG-25",
    "Huye": "HUY-26",
    "Musanze": "MUS-27"
}

# glob returns paths in arbitrary order; sort them so the dataset order is
# reproducible across machines and runs.
files = sorted(glob.glob("../raw_data/*.csv"))
print("Found files:", files)

all_datasets = []
for file in files:
    df = pd.read_csv(file)

    filename = os.path.basename(file)
    campus_name, dataset_type = parse_source_filename(filename)

    # Tag every row with where it came from so the frames can be told apart
    # (or combined) later on.
    df["Source_Campus_File"] = filename
    df["Campus_Name"] = campus_name
    # Campuses missing from the map are labelled "Unknown" rather than failing.
    df["Campus_ID"] = campus_map.get(campus_name, "Unknown")
    df["Dataset_Type"] = dataset_type

    all_datasets.append(df)

print("Datasets loaded:", len(all_datasets))

Found files: ['../raw_data\\Huye_assessments.csv', '../raw_data\\Huye_courses.csv', '../raw_data\\Huye_students.csv', '../raw_data\\Kigali_assessments.csv', '../raw_data\\Kigali_courses.csv', '../raw_data\\Kigali_students.csv', '../raw_data\\Musanze_assessments.csv', '../raw_data\\Musanze_courses.csv', '../raw_data\\Musanze_students.csv']
Datasets loaded: 9


Quick Data Check (Profiling)

In [None]:
# Full profiling for all datasets
# Cap printed output at 100 rows / 20 columns so the summaries below are not
# truncated at the pandas defaults.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)


def describe_source(df):
    """Return ``(dataset_type, campus_name)`` for a tagged dataset.

    The values are read from the first row of the ``Dataset_Type`` and
    ``Campus_Name`` metadata columns. A frame with no rows (CSV with a header
    only) has no first row, so ``("Unknown", "Unknown")`` is returned instead
    of letting ``.iloc[0]`` raise ``IndexError``.
    """
    if df.empty:
        return "Unknown", "Unknown"
    return df['Dataset_Type'].iloc[0], df['Campus_Name'].iloc[0]


def profile_dataset(df, position, total):
    """Print a profiling report for one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset carrying the ``Dataset_Type`` and ``Campus_Name`` columns.
    position : int
        1-based index of this dataset, used in the report header.
    total : int
        Total number of datasets, used in the report header.

    The report covers shape, column dtypes, missing values per column, a
    ``describe()`` summary of numeric columns, and the number of unique values
    of each object/string column.
    """
    dataset_name, campus_name = describe_source(df)

    print("="*80)
    print(f"Dataset {position}/{total}: {dataset_name} | Campus: {campus_name}")
    print("-"*80)

    # Shape and column types
    print(f"Shape: {df.shape}\n")
    print("Column types:")
    print(df.dtypes, "\n")

    # Missing values
    print("Missing values per column:")
    print(df.isnull().sum(), "\n")

    # Numeric summary
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        print("Numeric summary:")
        print(df[numeric_cols].describe(), "\n")

    # Categorical summary
    cat_cols = df.select_dtypes(include=['object', 'string']).columns
    if len(cat_cols) > 0:
        print("Categorical summary:")
        for col in cat_cols:
            print(f"{col}: {df[col].nunique()} unique values")

    print("\n\n")


# Load CSV files (sorted: glob returns paths in arbitrary order, which would
# make the "Dataset i/N" numbering differ between machines).
files = sorted(glob.glob("../raw_data/*.csv"))  # adjust path if needed
print(f"Found {len(files)} files:", files, "\n")

all_datasets = []

campus_map = {
    "Kigali": "KIG-25",
    "Huye": "HUY-26",
    "Musanze": "MUS-27"
}

for file in files:
    df = pd.read_csv(file)

    # Expected name layout: <Campus>_<dataset>.csv
    filename = os.path.basename(file)
    campus_name, separator, remainder = filename.partition("_")
    if separator:
        dataset_type = remainder.split("_")[0].split(".")[0]
    else:
        # No underscore in the name: label the dataset type as unknown
        # rather than failing with IndexError.
        campus_name, dataset_type = os.path.splitext(filename)[0], "Unknown"

    # Add metadata
    df["Source_Campus_File"] = filename
    df["Campus_Name"] = campus_name
    df["Campus_ID"] = campus_map.get(campus_name, "Unknown")
    df["Dataset_Type"] = dataset_type

    all_datasets.append(df)

print(f"Loaded {len(all_datasets)} datasets successfully!\n")

# ----------------------------
# Profile each dataset
# ----------------------------
for i, df in enumerate(all_datasets, 1):
    profile_dataset(df, i, len(all_datasets))

Found 9 files: ['../raw_data\\Huye_assessments.csv', '../raw_data\\Huye_courses.csv', '../raw_data\\Huye_students.csv', '../raw_data\\Kigali_assessments.csv', '../raw_data\\Kigali_courses.csv', '../raw_data\\Kigali_students.csv', '../raw_data\\Musanze_assessments.csv', '../raw_data\\Musanze_courses.csv', '../raw_data\\Musanze_students.csv'] 

Loaded 9 datasets successfully!

Dataset 1/9: assessments | Campus: Huye
--------------------------------------------------------------------------------
Shape: (50, 12)

Column types:
Student_ID                str
Course_Code               str
Assessment_Type           str
Mark                  float64
Assessment_Date           str
Academic_Year             str
Semester                int64
Attendance_Rate       float64
Source_Campus_File        str
Campus_Name               str
Campus_ID                 str
Dataset_Type              str
dtype: object 

Missing values per column:
Student_ID             0
Course_Code            0
Assessment_Type  