In [12]:
import json
import pandas as pd

with open('../data/course_nature_training.json', 'r') as f:
    assessment_data = json.load(f)

# convert JSON to structured DataFrame
label_rows = []
for code, values in assessment_data.items():
    row = {
        "code": code,
        "name": values["name"],
        "Conceptual": values["CoPr"]["Conceptual"],
        "Practical": values["CoPr"]["Practical"],
        "Analytical": values["AnCr"]["Analytical"],
        "Creative": values["AnCr"]["Creative"],
        "Individual": values["InCo"]["Individual"],
        "Collaborative": values["InCo"]["Collaborative"],
        "Rigid": values["RiFl"]["Rigid"],
        "Flexible": values["RiFl"]["Flexible"],
        "Quantitative": values["NtLi"]["Quantitative"],
        "Qualitative": values["NtLi"]["Qualitative"],
    }
    label_rows.append(row)

labels_df = pd.DataFrame(label_rows)

print(labels_df.head())
print("Labels shape:", labels_df.shape)

       code                                  name  Conceptual  Practical  \
0  CSCA08H3    Introduction to Computer Science I          40         60   
1  CSCA20H3           Introduction to Programming          20         80   
2  CSCA67H3                  Discrete Mathematics          70         30   
3  MATA31H3  Calculus I for Mathematical Sciences          90         10   
4  PHYA10H3   Physics I for the Physical Sciences          80         20   

   Analytical  Creative  Individual  Collaborative  Rigid  Flexible  \
0          80        20          90             10     80        20   
1          80        20          60             40     50        50   
2          95         5          80             20     70        30   
3          95         5          80             20     80        20   
4          90        10          60             40     70        30   

   Quantitative  Qualitative  
0            80           20  
1            70           30  
2            90        

In [13]:
with open('../data/course_info_test.json', 'r') as f:
    courses_info = json.load(f)

df = pd.DataFrame(courses_info)

# only keep courses that exist in training data
df = df[df['code'].isin(labels_df['code'])]

# extract text features (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = vectorizer.fit_transform(df['description']).toarray()

print("X shape:", X.shape)

X shape: (20, 404)


In [14]:
y = labels_df.drop(columns=["code", "name"]).values
print("y shape:", y.shape)

y shape: (20, 10)


In [15]:
from sklearn.model_selection import train_test_split

# split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train Set:", X_train.shape, y_train.shape)
print("Test Set:", X_test.shape, y_test.shape)

Train Set: (16, 404) (16, 10)
Test Set: (4, 404) (4, 10)


In [16]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LogisticRegression

model = MultiOutputRegressor(LogisticRegression(max_iter=1000))

# train model
model.fit(X_train, y_train)  

# predict
y_pred = model.predict(X_test)

print("Model trained successfully")

Model trained successfully


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# compute errors
mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
r2 = r2_score(y_test, y_pred, multioutput='raw_values')

# results per aspect
aspects = ["Conceptual", "Practical", "Analytical", "Creative", "Individual", "Collaborative", "Rigid", "Flexible", "Quantitative", "Qualitative"]
for i, aspect in enumerate(aspects):
    print(f"{aspect}: MAE={mae[i]:.2f}, MSE={mse[i]:.2f}, R²={r2[i]:.2f}")

# overall performance
print("\nOverall Model Performance:")
print(f"Mean MAE: {np.mean(mae):.2f}")
print(f"Mean MSE: {np.mean(mse):.2f}")
print(f"Mean R²: {np.mean(r2):.2f}")

Conceptual: MAE=30.00, MSE=1050.00, R²=-6.00
Practical: MAE=30.00, MSE=1050.00, R²=-6.00
Analytical: MAE=12.50, MSE=275.00, R²=-3.00
Creative: MAE=12.50, MSE=275.00, R²=-3.00
Individual: MAE=15.00, MSE=450.00, R²=-0.29
Collaborative: MAE=15.00, MSE=450.00, R²=-0.29
Rigid: MAE=22.50, MSE=625.00, R²=-0.96
Flexible: MAE=22.50, MSE=625.00, R²=-0.96
Quantitative: MAE=30.00, MSE=1250.00, R²=-2.57
Qualitative: MAE=30.00, MSE=1250.00, R²=-2.57

Overall Model Performance:
Mean MAE: 22.00
Mean MSE: 730.00
Mean R²: -2.56


In [18]:
import pandas as pd

# convert predictions & actual values into DataFrames
pred_df = pd.DataFrame(y_pred, columns=aspects)
true_df = pd.DataFrame(y_test, columns=aspects)

# recover test indices from df
X_test_indices = df.index[len(y_train):]  # Get last test set indices

# get course codes from `df`
test_codes = df.loc[X_test_indices, 'code'].reset_index(drop=True)

# add course codes to predictions
comparison = pd.concat([test_codes, true_df, pred_df], axis=1, keys=["Course Code", "Actual", "Predicted"])

for index, row in comparison.iterrows():
    print(f"\n--- Course: {row['Course Code']} ---")
    
    for aspect in aspects:
        actual_value = row["Actual"][aspect]
        predicted_value = row["Predicted"][aspect]
        
        print(f"{aspect:<15} | Actual: {actual_value:<3} | Predicted: {predicted_value:<3}")


--- Course: code    CSCC01H3
Name: 0, dtype: object ---
Conceptual      | Actual: 40  | Predicted: 70 
Practical       | Actual: 60  | Predicted: 30 
Analytical      | Actual: 80  | Predicted: 90 
Creative        | Actual: 20  | Predicted: 10 
Individual      | Actual: 90  | Predicted: 50 
Collaborative   | Actual: 10  | Predicted: 50 
Rigid           | Actual: 80  | Predicted: 60 
Flexible        | Actual: 20  | Predicted: 40 
Quantitative    | Actual: 80  | Predicted: 90 
Qualitative     | Actual: 20  | Predicted: 10 

--- Course: code    EESC30H3
Name: 1, dtype: object ---
Conceptual      | Actual: 50  | Predicted: 70 
Practical       | Actual: 50  | Predicted: 30 
Analytical      | Actual: 70  | Predicted: 70 
Creative        | Actual: 30  | Predicted: 30 
Individual      | Actual: 40  | Predicted: 50 
Collaborative   | Actual: 60  | Predicted: 50 
Rigid           | Actual: 50  | Predicted: 70 
Flexible        | Actual: 50  | Predicted: 30 
Quantitative    | Actual: 60  | Predicte

Flexible        | Actual: 50  | Predicted: 40 
Quantitative    | Actual: 70  | Predicted: 90 
Qualitative     | Actual: 30  | Predicted: 10 


In [19]:
# ------ START OF ACTUAL ASSESSMENT ------

import json
import pandas as pd

# load new course data
with open('../data/course_info_all.json', 'r') as f:
    new_courses_data = json.load(f)

# convert to DataFrame
new_courses_df = pd.DataFrame(new_courses_data)

print(new_courses_df.head())

       code                                 name  \
0  MDSC22H3               Understanding Scandals   
1  MGEB11H3  Quantitative Methods in Economics I   
2  MATD01H3                    Fields and Groups   
3  MATC01H3                  Groups and Symmetry   
4  MATD93H3                  Mathematics Project   

                                         description  \
0  This course focuses on modern-day scandals, ra...   
1  An introduction to probability and statistics ...   
2  Abstract group theory: Sylow theorems, groups ...   
3  Congruences and fields. Permutations and permu...   
4  A significant project in any area of mathemati...   

                                             prereqs  \
0  [Enrolment in the Major program in Media and C...   
1  [MGEA02H3 and MGEA06H3 and MATA34H3] or [MGEA0...   
2                                           MATC01H3   
3   [MATA36H3 or MATA37H3] and [MATB24H3 or MAT224H]   
4  [1.5 credits at the C-level in MAT courses] an...   

             

In [20]:
# use same vectorizer from training to transform new descriptions
X_new = vectorizer.transform(new_courses_df['description']).toarray()

# check the shape of transformed data
print("X_new shape:", X_new.shape)

X_new shape: (1867, 404)


In [21]:
# predict assessment aspects for new courses
y_new_pred = model.predict(X_new)

# convert predictions to DataFrame
predicted_aspects_df = pd.DataFrame(y_new_pred, columns=aspects)

# combine predictions with course information
results_df = pd.concat([new_courses_df[['code', 'name', 'description']], predicted_aspects_df], axis=1)

print(results_df.head())

       code                                 name  \
0  MDSC22H3               Understanding Scandals   
1  MGEB11H3  Quantitative Methods in Economics I   
2  MATD01H3                    Fields and Groups   
3  MATC01H3                  Groups and Symmetry   
4  MATD93H3                  Mathematics Project   

                                         description  Conceptual  Practical  \
0  This course focuses on modern-day scandals, ra...          70         30   
1  An introduction to probability and statistics ...          70         30   
2  Abstract group theory: Sylow theorems, groups ...          70         30   
3  Congruences and fields. Permutations and permu...          70         30   
4  A significant project in any area of mathemati...          70         30   

   Analytical  Creative  Individual  Collaborative  Rigid  Flexible  \
0          70        30          50             50     40        60   
1          90        10          50             50     70        30   

In [22]:
# convert results DataFrame to dictionary
results_dict = results_df.to_dict(orient='records')

# save results to JSON file
with open('../data/course_nature_results.json', 'w') as f:
    json.dump(results_dict, f, indent=4)

print("Results saved to ../data/course_nature_results.json")

Results saved to ../data/course_nature_results.json
