In [22]:
# Pin NumPy to the 1.26 series and install scikit-learn, both from conda-forge.
# The explicit %conda magic is used instead of the bare `conda ...` form, which
# only works through IPython automagic (single-line cells, automagic enabled).
%conda install numpy=1.26 scikit-learn -c conda-forge

Retrieving notices: done
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - numpy=1.26
    - scikit-learn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.10.5  |       hbd8a1cb_0         152 KB  conda-forge
    certifi-2025.10.5          |     pyhd8ed1ab_0         156 KB  conda-forge
    openssl-3.5.4              |       h5503f6c_0         2.9 MB  conda-forge
    scikit-learn-1.7.2         |  py312h79e0ffc_0         8.5 MB  conda-forge
    ------------------------------------------------------------
                                           Total:        11.7 MB

The following packages will be UPDATED:

  ca-certificates                       2025.8.3-hbd8a1cb_0 --> 2025.10.5-hbd8a1cb_0 
  ce

In [3]:
# Reinstall NumPy and scikit-learn with the %pip magic so the packages land in
# the environment of the running kernel; `!pip` runs whichever pip is first on
# the shell PATH, which may belong to a different Python installation.
%pip uninstall numpy scikit-learn -y
%pip install numpy scikit-learn

Found existing installation: numpy 2.2.6
Uninstalling numpy-2.2.6:
  Successfully uninstalled numpy-2.2.6
Found existing installation: scikit-learn 1.7.2
Uninstalling scikit-learn-1.7.2:
  Successfully uninstalled scikit-learn-1.7.2
Collecting numpy
  Downloading numpy-2.3.4-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting numpy
  Using cached numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m[31m5.8 MB/s[0m eta [36m0:00:01[0m
[?25hUsing cached numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
Installing collected packages: numpy, scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are inst

In [1]:
# Keep NumPy on the 1.x series. Explicit %pip (rather than the bare `pip ...`
# automagic form) keeps the cell valid even when it holds more than one line.
%pip install "numpy<2"

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle
import json

# Seed both NumPy's global RNG and the stdlib RNG: the generation code below
# draws from each of them, so both must be fixed for reproducible data.
np.random.seed(42)
random.seed(42)

# Sizes of the synthetic population.
N_STUDENTS = 500
N_COURSES = 20

# === Step 1: Generate Synthetic Dataset ===
print("Generating synthetic dataset...")

# Students: sequential ids, a random branch, and a CGPA drawn uniformly from
# [5, 10) and rounded to two decimals.
students = pd.DataFrame({
    'student_id': [f"S{i:04d}" for i in range(1, N_STUDENTS + 1)],
    'branch_code': np.random.choice(['CSE', 'ECE', 'ME', 'CE'], N_STUDENTS),
    'cgpa': np.round(np.random.uniform(5, 10, N_STUDENTS), 2)
})

# Interests: each student gets 1-3 distinct domains, stored one row per
# (student, domain) pair with interest_level fixed at 1.
domains = ['AI', 'Web', 'DBMS', 'ML', 'Networks']
student_interests = []
interests_by_student = {}  # student_id -> list of domains, reused by the grade loop
for sid in students['student_id']:
    interests = random.sample(domains, np.random.randint(1, 4))
    interests_by_student[sid] = interests
    for d in interests:
        student_interests.append([sid, d, 1])

interests_df = pd.DataFrame(student_interests, columns=['student_id', 'domain', 'interest_level'])

# Courses: sequential ids, one random domain tag and a difficulty in 1..5.
courses = pd.DataFrame({
    'course_id': [f"C{i:02d}" for i in range(1, N_COURSES + 1)],
    'domain_tags': np.random.choice(domains, N_COURSES),
    'difficulty_level': np.random.randint(1, 6, N_COURSES)
})

# Grades: every student takes 5-9 distinct courses. A grade is drawn from a
# normal distribution centred on the student's CGPA (+1 when the course domain
# is one of the student's interests) with unit spread, then clipped to [5, 10].
# Lookups go through plain dicts built once instead of re-filtering the
# DataFrames for every student / record; the order of RNG calls is unchanged,
# so the generated data is the same.
domain_by_course = dict(zip(courses['course_id'], courses['domain_tags']))
grades_records = []
for sid, student_cgpa in zip(students['student_id'], students['cgpa']):
    taken_courses = np.random.choice(courses['course_id'], np.random.randint(5, 10), replace=False)
    student_tags = interests_by_student[sid]

    for c in taken_courses:
        course_domain = domain_by_course[c]
        interest_bonus = 1 if course_domain in student_tags else 0
        grade = np.clip(np.random.normal(student_cgpa + interest_bonus, 1), 5, 10)
        grades_records.append([sid, c, round(grade, 2)])

student_course_grades = pd.DataFrame(grades_records, columns=['student_id', 'course_id', 'grade'])

# Save datasets (written to the current working directory)
student_course_grades.to_csv("student_course_data.csv", index=False)
students.to_csv("students.csv", index=False)
courses.to_csv("courses.csv", index=False)
interests_df.to_csv("interests.csv", index=False)
print(f"✓ Dataset saved: {len(student_course_grades)} grade records")

# === Step 2: Preprocess / Feature Encoding ===
print("\nPreprocessing data...")

# One-hot encode the categorical columns: branch_code -> branch_*, domain_tags -> domain_*.
students_encoded = pd.get_dummies(students, prefix='branch', columns=['branch_code'])
courses_encoded = pd.get_dummies(courses, prefix='domain', columns=['domain_tags'])

# Attach the student attributes, then the course attributes, to every grade record.
data = (
    student_course_grades
    .merge(students_encoded, on='student_id')
    .merge(courses_encoded, on='course_id')
)

# Add one interest_<domain> column per domain: the student's interest_level
# where an interest row exists, 0 where the left merge found none.
for domain in domains:
    interest_col = f'interest_{domain}'
    in_domain = interests_df['domain'] == domain
    student_interest_domain = (
        interests_df.loc[in_domain, ['student_id', 'interest_level']]
        .rename(columns={'interest_level': interest_col})
    )
    data = data.merge(student_interest_domain, how='left', on='student_id')
    data[interest_col] = data[interest_col].fillna(0)

# student_id, course_id and grade are not features, hence the -3.
n_features = data.shape[1] - 3
print(f"✓ Features created: {n_features} features")

# === Step 3: Train Feature-based Model ===
print("\nTraining model...")

# Collect the one-hot / interest columns by prefix so the feature list follows
# whatever categories the encoding step actually produced.
all_columns = list(data.columns)
branch_cols = [name for name in all_columns if name.startswith('branch_')]
domain_cols = [name for name in all_columns if name.startswith('domain_')]
interest_cols = [name for name in all_columns if name.startswith('interest_')]

feature_cols = ['cgpa', 'difficulty_level', *branch_cols, *domain_cols, *interest_cols]

# Feature matrix and regression target.
X = data[feature_cols]
y = data['grade']

# Hold out 20% of the grade records for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest; n_jobs=-1 is passed through to scikit-learn.
feature_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
feature_model.fit(X_train, y_train)

# === Step 4: Evaluate Model ===
print("\nModel Evaluation:")
print("-" * 50)

# Metrics on the data the model was fitted on.
train_pred = feature_model.predict(X_train)
train_rmse, train_mae, train_r2 = (
    np.sqrt(mean_squared_error(y_train, train_pred)),
    mean_absolute_error(y_train, train_pred),
    r2_score(y_train, train_pred),
)

print("Training Set:")
print(f"  RMSE: {train_rmse:.4f}\n  MAE:  {train_mae:.4f}\n  R²:   {train_r2:.4f}")

# Same metrics on the held-out split.
test_pred = feature_model.predict(X_test)
test_rmse, test_mae, test_r2 = (
    np.sqrt(mean_squared_error(y_test, test_pred)),
    mean_absolute_error(y_test, test_pred),
    r2_score(y_test, test_pred),
)

print("\nTest Set:")
print(f"  RMSE: {test_rmse:.4f}\n  MAE:  {test_mae:.4f}\n  R²:   {test_r2:.4f}")

# Pair each feature name with the forest's importance score, highest first.
feature_importance = pd.DataFrame(
    {'feature': feature_cols, 'importance': feature_model.feature_importances_}
).sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print("-" * 50)
top_features = feature_importance.head()
for feature_name, score in zip(top_features['feature'], top_features['importance']):
    print(f"  {feature_name:<25} {score:.4f}")

# === Step 5: Save Model and Metadata ===
print("\nSaving model and metadata...")

# Save model.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# feature_model.pkl from a source you trust.
with open("feature_model.pkl", "wb") as f:
    pickle.dump(feature_model, f)

# Save feature names (in training column order) for inference
with open("feature_columns.json", "w") as f:
    json.dump(feature_cols, f)

# Save metadata. The model type and tree count are read from the fitted model
# itself so they cannot drift from the constructor call used for training.
metadata = {
    "model_type": type(feature_model).__name__,
    "n_estimators": int(feature_model.n_estimators),
    "features": feature_cols,
    "training_samples": len(X_train),
    "test_samples": len(X_test),
    # Cast to built-in float so the values are JSON-serialisable.
    "test_rmse": float(test_rmse),
    "test_mae": float(test_mae),
    "test_r2": float(test_r2),
    "domains": domains
}

with open("model_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

print("✓ Model saved as feature_model.pkl")
print("✓ Feature columns saved as feature_columns.json")
print("✓ Metadata saved as model_metadata.json")

# === Step 6: Example Prediction Function ===
def predict_grade(student_id, course_id):
    """
    Predict the grade for a student-course pair.

    Looks the student and the course up in the notebook-level ``students``,
    ``courses`` and ``interests_df`` frames, builds a single-row feature frame
    with the columns listed in ``feature_cols`` (in that order) and runs it
    through ``feature_model``.

    Parameters
    ----------
    student_id : value matched against ``students['student_id']``.
    course_id : value matched against ``courses['course_id']``.

    Returns
    -------
    The first element of ``feature_model.predict(...)`` for that row.

    Raises
    ------
    IndexError
        If ``student_id`` or ``course_id`` is not present in the lookup tables.
    """
    # Fail with a message naming the missing id instead of pandas' generic
    # "single positional indexer is out-of-bounds" from .iloc[0].
    student_rows = students[students['student_id'] == student_id]
    if student_rows.empty:
        raise IndexError(f"Unknown student_id: {student_id!r}")
    course_rows = courses[courses['course_id'] == course_id]
    if course_rows.empty:
        raise IndexError(f"Unknown course_id: {course_id!r}")

    student_info = student_rows.iloc[0]
    course_info = course_rows.iloc[0]
    student_interests_list = interests_df[interests_df['student_id'] == student_id]['domain'].tolist()

    # Only the "hot" entries are set explicitly; every other one-hot column is
    # filled with 0 by the reindex below.
    features = {
        'cgpa': student_info['cgpa'],
        'difficulty_level': course_info['difficulty_level'],
        f"branch_{student_info['branch_code']}": 1,
        f"domain_{course_info['domain_tags']}": 1,
    }
    for domain in student_interests_list:
        features[f'interest_{domain}'] = 1

    # Align to the training columns: order follows feature_cols, columns the
    # model was trained on but not set above become 0, and entries the model
    # never saw are dropped. This avoids hard-coding the branch/domain lists.
    feature_vector = pd.DataFrame([features]).reindex(columns=feature_cols, fill_value=0)

    # Predict
    predicted_grade = feature_model.predict(feature_vector)[0]

    return predicted_grade

# Demo: predict the grade of the first student for the first course.
banner = "=" * 50
print("\n" + banner)
print("Example Prediction:")
print(banner)
example_student = students['student_id'].iloc[0]
example_course = courses['course_id'].iloc[0]
predicted = predict_grade(example_student, example_course)
print(f"Student: {example_student}")
print(f"Course:  {example_course}")
print(f"Predicted Grade: {predicted:.2f}")

# Show the recorded grade as well, but only when this student actually has a
# grade record for this course.
is_example_pair = (data['student_id'] == example_student) & (data['course_id'] == example_course)
actual = data[is_example_pair]
if len(actual) > 0:
    print(f"Actual Grade:    {actual['grade'].iloc[0]:.2f}")

print("\n" + banner)
print("Pipeline completed successfully!")
print(banner)

Generating synthetic dataset...
✓ Dataset saved: 356 grade records

Preprocessing data...
✓ Features created: 16 features

Training model...

Model Evaluation:
--------------------------------------------------
Training Set:
  RMSE: 0.4366
  MAE:  0.3351
  R²:   0.9314

Test Set:
  RMSE: 1.0402
  MAE:  0.8425
  R²:   0.6687

Top 5 Most Important Features:
--------------------------------------------------
  cgpa                      0.7826
  difficulty_level          0.0538
  domain_Networks           0.0235
  domain_ML                 0.0196
  domain_Web                0.0140

Saving model and metadata...
✓ Model saved as feature_model.pkl
✓ Feature columns saved as feature_columns.json
✓ Metadata saved as model_metadata.json

Example Prediction:
Student: S001
Course:  C01
Predicted Grade: 9.17

Pipeline completed successfully!
