In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Step 1: Load data
df = pd.read_csv("data.csv")

# Drop unnecessary columns
columns_to_drop = ['Learning_Disabilities', 'Gender', 'School_Type']
df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)


# Label encode categorical features
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Step 2: Add many synthetic "high-quality" cases
synthetic_rows = []
for hours in range(5, 13):  # Hours_Studied from 5 to 12
    for i in range(15):     # 15 samples per hour level
        score = 70 + (hours - 5) * 4 + np.random.randint(-2, 3)
        row = {
            'Hours_Studied': hours,
            'Attendance': np.random.randint(88, 100),
            'Parental_Involvement': np.random.randint(7, 10),
            'Access_to_Resources': np.random.randint(7, 10),
            'Extracurricular_Activities': np.random.randint(5, 10),
            'Sleep_Hours': np.random.randint(6, 9),
            'Previous_Scores': 90,
            'Motivation_Level': np.random.randint(7, 10),
            'Internet_Access': label_encoders['Internet_Access'].transform(['Yes'])[0],
            'Tutoring_Sessions': np.random.randint(2, 5),
            'Family_Income': np.random.randint(60, 100),
            'Teacher_Quality': np.random.randint(7, 10),
            'Peer_Influence': label_encoders['Peer_Influence'].transform(['Positive'])[0],
            'Physical_Activity': np.random.randint(4, 8),
            'Parental_Education_Level': label_encoders['Parental_Education_Level'].transform(['College'])[0],
            'Distance_from_Home': np.random.randint(1, 8),
            'Exam_Score': min(score, 100)
        }
        synthetic_rows.append(row)

synthetic_df = pd.DataFrame(synthetic_rows)
df = pd.concat([df, synthetic_df], ignore_index=True)

# Step 3: Define features and target
target = 'Exam_Score'
features = df.columns.drop(target)

X = df[features].fillna(df[features].median())
y = df[target]

# Step 4: Scaling and train-test split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Step 5: Model training
model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"✅ MAE: {mae:.2f}")
print(f"✅ R² score: {r2:.2f}")

# Step 7: Save artifacts
joblib.dump(model, "exam_score_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Model, scaler, and encoders saved.")


✅ MAE: 1.37
✅ R² score: 0.89
✅ Model, scaler, and encoders saved.
