In [3]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingRegressor
from skopt import BayesSearchCV
from joblib import dump

warnings.filterwarnings("ignore")

# Load training dataset
df = pd.read_csv("../train_dataset.csv")

# Drop irrelevant columns
df = df.drop(columns=['Student ID', 'Mentor-1', 'Mentor-2', 'Mentor-3', 'Roll-2', 'Roll-3',
                      'DE Theory', 'DE Practical', 'FSD Theory', 'FSD Practical',
                      'Python Theory', 'Python Practical', 'Communication Theory', 'Law Theory'])

# Feature Engineering
sem1_columns = ['Math-1 Theory', 'Physics Theory', 'Java-1 Theory', 'Software Engineering Theory']
sem2_columns = ['Math-2 Theory', 'Data Structures using Java Theory', 'DBMS Theory',
                'Fundamental of Electronics and Electrical Theory', 'Java-2 Theory']

df['Sem 1 Percentage'] = df[sem1_columns].mean(axis=1).round(2)
df['Sem 2 Percentage'] = df[sem2_columns].mean(axis=1).round(2)

# Rename and extract section letters
df = df.rename(columns={'Div-1': 'Section-1', 'Div-2': 'Section-2', 'Div-3': 'Section-3'})
for section in ['Section-1', 'Section-2', 'Section-3']:
    df[section] = df[section].astype(str).str[0]

# Prepare features and target
target_col = 'Math-3 Theory'
X = df.drop(columns=[target_col])
y = df[target_col]

categorical_cols = ['Gender', 'Religion', 'Branch', 'Section-1', 'Section-2', 'Section-3']
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# Preprocessing
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols),
    ('num', RobustScaler(), numeric_cols)
])

# Base pipelines for tuning
ridge_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', Ridge())])
lasso_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', Lasso(max_iter=10000))])
elastic_pipeline = Pipeline([('preprocessor', preprocessor), ('regressor', ElasticNet(max_iter=10000))])

# Cross-validation strategy
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=42)

# Search spaces
param_space = {
    'ridge': {'regressor__alpha': (1e-3, 1e3, 'log-uniform')},
    'lasso': {'regressor__alpha': (1e-3, 1e3, 'log-uniform')},
    'elastic': {
        'regressor__alpha': (1e-3, 1e3, 'log-uniform'),
        'regressor__l1_ratio': (0.05, 1.0, 'uniform')
    }
}

# Hyperparameter search
ridge_search = BayesSearchCV(ridge_pipeline, param_space['ridge'], n_iter=50, cv=kf,
                             scoring='neg_mean_absolute_error', random_state=42)
ridge_search.fit(X, y)

lasso_search = BayesSearchCV(lasso_pipeline, param_space['lasso'], n_iter=50, cv=kf,
                             scoring='neg_mean_absolute_error', random_state=42)
lasso_search.fit(X, y)

elastic_search = BayesSearchCV(elastic_pipeline, param_space['elastic'], n_iter=50, cv=kf,
                               scoring='neg_mean_absolute_error', random_state=42)
elastic_search.fit(X, y)

# Final ensemble regressor
ensemble_model = VotingRegressor([
    ('ridge', ridge_search.best_estimator_['regressor']),
    ('lasso', lasso_search.best_estimator_['regressor']),
    ('elastic', elastic_search.best_estimator_['regressor'])
])

# Final pipeline: preprocess + ensemble
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', ensemble_model)
])

# Fit final pipeline
full_pipeline.fit(X, y)

# Save complete pipeline
dump(full_pipeline, "math3_estimator.joblib")
print("Model training complete and saved as 'math3_estimator.joblib'.")

Model training complete and saved as 'math3_estimator.joblib'.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math3_predictor import predict_math3  # Assuming your logic is in this file

# Load test data
test_df = pd.read_csv("../test_dataset.csv")

# Confirm the target exists
target_col = 'Math-3 Theory'
if target_col not in test_df.columns:
    raise ValueError(f"'{target_col}' column not found in test dataset.")

# Store y_test
y_test = test_df[target_col]

# Get predictions using your black-box wrapper (which handles preprocessing + dropping target)
try:
    y_pred = predict_math3(test_df)  # <-- Pass full test_df, NOT X_test
    print("Prediction completed.")
except Exception as e:
    print(f"Prediction failed: {e}")
    exit()

# Plot Actual vs Predicted
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, label='Predicted vs Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', label='Perfect Prediction')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Ensemble Model Predictions on Test Data')
plt.legend()
plt.grid(True)

# Save the plot
output_filename = "math_model_performance.png"
try:
    plt.savefig(output_filename)
    print(f"Plot saved successfully as '{output_filename}'")
except Exception as e:
    print(f"Error saving plot: {e}")

plt.show()

# Calculate MAE
mae = np.mean(np.abs(y_test - y_pred))
print(f"Test MAE: {mae:.4f}")


Prediction failed: Pipeline is not fitted yet.


NameError: name 'y_pred' is not defined

<Figure size 1000x600 with 0 Axes>

: 