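# Exam Score Prediction

Predict students' `exam_score` from study habits and related attributes, comparing linear models with tree-based regressors.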

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
!pip install kagglehub[pandas-datasets]

In [None]:
# Install kagglehub together with its pandas-datasets extra into the kernel's environment.
%pip install kagglehub[pandas-datasets]

In [None]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Point to the specific file inside the dataset
file_path = "Exam_Score_Prediction.csv"

# `dataset_load` is the current name of this API; the older `load_dataset`
# alias is deprecated in kagglehub and emits a DeprecationWarning.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "kundanbedmutha/exam-score-prediction-dataset",
    file_path,
)

# Quick sanity check of (rows, columns); the row preview is shown by the
# next cell, so it is not printed a second time here.
df.shape

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

In [None]:
# Column names, non-null counts and dtypes: a quick check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Text (object-dtype) columns are the ones that still need encoding.
categorical_cols = df.select_dtypes(include=['object']).columns

# For each of them, show the distinct labels followed by a separator line.
separator = "-" * 30
for column_name in categorical_cols:
    print(
        f"Unique values in '{column_name}':",
        df[column_name].unique(),
        separator,
        sep="\n",
    )

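## Ordinal Encoding for Ranked Data

Ranked categories are mapped to integers that preserve their order: 1–3 for the three-level scales (`sleep_quality`, `facility_rating`, `exam_difficulty`) and 0/1 for the binary `internet_access` flag.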

In [None]:
# Integer codes for the ranked (ordinal) columns; a larger code means a higher
# level on that column's scale. `internet_access` is binary, hence 0/1.
ordinal_mapping = {
    'sleep_quality': {'poor': 1, 'average': 2, 'good': 3},
    'facility_rating': {'low': 1, 'medium': 2, 'high': 3},
    'exam_difficulty': {'easy': 1, 'moderate': 2, 'hard': 3},
    'internet_access': {'no': 0, 'yes': 1}
}

# Apply the mapping in place.
for col, val_map in ordinal_mapping.items():
    # Idempotence guard: on a second run the column already holds integers,
    # none of which are keys of val_map, so .map() would turn the whole
    # column into NaN. Skip columns that are already encoded.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    encoded = df[col].map(val_map)

    # A label that is missing from val_map silently becomes NaN; report it
    # instead of letting it leak into the models.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in '{col}': {list(unmapped)}")

    df[col] = encoded

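## One-Hot Encoding for Nominal Data

`gender`, `course` and `study_method` have no natural order, so each one is expanded into indicator columns (the first level of each is dropped).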

In [None]:
# One-hot encode the remaining nominal (unordered) text columns.
# - drop_first=True drops one level per column, so the dummies of a column
#   are not perfectly collinear.
# - dtype=int makes the indicators real 0/1 integers on every pandas version
#   (pandas >= 2.0 would otherwise produce True/False columns).
df_final = pd.get_dummies(
    df,
    columns=['gender', 'course', 'study_method'],
    drop_first=True,
    dtype=int,
)

# Shape after encoding; the row preview is rendered by the next cell.
df_final.shape

In [None]:
# Preview the encoded frame, including the new dummy columns.
df_final.head()

## Distribution of the Target (`exam_score`)

In [None]:
# seaborn is used for the plots below; %pip installs it into the kernel's environment.
%pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the target with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df_final['exam_score'], kde=True, color='teal', ax=ax)
ax.set_title('Distribution of Exam Scores')
plt.show()

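## Correlation Heatmap

How to read the (Pearson) correlation of each feature with `exam_score`:

- **Close to 1:** as the feature goes up, the score goes up (e.g., study hours).
- **Close to -1:** as the feature goes up, the score goes down (e.g., exam difficulty).
- **Close to 0:** the feature has almost no *linear* relationship with the score (a non-linear effect is still possible).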

In [None]:
# Pairwise correlation matrix of all columns of the encoded frame.
corr = df_final.corr()

# Keep only the column for the target and order it from the most positive
# to the most negative correlation.
target_corr = corr[['exam_score']].sort_values(by='exam_score', ascending=False)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(target_corr, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation of All Features with Exam Score')
plt.show()

### Feature Selection for Linear Regression

Based on the correlation heatmap, I decided to focus only on high-impact features and to ignore "noise" variables whose correlation with the score is close to zero (between $-0.063$ and $0.045$).

**Features kept:**

- `study_hours` (strongest predictor: 0.72)
- `class_attendance`
- `sleep_hours` and `sleep_quality`
- `facility_rating`
- `study_method_self-study`

## Study Hours vs. Score (with Regression Line)

In [None]:
# Scatter of study hours against the score with a fitted regression line;
# points are made translucent so dense regions remain readable.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(
    x='study_hours',
    y='exam_score',
    data=df_final,
    scatter_kws=dict(alpha=0.3),
    line_kws=dict(color='red'),
    ax=ax,
)
ax.set_title('Relationship: Study Hours vs. Exam Score')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Define the target variable
y = df_final['exam_score']

# --- VERSION 1: High-Impact Subset (For Linear/Ridge/Lasso) ---
linear_features = ['study_hours', 'class_attendance', 'sleep_hours',
                   'sleep_quality', 'facility_rating', 'study_method_self-study']
X_linear = df_final[linear_features]

# --- VERSION 2: All Features (For Decision Tree/Random Forest) ---
# The target and the row identifier are not predictors, so both are dropped.
X_forest = df_final.drop(['exam_score', 'student_id'], axis=1)

# Split both feature sets and the target in ONE call. train_test_split applies
# the same row selection to every array it is given, so the subset split, the
# full-feature split and y are aligned by construction, rather than relying on
# two separate calls that happen to share test_size and random_state.
(X_train_lin, X_test_lin,
 X_train_rf, X_test_rf,
 y_train, y_test) = train_test_split(
    X_linear, X_forest, y, test_size=0.2, random_state=42
)

# Cheap invariant: both feature sets must describe the same rows as the target.
assert X_train_lin.index.equals(X_train_rf.index) and X_train_rf.index.equals(y_train.index)
assert X_test_lin.index.equals(X_test_rf.index) and X_test_rf.index.equals(y_test.index)

print("Data splits created successfully!")

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

# Dictionary of models with different characteristics
# Keys are display names; the "(Subset)" / "(Full)" suffix records which
# feature set the model is meant to be trained on.
models = {
    "Linear Regression (Subset)": LinearRegression(),  # ordinary least squares, no regularisation
    "Ridge (Subset)": Ridge(alpha=1.0),  # alpha = strength of the L2 penalty
    "Lasso (Subset)": Lasso(alpha=0.1),  # alpha = strength of the L1 penalty
    "Decision Tree (Full)": DecisionTreeRegressor(max_depth=5, random_state=42),  # depth capped at 5; seeded
    "Random Forest (Full)": RandomForestRegressor(n_estimators=100, random_state=42)  # 100 trees; seeded
}

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import pandas as pd

# `models` (display name -> unfitted estimator) is defined in the previous
# cell. It is deliberately not redefined here, so the model list lives in one
# place only.

results = []

for name, model in models.items():
    # The display name encodes which feature set the model uses:
    # "(Subset)" -> high-impact features, anything else -> all features.
    if "(Subset)" in name:
        X_train_current, X_test_current = X_train_lin, X_test_lin
    else:
        X_train_current, X_test_current = X_train_rf, X_test_rf

    # 5-fold cross-validation on the training split only; cross_val_score
    # works on clones, so `model` itself is still unfitted afterwards.
    cv_scores = cross_val_score(model, X_train_current, y_train, cv=5, scoring='r2')

    # Fit on the whole training split, then score on the held-out test split
    # (.score() of a regressor is R2).
    model.fit(X_train_current, y_train)
    test_score = model.score(X_test_current, y_test)

    results.append({
        "Model": name,
        "CV R2 Mean": cv_scores.mean(),
        "CV R2 Std": cv_scores.std(),  # spread across folds, to judge stability
        "Test R2": test_score
    })

# Build the summary table once from the list of records and show it with the
# notebook's rich DataFrame rendering.
results_df = pd.DataFrame(results).sort_values(by="Test R2", ascending=False)
results_df

In [None]:
# Pick the winner by cross-validated R2, not by test R2. Choosing the model
# that happens to score best on the test set turns the test set into a tuning
# set and makes the reported test score optimistically biased. The test R2 is
# still printed, but only as the held-out estimate for the chosen model.
best_model_entry = max(results, key=lambda entry: entry['CV R2 Mean'])

print("---  BEST MODEL PERFORMANCE ---")
print(f"Model Name: {best_model_entry['Model']}")
print(f"Test R2 Score: {best_model_entry['Test R2']:.4f}")
print(f"Cross-Validation R2: {best_model_entry['CV R2 Mean']:.4f}")

## Hyperparameter Tuning

To improve the Random Forest, we combine hyperparameter tuning with cross-validation (grid search).

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Search space: every combination of these values is evaluated
# (3 * 4 * 3 * 3 = 108 candidates).
param_grid = dict(
    n_estimators=[100, 300, 500],           # number of trees in the forest
    max_depth=[None, 10, 20, 30],           # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],           # minimum samples required to split a node
    max_features=['sqrt', 'log2', None],    # features considered at each split
)

# Seeded base estimator, so every candidate is trained reproducibly.
base_forest = RandomForestRegressor(random_state=42)

# Exhaustive search with 5-fold cross-validation per candidate, scored by R2.
# n_jobs=-1 runs the fits on all CPU cores; verbose=2 logs each fit.
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Tune on the full-feature training split only; the test split stays untouched.
grid_search.fit(X_train_rf, y_train)

# Winning configuration and its mean cross-validated score.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV R2 Score: {grid_search.best_score_:.4f}")

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Estimator carrying the best hyperparameters found by the grid search.
best_rf = grid_search.best_estimator_

# Held-out evaluation: predict the test split once and derive both metrics
# from the same predictions.
final_pred = best_rf.predict(X_test_rf)
tuned_r2 = r2_score(y_test, final_pred)
tuned_mae = mean_absolute_error(y_test, final_pred)

print(f"Final Tuned Test R2: {tuned_r2:.4f}")
print(f"Final Tuned MAE: {tuned_mae:.2f}")