# Exam Score Prediction - Regression Analysis

This notebook implements multiple regression models to predict exam scores based on student data. The models used include:
1. Linear Regression
2. K-Nearest Neighbors Regressor (KNN)
3. Decision Tree Regressor
4. Random Forest Regressor

We will use simple, beginner-friendly methods.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Global plotting defaults for every figure drawn later in the notebook.
# The stylesheet is applied first and the seaborn palette second, so the
# "husl" colours are the ones left active afterwards.
plt.style.use("seaborn-v0_8")
sns.set_palette(palette="husl")

## 2. Load and Explore Data

In [None]:
# Read the raw dataset from a CSV file located relative to the notebook's
# working directory.
df = pd.read_csv('Exam_Score_Prediction.csv')

print("First 5 rows:")
display(df.head())

print("\nData Info:")
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()

## 3. Data Preprocessing (Simple)

In [None]:
# Drop student_id first (if it exists): it is not useful for prediction, so
# there is no point imputing or encoding it before throwing it away.
if 'student_id' in df.columns:
    df = df.drop('student_id', axis=1)

# 1. Handle Missing Values (if any)
# Numeric columns are filled with their mean, every other column with its
# most frequent value (mode). The check is "is this column numeric?" rather
# than "is the dtype exactly object?", so text columns stored with a
# dedicated string or categorical dtype are not sent to .mean(), which
# would raise on non-numeric data.
# NOTE(review): these fill values are computed on the full dataset, i.e.
# before the train/test split performed later in the notebook, so test rows
# influence them. The target column is imputed as well. Confirm this is
# acceptable for the analysis.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        df[col] = df[col].fillna(df[col].mode().iloc[0])

# 2. Encode Categorical Variables
# LabelEncoder maps each distinct value of a column to an integer code.
# fit_transform() refits the encoder on every call, so reusing one instance
# across columns is safe; after the loop `le` only remembers the last column.
le = LabelEncoder()
categorical_cols = [
    col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])
]
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])
    print(f"Encoded {col}")

print("\nProcessed Data:")
display(df.head())

## 4. Train-Test Split

In [None]:
# Separate the prediction target from the feature columns.
y = df['exam_score']
X = df.drop(columns='exam_score')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features (optional, but helpful for KNN). The scaler is
# fitted on the training rows only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)

## 5. Model Implementation

In [None]:
# Collects one entry per model (model name -> R2 score on the test set);
# the model cells below fill it in and the comparison plot reads it.
results = dict()

### 5.1 Linear Regression

In [None]:
# Fit a linear regression on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
model_lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Evaluate on the held-out test split: RMSE is the square root of the MSE.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Only the R2 score is kept for the later model comparison.
results['Linear Regression'] = r2_lr

print("Linear Regression R2 Score:", r2_lr)
print("Linear Regression RMSE:", rmse_lr)

### 5.2 K-Nearest Neighbors (KNN)

In [None]:
# Fit a k-nearest-neighbours regressor (k = 5) on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
model_knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Evaluate on the held-out test split: RMSE is the square root of the MSE.
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
r2_knn = r2_score(y_test, y_pred_knn)

# Only the R2 score is kept for the later model comparison.
results['KNN'] = r2_knn

print("KNN R2 Score:", r2_knn)
print("KNN RMSE:", rmse_knn)

### 5.3 Decision Tree Regression

In [None]:
# Fit a decision tree on the training split; the fixed random_state makes
# the fitted tree reproducible. fit() returns the estimator itself.
model_dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# Evaluate on the held-out test split: RMSE is the square root of the MSE.
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Only the R2 score is kept for the later model comparison.
results['Decision Tree'] = r2_dt

print("Decision Tree R2 Score:", r2_dt)
print("Decision Tree RMSE:", rmse_dt)

### 5.4 Random Forest Regression

In [None]:
# Fit a random forest of 100 trees on the training split; the fixed
# random_state makes the ensemble reproducible. fit() returns the estimator.
model_rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Evaluate on the held-out test split: RMSE is the square root of the MSE.
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Only the R2 score is kept for the later model comparison.
results['Random Forest'] = r2_rf

print("Random Forest R2 Score:", r2_rf)
print("Random Forest RMSE:", rmse_rf)

## 6. Model Comparison

In [None]:
# Horizontal bar chart of the R2 score recorded for each model in `results`.
models = list(results.keys())
scores = list(results.values())

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 5))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated
# for removal. Mapping the model name to `hue` keeps one colour per bar, and
# legend=False suppresses the redundant legend (the y axis already names the
# models). NOTE: the `legend` argument requires seaborn >= 0.13.
sns.barplot(
    x=scores,
    y=models,
    hue=models,
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_xlabel('R2 Score')
ax.set_title('Model comparison')
plt.show()