# Student Performance Prediction

This notebook trains a **Linear Regression** model to predict the final grade (G3).

In [None]:

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:

# Read the student records from the CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer='../data/student_data.csv')
df.head(5)


In [None]:

# Basic info and missing values check.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print('\nMissing values per column:\n', df.isnull().sum())


In [None]:

# Encode categorical (object-dtype) columns as integer codes.
#
# One LabelEncoder is fitted per column and stored in `encoders` (keyed by
# column name). Refitting a single shared encoder on every column would keep
# only the last column's category -> code mapping, making it impossible to
# invert the codes or to encode new rows the same way later.
#
# NOTE: this cell overwrites the columns of `df` in place. Re-running it without
# reloading `df` finds no object columns and resets `encoders` to an empty dict.
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there are no categorical columns
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
print('Categorical columns encoded:', cat_cols)


In [None]:

# Summary statistics, transposed so that each column of df becomes a row
df.describe().transpose()


In [None]:

# Exploratory Data Analysis - heatmap of the pairwise column correlations
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False)  # no per-cell value annotations
plt.title('Correlation Heatmap')
plt.show()


In [None]:

# Pairwise plots restricted to a handful of columns, including the target G3
pairplot_cols = ['studytime', 'absences', 'G1', 'G2', 'G3']
sns.pairplot(df[pairplot_cols])
plt.show()


In [None]:

# Separate the target column (G3) from the remaining columns used as features
y = df['G3']
X = df.drop(columns='G3')

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Train Linear Regression model on the scaled training features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the scaled test features
y_pred = model.predict(X_test_scaled)

# Evaluate
r2 = r2_score(y_test, y_pred)
# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it raises a TypeError on current releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'R² Score: {r2:.4f}')
print(f'RMSE: {rmse:.4f}')


In [None]:

# Scatter of predicted against actual G3 values, with a diagonal reference
# line drawn from (0, 0) to (20, 20)
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred)
plt.plot([0, 20], [0, 20])
plt.title('Actual vs Predicted Final Grade (G3)')
plt.xlabel('Actual G3')
plt.ylabel('Predicted G3')
plt.show()


In [None]:

# Persist the fitted model and scaler with joblib, one directory above the
# current working directory
artifacts = (
    (model, '../student_performance_model.pkl'),
    (scaler, '../scaler.pkl'),
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)
print('Model and scaler saved to project root.')
