# Welcome to Colab!

NAME : OMAID BIN EJAZ
ROLL NO : 22F-BSAI-45
STUDENT DATA SET

In [None]:
# student_performance_predictor.py
# ------------------------------------------------------------
# Student Performance Predictor
#
#  1. Load the UCI student-mat and student-por datasets and outer-merge them so students present in both files are not duplicated
#  2. Select features: studytime, absences, G1, G2, and target G3
#  3. Create pass/fail label (>=10 pass)
#  4. Preprocess (train/test split + scaling)
#  5. Train Linear Regression for G3 prediction
#  6. Train Logistic Regression and Decision Tree for pass/fail
#  7. Evaluate and detect/explain overfitting (concise)
# ------------------------------------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, confusion_matrix

# ---------------------------
# 1) Load both course files and merge them
# ---------------------------
# The CSV files are ';'-separated. Math is read first, then Portuguese.
mat, por = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in ('student-mat.csv', 'student-por.csv')
)
print(f"Loaded: math={mat.shape}, portuguese={por.shape}")

# Outer-join on the attributes used here to identify a student: rows from the
# two files that agree on every key are joined into one row, and rows found in
# only one file are kept. Overlapping non-key columns get a '_mat' / '_por'
# suffix.
merge_cols = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
]
merged = mat.merge(por, how='outer', on=merge_cols, suffixes=('_mat', '_por'))
print(f"Merged shape (unique students): {merged.shape}")

# ---------------------------
# 2) Select relevant columns
# ---------------------------
# A row may come from only one course file, in which case the other course's
# columns are NaN after the outer merge. Prefer the math value and fall back
# to the Portuguese one.
for col in ('studytime', 'absences', 'G1', 'G2', 'G3'):
    merged[col] = merged[f'{col}_mat'].fillna(merged[f'{col}_por'])

# Drop rows still missing any of the five values. The explicit .copy() makes
# `data` an independent frame, so adding the label column below is never
# treated as a write into a slice of `merged`.
data = merged[['studytime', 'absences', 'G1', 'G2', 'G3']].dropna().copy()
print('Final dataset shape after merge:', data.shape)

# ---------------------------
# 3) Create classification label
# ---------------------------
# pass_fail is 1 when the final grade G3 is at least 10, otherwise 0.
data['pass_fail'] = (data['G3'] >= 10).astype(int)

# ---------------------------
# 4) Prepare features and targets, split, and scale
# ---------------------------
X = data[['studytime', 'absences', 'G1', 'G2']]
Y_reg = data['G3']         # regression target
Y_clf = data['pass_fail']  # classification target

# A single call splits the features and both targets together, so the
# regression and classification tasks share exactly the same train/test rows
# by construction instead of relying on two calls with a matching random_state.
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_clf, y_test_clf,
) = train_test_split(X, Y_reg, Y_clf, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# ---------------------------
# 5) Linear Regression: predict G3 (final grade)
# ---------------------------
lin = LinearRegression().fit(X_train_s, y_train_reg)

# Predict on both splits so train and test error can be compared.
y_pred_train_reg = lin.predict(X_train_s)
y_pred_test_reg = lin.predict(X_test_s)

# MAE and R2 per split.
mae_train, r2_train = (
    mean_absolute_error(y_train_reg, y_pred_train_reg),
    r2_score(y_train_reg, y_pred_train_reg),
)
mae_test, r2_test = (
    mean_absolute_error(y_test_reg, y_pred_test_reg),
    r2_score(y_test_reg, y_pred_test_reg),
)

print("\nLinear Regression:")
print(f" Train MAE={mae_train:.2f}, R2={r2_train:.2f}")
print(f" Test MAE={mae_test:.2f}, R2={r2_test:.2f}")

# Heuristic: flag overfitting when train R2 exceeds test R2 by more than 0.1.
r2_gap = r2_train - r2_test
if r2_gap > 0.1:
    print("  Regression overfitting detected (train R2 >> test R2).")
    print("Possible cause: correlated features (G1, G2 ~ G3).")
else:
    print("  Regression shows no strong overfitting.")

# ---------------------------
# 6) Logistic Regression: classify pass/fail
# ---------------------------
log = LogisticRegression(max_iter=1000).fit(X_train_s, y_train_clf)

# Class predictions for both splits.
y_pred_train_log = log.predict(X_train_s)
y_pred_test_log = log.predict(X_test_s)

# Accuracy on both splits; precision and recall on the test split only.
acc_train_log = accuracy_score(y_train_clf, y_pred_train_log)
acc_test_log, prec_test_log, rec_test_log = (
    accuracy_score(y_test_clf, y_pred_test_log),
    precision_score(y_test_clf, y_pred_test_log),
    recall_score(y_test_clf, y_pred_test_log),
)

print("\nLogistic Regression:")
print(f" Train Acc={acc_train_log:.2f} | Test Acc={acc_test_log:.2f}")
print(f" Test Precision={prec_test_log:.2f}, Recall={rec_test_log:.2f}")
cm_test_log = confusion_matrix(y_test_clf, y_pred_test_log)
print(" Confusion Matrix (test):\n", cm_test_log)

# Heuristic: flag overfitting when train accuracy beats test accuracy by
# more than 0.05.
acc_gap_log = acc_train_log - acc_test_log
if acc_gap_log > 0.05:
    print("  Logistic Regression may overfit (train acc higher). Consider regularization or more data.")
else:
    print("  Logistic Regression generalizes well.")

# ---------------------------
# 7) Decision Tree: classify pass/fail
# ---------------------------
# Depth is capped at 4; the tree is fitted on the same scaled features as the
# logistic model.
clf_tree = DecisionTreeClassifier(max_depth=4, random_state=42)
clf_tree.fit(X_train_s, y_train_clf)

# Class predictions for both splits.
y_pred_train_tree = clf_tree.predict(X_train_s)
y_pred_test_tree = clf_tree.predict(X_test_s)

# Accuracy on both splits; precision and recall on the test split only.
acc_train_tree = accuracy_score(y_train_clf, y_pred_train_tree)
acc_test_tree, prec_test_tree, rec_test_tree = (
    accuracy_score(y_test_clf, y_pred_test_tree),
    precision_score(y_test_clf, y_pred_test_tree),
    recall_score(y_test_clf, y_pred_test_tree),
)

print("\nDecision Tree:")
print(f" Train Acc={acc_train_tree:.2f} | Test Acc={acc_test_tree:.2f}")
print(f" Test Precision={prec_test_tree:.2f}, Recall={rec_test_tree:.2f}")
cm_test_tree = confusion_matrix(y_test_clf, y_pred_test_tree)
print(" Confusion Matrix (test):\n", cm_test_tree)

# Heuristic: flag overfitting when train accuracy beats test accuracy by
# more than 0.05.
acc_gap_tree = acc_train_tree - acc_test_tree
if acc_gap_tree > 0.05:
    print("  Decision Tree likely overfitting (train > test). Try smaller max_depth or use ensembles.")
else:
    print("  Decision Tree generalizes reasonably well with max_depth=4.")

# ---------------------------
# 8) Final summary
# ---------------------------
# One line per model, reporting test-split metrics only.
print("\nSummary:")
print(f" - Linear Regression → Test R2={r2_test:.2f}, MAE={mae_test:.2f}")
print(f" - Logistic Regression → Acc={acc_test_log:.2f}, Prec={prec_test_log:.2f}, Recall={rec_test_log:.2f}")
print(f" - Decision Tree → Acc={acc_test_tree:.2f}, Prec={prec_test_tree:.2f}, Recall={rec_test_tree:.2f}")

# Static explanatory notes, printed one per line in this order.
closing_lines = (
    "\nNotes on overfitting:",
    " - Overfitting = model learns noise, not true patterns.",
    " - Signs: much better train metrics than test metrics.",
    " - Causes: correlated inputs (G1,G2), small data, deep trees.",
    " - Fixes: simpler model, regularization, constrain tree depth, use cross-validation, or more data.",
    "\nScript finished.",
)
for text in closing_lines:
    print(text)

Loaded: math=(395, 33), portuguese=(649, 33)
Merged shape (unique students): (682, 54)
Final dataset shape after merge: (682, 5)

Linear Regression:
 Train MAE=0.98, R2=0.84
 Test MAE=1.18, R2=0.82
  Regression shows no strong overfitting.

Logistic Regression:
 Train Acc=0.89 | Test Acc=0.93
 Test Precision=0.92, Recall=0.98
 Confusion Matrix (test):
 [[33  8]
 [ 2 94]]
  Logistic Regression generalizes well.

Decision Tree:
 Train Acc=0.91 | Test Acc=0.94
 Test Precision=0.94, Recall=0.98
 Confusion Matrix (test):
 [[35  6]
 [ 2 94]]
  Decision Tree generalizes reasonably well with max_depth=4.

Summary:
 - Linear Regression → Test R2=0.82, MAE=1.18
 - Logistic Regression → Acc=0.93, Prec=0.92, Recall=0.98
 - Decision Tree → Acc=0.94, Prec=0.94, Recall=0.98

Notes on overfitting:
 - Overfitting = model learns noise, not true patterns.
 - Signs: much better train metrics than test metrics.
 - Causes: correlated inputs (G1,G2), small data, deep trees.
 - Fixes: simpler model, regulariz

In [None]:
# MY COMPARISON ANALYSIS
# Linear Regression accurately predicts final grades (since G1 and G2 are strong predictors of G3).
# Logistic Regression classifies pass/fail reliably without overfitting.
# Decision Tree performs well but overfits if depth isn’t controlled.