# Student Risk Prediction using Logistic Regression and Random Forest Classifier

**UCI Student Performance Dataset (student-mat.csv)**

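This notebook trains two binary classifiers (logistic regression and a random forest) to predict whether a student is at risk — defined here as a final grade (G3) below 12 — from demographic, social, and academic features. The intermediate grades G1 and G2 are excluded from the features.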

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score,
    confusion_matrix, 
    classification_report,
    roc_auc_score,
    roc_curve
)
import warnings
# NOTE(review): this silences every warning category for the whole session,
# which can hide deprecation notices from the libraries used below.
warnings.simplefilter('ignore')

# Plot defaults shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

ModuleNotFoundError: No module named 'seaborn'

## 2. Load and Explore Dataset

In [None]:
# Read the semicolon-delimited student-mat CSV into a DataFrame
data_path = '../Student Performance Prediction/Data/student-mat.csv'
df = pd.read_csv(data_path, delimiter=';')

# Quick sanity check: dimensions, then a rich preview as the cell output
print("Dataset loaded successfully!")
print(f"\nDataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head(n=5)

In [None]:
# Explore the dataset: schema, per-column missing-value counts, and summary
# statistics (the describe() table is the cell's displayed output).
print("Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() -- doing so emits a stray "None" line.
df.info()
print("\n" + "="*60)
print("\nMissing values:")
print(df.isnull().sum())
print("\n" + "="*60)
print("\nBasic Statistics:")
df.describe()

## 3. Create Target Variable for Risk Prediction

In [None]:
# Binary risk label derived from the final grade:
#   1 -> G3 is strictly below the cut-off ('at risk'), 0 -> otherwise
threshold = 12
y = df['G3'].lt(threshold).astype(int)

# Report class balance so the later stratified split can be sanity-checked
print(f"Creating target variable (threshold = {threshold}):")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"\nRisk prevalence: {y.mean()*100:.2f}% of students are at risk")

## 4. Prepare Features

In [None]:
# Feature matrix: every column except the three grade columns. G3 is the
# source of the target; G1 and G2 are the intermediate grades.
exclude_cols = ['G1', 'G2', 'G3']
feature_mask = ~df.columns.isin(exclude_cols)
feature_cols = df.columns[feature_mask].tolist()

# Work on a copy so later preprocessing never touches the raw frame
X = df.loc[:, feature_cols].copy()

print(f"Number of features: {X.shape[1]}")
print(f"\nFeatures selected:")
print(list(X.columns))

## 5. Preprocess Features

In [None]:
# Object-dtype columns are treated as the categorical features
categorical_cols = list(X.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}\n")

# Integer-code each categorical column and keep the fitted encoder per column
# so the code -> label mapping stays available.
# NOTE(review): astype(str) also stringifies missing entries, so they would be
# encoded as their own class here -- confirm that is intended.
label_encoders = {}
for col in categorical_cols:
    as_text = X[col].astype(str)
    le = LabelEncoder().fit(as_text)
    X[col] = le.transform(as_text)
    label_encoders[col] = le
    print(f"Encoded '{col}': {len(le.classes_)} classes")

print("\nFeatures preprocessed successfully!")

In [None]:
# Handle missing values: numeric columns get their median, any other column
# gets its most frequent value.
#
# The result is assigned back to X[col] instead of calling
# fillna(..., inplace=True) on the X[col] selection. The in-place form is
# chained assignment: under pandas Copy-on-Write it leaves X unchanged, and
# the accompanying warning is hidden by the notebook-wide warnings filter.
#
# NOTE(review): the fill values are computed on the full dataset before the
# train/test split, so test rows influence them -- consider imputing after
# the split using training statistics only.
print("Handling missing values...")
for col in X.columns:
    if not X[col].isnull().any():
        continue

    # Dtype-agnostic numeric check (covers int32/float32/nullable ints, not
    # only 'int64'/'float64'); booleans keep the mode-based fill as before.
    is_numeric = (
        pd.api.types.is_numeric_dtype(X[col])
        and not pd.api.types.is_bool_dtype(X[col])
    )
    fill_value = X[col].median() if is_numeric else X[col].mode()[0]
    X[col] = X[col].fillna(fill_value)

print(f"\nMissing values after handling:")
print(X.isnull().sum().sum())

## 6. Split Data into Training and Testing Sets

In [None]:
# 80/20 hold-out split; stratifying on y keeps the at-risk ratio the same in
# both partitions, and the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")
print(f"\nTraining set class distribution:")
print(y_train.value_counts())
print(f"\nTesting set class distribution:")
print(y_test.value_counts())

## 7. Scale Features

In [None]:
# Standardise features. The scaler learns its statistics from the training
# partition only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# After scaling, the per-feature mean/std of the training data should be ~0/~1
print("Features scaled successfully!")
print(f"\nScaled training set statistics:")
print(f"Mean: {X_train_scaled.mean(axis=0).mean():.6f}")
print(f"Std: {X_train_scaled.std(axis=0).mean():.6f}")

## 8. Train Logistic Regression Model

In [None]:
# Fit logistic regression on the scaled training data (fit() returns the
# estimator itself, so construction and training can be chained)
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

print("Logistic Regression model trained successfully!")
print(f"Number of iterations: {lr_model.n_iter_[0]}")

## 9. Train Random Forest Classifier Model

In [None]:
# Random forest hyper-parameters: 100 trees, fixed seed, n_jobs=-1
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_scaled, y_train)

print("Random Forest model trained successfully!")
print(f"Number of trees: {rf_model.n_estimators}")

## 10. Evaluate Logistic Regression Model

In [None]:
# Hard class predictions, plus the second predict_proba column as the score
# used for ROC analysis
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_proba_lr = lr_model.predict_proba(X_test_scaled)[:, 1]

# The label-based metrics all take (y_true, y_pred), so evaluate them together
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score(y_test, y_pred_lr)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the probability scores rather than hard labels
lr_roc_auc = roc_auc_score(y_test, y_pred_proba_lr)

print("="*60)
print("LOGISTIC REGRESSION - EVALUATION METRICS")
print("="*60)
print(f"\nAccuracy:  {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall:    {lr_recall:.4f}")
print(f"F1-Score:  {lr_f1:.4f}")
print(f"ROC-AUC:   {lr_roc_auc:.4f}")

In [None]:
# Logistic regression: confusion matrix of the test predictions, followed by
# the per-class classification report
cm_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(f"\nConfusion Matrix:")
print(cm_lr)
print(f"\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_lr))

## 11. Evaluate Random Forest Classifier Model

In [None]:
# Hard class predictions, plus the second predict_proba column as the score
# used for ROC analysis
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

# The label-based metrics all take (y_true, y_pred), so evaluate them together
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    score(y_test, y_pred_rf)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC-AUC is computed from the probability scores rather than hard labels
rf_roc_auc = roc_auc_score(y_test, y_pred_proba_rf)

print("="*60)
print("RANDOM FOREST - EVALUATION METRICS")
print("="*60)
print(f"\nAccuracy:  {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall:    {rf_recall:.4f}")
print(f"F1-Score:  {rf_f1:.4f}")
print(f"ROC-AUC:   {rf_roc_auc:.4f}")

In [None]:
# Random forest: confusion matrix of the test predictions, followed by the
# per-class classification report
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
print(f"\nConfusion Matrix:")
print(cm_rf)
print(f"\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_rf))

## 12. Model Comparison

In [None]:
# Side-by-side metric table: one row per metric, one column per model
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
lr_scores = dict(zip(
    metric_names,
    [lr_accuracy, lr_precision, lr_recall, lr_f1, lr_roc_auc],
))
rf_scores = dict(zip(
    metric_names,
    [rf_accuracy, rf_precision, rf_recall, rf_f1, rf_roc_auc],
))
comparison_df = pd.DataFrame({
    'Logistic Regression': lr_scores,
    'Random Forest': rf_scores,
})

print("="*60)
print("MODEL COMPARISON")
print("="*60)
print(comparison_df.round(4))

## 13. Visualizations - Model Performance Comparison

In [None]:
# 2x2 grid: accuracy, precision-vs-recall, F1 and ROC-AUC for both models
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

models = ['Logistic Regression', 'Random Forest']
model_colors = ['#3498db', '#e74c3c']


def _annotated_bars(target_ax, scores, y_label, panel_title):
    """Draw one bar per model on `target_ax` and write each score above its bar."""
    target_ax.bar(models, scores, color=model_colors)
    target_ax.set_ylabel(y_label)
    target_ax.set_title(panel_title)
    target_ax.set_ylim([0, 1])
    for i, v in enumerate(scores):
        target_ax.text(i, v + 0.02, f'{v:.4f}', ha='center')


# Top-left: accuracy
ax = axes[0, 0]
accuracies = [lr_accuracy, rf_accuracy]
_annotated_bars(ax, accuracies, 'Accuracy', 'Accuracy Comparison')

# Top-right: grouped bars, precision to the left of each tick and recall to
# the right
ax = axes[0, 1]
x = np.arange(len(models))
width = 0.35
precisions = [lr_precision, rf_precision]
recalls = [lr_recall, rf_recall]
grouped_series = (
    (-width/2, precisions, 'Precision', '#2ecc71'),
    (width/2, recalls, 'Recall', '#f39c12'),
)
for offset, values, series_label, series_color in grouped_series:
    ax.bar(x + offset, values, width, label=series_label, color=series_color)
ax.set_ylabel('Score')
ax.set_title('Precision vs Recall')
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()
ax.set_ylim([0, 1])

# Bottom-left: F1
ax = axes[1, 0]
f1_scores = [lr_f1, rf_f1]
_annotated_bars(ax, f1_scores, 'F1-Score', 'F1-Score Comparison')

# Bottom-right: ROC-AUC
ax = axes[1, 1]
roc_aucs = [lr_roc_auc, rf_roc_auc]
_annotated_bars(ax, roc_aucs, 'ROC-AUC', 'ROC-AUC Comparison')

plt.tight_layout()
plt.show()

## 14. Visualizations - Confusion Matrices

In [None]:
# One annotated heatmap per model, side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
fig.suptitle('Confusion Matrices', fontsize=16, fontweight='bold')

# (axis, matrix, colour map, panel title) for each model
panels = [
    (axes[0], cm_lr, 'Blues', 'Logistic Regression'),
    (axes[1], cm_rf, 'Reds', 'Random Forest Classifier'),
]
for panel_ax, matrix, palette, model_title in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=panel_ax, cbar=False)
    panel_ax.set_title(model_title)
    panel_ax.set_ylabel('True Label')
    panel_ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## 15. Visualizations - ROC Curves

In [None]:
plt.figure(figsize=(10, 7))

# ROC points for each model, computed from the positive-class scores
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

# (false-positive rates, true-positive rates, legend text, line colour)
roc_series = [
    (fpr_lr, tpr_lr, f"Logistic Regression (AUC = {lr_roc_auc:.4f})", '#3498db'),
    (fpr_rf, tpr_rf, f"Random Forest (AUC = {rf_roc_auc:.4f})", '#e74c3c'),
]
for fpr, tpr, legend_label, line_color in roc_series:
    plt.plot(fpr, tpr, label=legend_label, linewidth=2, color=line_color)

# Chance-level reference: the diagonal a random classifier would follow
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')

plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - Student Risk Prediction', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=11)
plt.grid(alpha=0.3)
plt.show()

## 16. Feature Importance Analysis (Random Forest)

In [None]:
# Pair every feature name with the forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("="*60)
print("TOP 15 IMPORTANT FEATURES (Random Forest)")
print("="*60)
print(feature_importance.head(15).to_string(index=False))

In [None]:
# Horizontal bar chart of the 15 highest-ranked features
plt.figure(figsize=(10, 8))
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))
plt.barh(bar_positions, top_features['importance'].to_numpy(), color='#9b59b6')
plt.yticks(bar_positions, top_features['feature'].to_numpy())
plt.xlabel('Importance Score', fontsize=12)
plt.title('Top 15 Feature Importance - Random Forest', fontsize=14, fontweight='bold')
# Flip the y-axis so the most important feature is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 17. Summary and Conclusions

In [None]:
# Final text summary: dataset size, split sizes, both models' test metrics,
# and the model with the higher ROC-AUC.
print("="*70)
print("ANALYSIS SUMMARY - STUDENT RISK PREDICTION")
print("="*70)

print("\n1. DATASET OVERVIEW:")
print(f"   - Total students: {len(df)}")
print(f"   - At-risk students: {y.sum()} ({y.mean()*100:.2f}%)")
print(f"   - Total features used: {len(X.columns)}")

# Derive the split percentages from the actual partition sizes so the summary
# stays correct if test_size is changed in the split cell.
n_split_total = len(X_train) + len(X_test)
train_pct = 100 * len(X_train) / n_split_total
test_pct = 100 * len(X_test) / n_split_total

print("\n2. DATA SPLIT:")
print(f"   - Training samples: {len(X_train)} ({train_pct:.0f}%)")
print(f"   - Testing samples: {len(X_test)} ({test_pct:.0f}%)")

print("\n3. MODEL PERFORMANCE - LOGISTIC REGRESSION:")
print(f"   - Accuracy:  {lr_accuracy:.4f}")
print(f"   - Precision: {lr_precision:.4f}")
print(f"   - Recall:    {lr_recall:.4f}")
print(f"   - F1-Score:  {lr_f1:.4f}")
print(f"   - ROC-AUC:   {lr_roc_auc:.4f}")

print("\n4. MODEL PERFORMANCE - RANDOM FOREST:")
print(f"   - Accuracy:  {rf_accuracy:.4f}")
print(f"   - Precision: {rf_precision:.4f}")
print(f"   - Recall:    {rf_recall:.4f}")
print(f"   - F1-Score:  {rf_f1:.4f}")
print(f"   - ROC-AUC:   {rf_roc_auc:.4f}")

# Compare on ROC-AUC; an exact tie is reported as a tie instead of being
# attributed to either model.
print("\n5. BEST MODEL:")
if rf_roc_auc > lr_roc_auc:
    print(f"   Random Forest outperforms with ROC-AUC: {rf_roc_auc:.4f}")
elif lr_roc_auc > rf_roc_auc:
    print(f"   Logistic Regression outperforms with ROC-AUC: {lr_roc_auc:.4f}")
else:
    print(f"   Both models tie with ROC-AUC: {lr_roc_auc:.4f}")

print("\n" + "="*70)