# Student Dropout Prediction - Model Exploration

This notebook explores the student dataset and trains a logistic regression model on the `Dropout` and `Graduate` outcomes to identify students at risk of dropping out.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score

# Show every column when a DataFrame is displayed (no column truncation)
pd.set_option("display.max_columns", None)

# Plot appearance: matplotlib's ggplot style sheet first, then seaborn's
# whitegrid theme applied afterwards
plt.style.use("ggplot")
sns.set(style="whitegrid")

## 1. Load and Explore the Dataset

In [None]:
# Location of the raw CSV, given relative to the working directory
data_path = '../dataset.csv'

# Read the whole file into a DataFrame
data = pd.read_csv(data_path)

# Report the (rows, columns) shape, then preview the first rows as the cell output
print(f"Dataset shape: {data.shape}")
data.head()

In [None]:
# Count the null entries in every column
missing_values = data.isna().sum()

# Display only the columns that contain at least one null
print("Missing values by column:")
missing_values.loc[missing_values.gt(0)]

In [None]:
# Tally how many rows fall into each 'Target' class
target_counts = data['Target'].value_counts()
print("Target variable distribution:")
print(target_counts)

# Bar chart of the same class counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='Target', ax=ax)
ax.set_title('Target Variable Distribution')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## 2. Data Preprocessing

In [None]:
# Keep only the rows whose outcome is Dropout or Graduate.
# .copy() makes the subset an independent DataFrame, so columns can be added
# to it later without pandas raising SettingWithCopyWarning.
filtered_data = data[data['Target'].isin(['Dropout', 'Graduate'])].copy()
print(f"Filtered dataset shape: {filtered_data.shape}")

In [None]:
# Columns used as model inputs
features = [
    'Age at enrollment',
    'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (approved)',
    'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (approved)',
    'Unemployment rate',
]

# Feature matrix and label vector, both taken from the filtered rows
X = filtered_data.loc[:, features]
y = filtered_data.loc[:, 'Target']

# Preview the feature matrix as the cell output
X.head()

## 3. Exploratory Data Analysis

In [None]:
# Pairwise correlations among the selected features
correlation = X.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# One box plot per feature, split by outcome, laid out on a 3x2 grid
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 18))
axes = axes.flatten()

# These two features keep their own name on the y axis; every other
# feature is labelled 'Count'
self_labelled = ('Age at enrollment', 'Unemployment rate')

for i, feature in enumerate(features):
    ax = axes[i]
    sns.boxplot(data=filtered_data, x='Target', y=feature, ax=ax)
    ax.set_title(f'Distribution of {feature} by Target')
    ax.set_xlabel('')
    ax.set_ylabel(feature if feature in self_labelled else 'Count')

fig.tight_layout()
plt.show()

In [None]:
# Check the approval rates in 1st and 2nd semesters
def approval_rate(frame, semester):
    """Return approved / enrolled curricular units for one semester.

    Parameters
    ----------
    frame : DataFrame holding the 'Curricular units <semester> sem (approved)'
        and '... (enrolled)' columns.
    semester : '1st' or '2nd'.

    Returns
    -------
    Series of rates. Rows with zero enrolled units divide by zero, which
    yields NaN (0/0) or +/-inf (n/0); both are reported as a rate of 0.
    """
    approved = frame[f'Curricular units {semester} sem (approved)']
    enrolled = frame[f'Curricular units {semester} sem (enrolled)']
    # fillna alone would leave +/-inf in place, so convert those to NaN first
    return (approved / enrolled).replace([np.inf, -np.inf], np.nan).fillna(0)


# assign() builds a new frame instead of writing columns into a slice of `data`,
# which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
filtered_data = filtered_data.assign(**{
    '1st_sem_approval_rate': approval_rate(filtered_data, '1st'),
    '2nd_sem_approval_rate': approval_rate(filtered_data, '2nd'),
})

# Side-by-side box plots of the two rates, split by outcome
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, semester in zip(axes, ('1st', '2nd')):
    sns.boxplot(x='Target', y=f'{semester}_sem_approval_rate', data=filtered_data, ax=ax)
    ax.set_title(f'{semester} Semester Approval Rate by Target')
    ax.set_ylabel('Approval Rate')
    # Small headroom above 1 so a rate of exactly 1.0 is not clipped
    ax.set_ylim(0, 1.05)

fig.tight_layout()
plt.show()

## 4. Model Training and Evaluation

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of the target the same in both splits, so the test metrics are not skewed by
# an unlucky class mix; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so test-set statistics never influence
# the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a logistic regression classifier on the standardized training features
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Hard class predictions for the held-out rows
y_pred = model.predict(X_test_scaled)

# Probability of the positive class: column 1 of predict_proba, which
# corresponds to model.classes_[1]
y_prob = model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Text summary of the predictions against the held-out labels
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn as a heatmap, with both axes labelled by class name
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = model.classes_

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

In [None]:
# ROC Curve
# Encode each class name as its index in model.classes_ (0 or 1) so the test
# labels line up with the column-1 probabilities stored in y_prob
label_encoder = dict(zip(model.classes_, range(len(model.classes_))))
y_test_binary = y_test.map(label_encoder)

# Curve points and the area under the curve
fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
auc = roc_auc_score(y_test_binary, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, marker='.', label=f'Logistic Regression (AUC = {auc:.3f})')
# Diagonal reference line
ax.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

## 5. Feature Importance Analysis

In [None]:
# Pair every feature with its fitted coefficient (the model was trained on
# standardized features), then rank the rows by coefficient magnitude
feature_importance = (
    pd.DataFrame({'Feature': features, 'Coefficient': model.coef_[0]})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

# Horizontal bars of the signed coefficients, with a reference line at zero
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='Coefficient', y='Feature', ax=ax)
ax.set_title('Feature Importance (Logistic Regression Coefficients)')
ax.set_xlabel('Coefficient Value')
ax.set_ylabel('Feature')
ax.axvline(x=0, color='black', linestyle='--')
fig.tight_layout()
plt.show()

## 6. Model Saving

In [None]:
import joblib
import os

# Make sure the output folder exists (no error if it is already there)
os.makedirs('../models', exist_ok=True)

# Bundle the fitted model, the fitted scaler, and the feature list into one
# object so they are written to a single file
model_data = dict(model=model, scaler=scaler, features=features)

joblib.dump(model_data, '../models/dropout_predictor.pkl')
print("Model saved to ../models/dropout_predictor.pkl")