In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

# Data Citation

Realinho, V., Vieira Martins, M., Machado, J., & Baptista, L. (2021). Predict Students' Dropout and Academic Success [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5MC89.

Link: https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success

In [None]:
df = pd.read_csv('data.csv', sep=';')

In [None]:
df.head()

In [None]:
df['Target'].unique()

Since we are considering the success of students, I will only include students who either graduated or dropped out. Students who are still enrolled have no final outcome yet, so they cannot be counted as either a success or a non-success.

In [None]:
df = df[df['Target']!= 'Enrolled']

In [None]:
df['Target'].unique()

In [None]:
df['Target'] = (df['Target'] == 'Dropout').astype(int) #dropout == 1

In [None]:
df.head()

In [None]:
df.isna().sum() #examining whether there is null data, there is none.

# Visualization

Now that we have converted the Target column to binary numbers, we can graph out different factors and see how they impact student success.

In [None]:
# Compare every feature's distribution between the two outcome groups.
# The group frames are selected once instead of once per feature.
dropouts = df[df['Target'] == 1]
graduates = df[df['Target'] == 0]

# df.columns[:-1] skips the last column, which is expected to be 'Target'.
for label in df.columns[:-1]:
    # Shared bin edges: without them each group gets bins computed from its
    # own value range, so the overlaid bars are not directly comparable.
    # NOTE(review): assumes every feature column is numeric - confirm.
    bin_edges = np.histogram_bin_edges(df[label], bins=10)

    fig, ax = plt.subplots()
    ax.hist(dropouts[label], bins=bin_edges, color='blue', label='Dropout', alpha=0.7, density=True)
    ax.hist(graduates[label], bins=bin_edges, color='red', label='Graduated', alpha=0.7, density=True)
    ax.set_title(label)
    # density=True normalises each histogram's area to 1, so the y-axis is a
    # probability density rather than a probability.
    ax.set_ylabel('Probability density')
    ax.set_xlabel(label)
    ax.legend()
    plt.show()
    plt.close(fig)  # release the figure; one is created per feature

# Train, validation, test

In [None]:
# 60% train, 20% valid, 20% test.
# A fixed random_state makes the shuffle (and every metric reported below)
# reproducible on Restart & Run All. Positional slicing replaces np.split,
# which relies on the deprecated DataFrame.swapaxes when given a DataFrame.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def scale_dataset(dataframe):
    X = dataframe[dataframe.columns[:-1]].values
    y = dataframe[dataframe.columns[-1]].values

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

In [None]:
train, X_train, y_train = scale_dataset(train)

In [None]:
valid, X_valid, y_valid = scale_dataset(valid)

In [None]:
test, X_test, y_test = scale_dataset(test)

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

# knn
k-nearest neighbours: a sample is assigned the class of the training points that are closest to it.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

In [None]:
y_pred = knn_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Precision / recall / F1 per class for the k-NN predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

### Evaluate

The accuracy is quite good, but we can test out other models to find the best one.

# Naive Bayes

$$P(B \mid A) = \frac{P(A \mid B)\,P(B)}{P(A)}$$

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)

In [None]:
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Evaluate

This accuracy is even better, and the recall is higher too: out of all the students who actually dropped out, 87 percent were correctly predicted as dropouts, which is pretty good. Let's try logistic regression.

# Logistic Regression

relationship between coefficients

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train, y_train)

In [None]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Evaluate

This one is the best so far, with 0.96 recall and 0.89 accuracy.

# SVM

Find the boundary (a line in two dimensions) that best separates the two classes.

In [None]:
from sklearn.svm import SVC

In [None]:
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

### Evaluate

This one has a higher recall, but lower precision and accuracy, so I would still consider the logistic model to be the best.

# Neural Network

In [1]:
import torch

In [2]:
import torch.nn as nn
import torch.optim as optim

In [None]:
class StudentNN(nn.Module):
    """Fully connected binary classifier.

    Three hidden linear layers (128, 64 and 32 units), each followed by ReLU,
    then a single-unit output layer passed through a sigmoid so the result
    can be read as a probability.
    """

    def __init__(self, input_dim):
        """Build the layers; ``input_dim`` is the number of input features."""
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        # Hidden stack: linear layer followed by ReLU, three times over.
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.relu(hidden(x))
        # Output layer squashed into (0, 1).
        return self.sigmoid(self.fc4(x))

In [None]:
X_train_small = X_train[:1000]
y_train_small = y_train[:1000]

In [None]:
X_train_small.shape

In [None]:
y_test.shape

In [None]:
X_train_tensor = torch.tensor(X_train_small, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_small, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [None]:
# Seed PyTorch's RNG so the weight initialisation - and with it the loss
# curve and the test accuracy - is reproducible on Restart & Run All.
torch.manual_seed(42)

# Initialize the model; shape[1] is the number of feature columns
model = StudentNN(input_dim=X_train.shape[1])

# Loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: every epoch is one step over the whole training tensor
num_epochs = 100
model.train()  # Training mode; set once, nothing inside the loop changes it
for epoch in range(num_epochs):
    optimizer.zero_grad()  # Clear previous gradients

    # Forward pass
    outputs = model(X_train_tensor)

    # Compute loss
    loss = criterion(outputs, y_train_tensor)

    # Backward pass (compute gradients)
    loss.backward()

    # Update model parameters
    optimizer.step()

    if (epoch + 1) % 10 == 0:  # Print loss every 10 epochs
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

In [None]:
model.eval()  # Set model to evaluation mode (disables dropout, batch norm, etc.)
with torch.no_grad():  # Disable gradient calculation for testing
    # Forward pass on test data
    outputs = model(X_test_tensor)

    # Convert outputs to predicted class (0 or 1)
    predicted = (outputs > 0.5).float()  # Sigmoid output > 0.5 means predicted 1, else 0

    # Calculate accuracy
    accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f'Accuracy on test data: {accuracy * 100:.2f}%')

# Report precision / recall / F1 in the same format used for the other
# models, so the network can be compared with them on more than accuracy.
print(classification_report(y_test, predicted.int().numpy().ravel()))


# Based on the models, it seems like the logistic model is the best fit, since it offers the best overall combination of accuracy, precision, and recall.

In [None]:
# Saving the model

import joblib

# A scaler has to be fitted before it is persisted: an unfitted
# StandardScaler raises NotFittedError as soon as transform() is called.
# NOTE(review): ideally this is the very scaler that standardised the
# training split; if that object is not kept around, fitting on all cleaned
# feature columns (everything except the last, label, column) is the closest
# available approximation of the training statistics.
scaler = StandardScaler().fit(df[df.columns[:-1]].values)

In [None]:
# Persist the logistic-regression estimator.
joblib.dump(value=lg_model, filename='logistic_regression_model.pkl')

# Persist the scaler alongside it so new inputs can be scaled before prediction.
joblib.dump(value=scaler, filename='scaler.pkl')

In [None]:
# Load the saved model under its own name so the PyTorch network bound to
# `model` is not overwritten.
# NOTE: joblib files are pickle-based and can execute arbitrary code on load;
# only load files you created or trust.
loaded_model = joblib.load('logistic_regression_model.pkl')

# Load the scaler (if applicable)
scaler = joblib.load('scaler.pkl')

# Stand-in for new, unscaled observations so this cell runs end to end:
# the feature columns (all but the last) of the first few rows of `df`.
# Replace with real new data that has the same columns in the same order.
X_new = df[df.columns[:-1]].head().values

# Don't forget to scale the new data the same way as the training data
X_new_scaled = scaler.transform(X_new)

# Make predictions using the loaded model
predictions = loaded_model.predict(X_new_scaled)

# If you want probability predictions (in case you're working with classification probabilities)
probabilities = loaded_model.predict_proba(X_new_scaled)