# Data Preprocessing Notebook

In [32]:
import numpy as np
import pandas as pd

In [33]:
# Load the raw student dropout / academic success data set (semicolon-delimited CSV)
raw_data = pd.read_csv(
    'data/data.csv',
    delimiter=";",
)
raw_data.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [34]:
# Drop the features treated as non-ordinal (nominal); every other column is kept.
# NOTE(review): some entries (e.g. 'Application order') may carry a meaningful
# order — confirm they belong in this list.
nonordinal_features = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# `columns=` already selects the column axis, so no separate `axis` argument is needed
processed_dataset = raw_data.drop(columns=nonordinal_features)
processed_dataset.head()

Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,122.0,127.3,20,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,160.0,142.5,19,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,122.0,124.8,19,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [35]:
# Measure the share of non-dropout vs dropout labels to assess how balanced the classes are
target = processed_dataset['Target'].to_numpy()
total_points = target.shape[0]

# Positions of the rows labelled 'Graduate' or 'Enrolled' (the non-dropout class)
is_nondropout = (target == 'Graduate') | (target == 'Enrolled')
nondropout = np.flatnonzero(is_nondropout)

# Every row that is not Graduate/Enrolled is counted as a dropout
dropout_proportion = (total_points - nondropout.size) / total_points
nondropout_proportion = 1 - dropout_proportion

for message in (
    f"Dropout proportion:     {dropout_proportion*100:.2f}%",
    f"Non-dropout proportion: {nondropout_proportion*100:.2f}%",
):
    print(message)


Dropout proportion:     32.12%
Non-dropout proportion: 67.88%


With roughly 32% dropout and 68% non-dropout labels, the classes are moderately imbalanced but still sufficiently balanced to proceed with modeling.

In [36]:

# Collapse the three-class Target into a binary label for modeling:
#   +1 -> non-dropout ('Graduate' or 'Enrolled'), -1 -> 'Dropout'.
nondropout_classes = ['Graduate', 'Enrolled']
expected_classes = set(nondropout_classes) | {'Dropout'}

# Validate first: without this check, any missing or misspelled Target value
# would silently fall into the -1 (dropout) class.
unexpected_classes = set(processed_dataset['Target'].unique()) - expected_classes
if unexpected_classes:
    raise ValueError(f"Unexpected Target values: {unexpected_classes}")

processed_dataset['y_labels'] = np.where(processed_dataset['Target'].isin(nondropout_classes), 1, -1)
processed_dataset.head()


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,122.0,127.3,20,0,0,0,0,0.0,0,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout,-1
1,160.0,142.5,19,0,6,6,6,14.0,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate,1
2,122.0,124.8,19,0,6,0,0,0.0,0,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout,-1
3,122.0,119.6,20,0,6,8,6,13.428571,0,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate,1
4,100.0,141.5,45,0,6,9,5,12.333333,0,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate,1


In [37]:
# Work on the feature columns only (exclude the raw Target and the encoded label)
features_only = processed_dataset.drop(columns=['Target', 'y_labels'])

# Per-feature summary statistics: minimum, maximum, mean, and median
min_values = features_only.min()
max_values = features_only.max()
mean_values = features_only.mean()
median_values = features_only.median()

# Print each statistic under its heading, in the order computed above
summary_sections = (
    ("Minimum values for each feature:", min_values),
    ("\nMaximum values for each feature:", max_values),
    ("\nMean values for each feature:", mean_values),
    ("\nMedian values for each feature:", median_values),
)
for heading, values in summary_sections:
    print(heading)
    print(values)

Minimum values for each feature:
Previous qualification (grade)                    95.00
Admission grade                                   95.00
Age at enrollment                                 17.00
Curricular units 1st sem (credited)                0.00
Curricular units 1st sem (enrolled)                0.00
Curricular units 1st sem (evaluations)             0.00
Curricular units 1st sem (approved)                0.00
Curricular units 1st sem (grade)                   0.00
Curricular units 1st sem (without evaluations)     0.00
Curricular units 2nd sem (credited)                0.00
Curricular units 2nd sem (enrolled)                0.00
Curricular units 2nd sem (evaluations)             0.00
Curricular units 2nd sem (approved)                0.00
Curricular units 2nd sem (grade)                   0.00
Curricular units 2nd sem (without evaluations)     0.00
Unemployment rate                                  7.60
Inflation rate                                    -0.80
GDP            

In the dataset, there are several potential outliers:
1. Age at Enrollment:
    - Maximum: 70.00
    - Mean: 23.27
    - Median: 20.00

The maximum age is significantly higher than both the mean and median.
    
2. Curricular units 1st sem (evaluations):
    - Maximum: 45.00
    - Mean: 8.30
    - Median: 8.00

3. Curricular units 2nd sem (evaluations):
    - Maximum: 33.00
    - Mean: 8.06
    - Median: 8.00

The maximum values for evaluations in both semesters are considerably higher than their respective means and medians.

4. Inflation Rate:
    - Maximum: 3.70
    - Mean: 1.23
    - Median: 1.40

The maximum inflation rate is roughly three times the mean and well above the median.

5. GDP:
    - Minimum: -4.06
    - Mean: 0.002
    - Median: 0.32

The minimum GDP is notably lower than the mean and median.

While these features in the dataset contain outliers, we are choosing not to remove them, as they appear contextually valid within the domain of the dataset.

In [38]:
# Feature scaling (min-max normalization)
# Resources used:
#   - https://stackoverflow.com/questions/26225344/why-feature-scaling-in-svm
#   - https://scikit-learn.org/stable/modules/preprocessing.html

from sklearn.preprocessing import MinMaxScaler

# Every column except the raw Target and its encoded label is a feature to scale
features = processed_dataset.columns.drop(['Target', 'y_labels'])

# Rescale each feature to the [0, 1] range based on that feature's own minimum and maximum.
# NOTE(review): the scaler is fit on the full data set; if a train/test split happens
# downstream, consider fitting on the training split only — confirm with the modeling code.
min_max_scaler = MinMaxScaler()
scaled_features = min_max_scaler.fit_transform(processed_dataset[features])
processed_dataset[features] = scaled_features

processed_dataset.head()


Unnamed: 0,Previous qualification (grade),Admission grade,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target,y_labels
0,0.284211,0.34,0.056604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
1,0.684211,0.5,0.037736,0.0,0.230769,0.133333,0.230769,0.741722,0.0,0.0,0.26087,0.181818,0.3,0.735897,0.0,0.732558,0.111111,0.640687,Graduate,1
2,0.284211,0.313684,0.037736,0.0,0.230769,0.0,0.0,0.0,0.0,0.0,0.26087,0.0,0.0,0.0,0.0,0.372093,0.488889,0.766182,Dropout,-1
3,0.284211,0.258947,0.056604,0.0,0.230769,0.177778,0.230769,0.711447,0.0,0.0,0.26087,0.30303,0.25,0.667692,0.0,0.209302,0.0,0.124174,Graduate,1
4,0.052632,0.489474,0.528302,0.0,0.230769,0.2,0.192308,0.653422,0.0,0.0,0.26087,0.181818,0.3,0.7,0.0,0.732558,0.111111,0.640687,Graduate,1


In [39]:
# Write the processed data set to CSV, omitting the DataFrame index column
processed_dataset.to_csv(path_or_buf='data/processed-data.csv', index=False)