# Data analysis

In [43]:
# Imports
import os
import pandas as pd
import numpy as np

In [44]:
## Exploring the data

In [45]:
# Read data
# Path of the CSV file holding the dataset (relative to the notebook's working directory)
file_path = 'data/dataset.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Print a summary of the DataFrame: column names, non-null counts and dtypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [46]:
# Isolate every column whose dtype is anything other than int64
non_integer_df = df.select_dtypes(exclude=['int64'])
non_integer_columns = non_integer_df.columns
non_integer_dtypes = non_integer_df.dtypes

# Show the column names, their dtypes, and summary statistics for them
print(non_integer_columns)
print(non_integer_dtypes)
print(df[non_integer_columns].describe())


Index(['Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)',
       'Unemployment rate', 'Inflation rate', 'GDP', 'Target'],
      dtype='object')
Curricular units 1st sem (grade)    float64
Curricular units 2nd sem (grade)    float64
Unemployment rate                   float64
Inflation rate                      float64
GDP                                 float64
Target                               object
dtype: object
       Curricular units 1st sem (grade)  Curricular units 2nd sem (grade)  \
count                       4424.000000                       4424.000000   
mean                          10.640822                         10.230206   
std                            4.843663                          5.210808   
min                            0.000000                          0.000000   
25%                           11.000000                         10.750000   
50%                           12.285714                         12.200000   
75%               

In [47]:
# Display the full list of column names in the DataFrame
df.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)

In [48]:
# Distinct values that occur in the 'Target' column
set(df['Target'])

{'Dropout', 'Enrolled', 'Graduate'}

We see that the target variable has three categories. I'm not sure how we should handle the `Enrolled` students, as they don't really help us with predicting dropout, so I will remove them for now.

In [49]:
# Count how many rows fall into each 'Target' category
counts = df['Target'].value_counts()
print(counts)


Graduate    2209
Dropout     1421
Enrolled     794
Name: Target, dtype: int64


In [50]:
# Keep only the rows whose target is not 'Enrolled' (i.e. 'Dropout' and 'Graduate').
# The mask is computed once and reused for both names below.
not_enrolled = df['Target'] != 'Enrolled'

# .copy() makes the result an independent DataFrame rather than a slice of the
# original, so assigning to one of its columns later does not raise pandas'
# SettingWithCopyWarning.
filtered_df = df[not_enrolled].copy()

# Rebind df to the filtered data as well (this replaces the name; it is not an
# in-place modification). A separate copy keeps df and filtered_df independent.
df = filtered_df.copy()

# After filtering, check the shape and head of the DataFrame to confirm the operation:
print(filtered_df.shape)
print(filtered_df.head())


(3630, 35)
   Marital status  Application mode  Application order  Course  \
0               1                 8                  5       2   
1               1                 6                  1      11   
2               1                 1                  5       5   
3               1                 8                  2      15   
4               2                12                  1       3   

   Daytime/evening attendance  Previous qualification  Nacionality  \
0                           1                       1            1   
1                           1                       1            1   
2                           1                       1            1   
3                           1                       1            1   
4                           0                       1            1   

   Mother's qualification  Father's qualification  Mother's occupation  ...  \
0                      13                      10                    6  ...   
1            

We are left with 3630 usable examples. Now I will map the categories to 1 and 0 so they can be used for training.

In [51]:
# Create a mapping dictionary with Dropout as 1 and Graduate as 0
target_mapping = {
    'Dropout': 1,
    'Graduate': 0
}

# Translate each label through the mapping, passing through any value that is
# not a key. Already-encoded 0/1 values therefore stay as they are, so re-running
# this cell is a no-op; a plain .map(target_mapping) would turn them into NaN.
encoded_target = df['Target'].map(lambda label: target_mapping.get(label, label))

# Fail loudly if anything other than the encoded classes remains (for example
# an unmapped label or NaN) instead of passing bad targets on to training.
unexpected_values = set(encoded_target.unique()) - set(target_mapping.values())
if unexpected_values:
    raise ValueError(f"Unexpected values in 'Target': {unexpected_values}")

# Store the encoded target back in the 'Target' column as integers
df['Target'] = encoded_target.astype('int64')


In [56]:
# Percentage of rows whose encoded target equals 1 (the 'Dropout' class)
dropout_percentage = sum(df['Target'] == 1) / len(df['Target']) * 100
print(f'Class balance: {dropout_percentage} % of the dataset are dropouts, excluding current students.')

Class balance: 39.146005509641874 % of the dataset are dropouts, excluding current students.


# Training

### Without cross-validation

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import time
from prettytable import PrettyTable

In [28]:
# Splitting the data: every column except 'Target' is a feature
X = df.drop('Target', axis=1)
y = df['Target']

# One seed shared by the split and by every model that accepts random_state,
# so that re-running the notebook reproduces the same scores.
RANDOM_STATE = 42

# Hold out 30% of the rows for out-of-sample evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=RANDOM_STATE)

# Initializing the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "CART": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Neural Network": MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    # "KNN": KNeighborsClassifier()  # Commented out due to error
}

results = {}         # model name -> [in-sample accuracy, out-of-sample accuracy, fit time in seconds]
trained_models = {}  # model name -> fitted estimator

for name, model in models.items():
    print(f"Training {name}...")

    # perf_counter is a monotonic clock intended for measuring elapsed time;
    # only the fit call is timed, not the predictions below.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed_time = time.perf_counter() - start_time

    minutes = int(elapsed_time // 60)
    seconds = int(elapsed_time % 60)

    print(f"{name} trained in {minutes}m {seconds}s")

    # Accuracy on the data the model was fitted on vs. on the held-out data
    in_sample_score = accuracy_score(y_train, model.predict(X_train))
    out_of_sample_score = accuracy_score(y_test, model.predict(X_test))

    results[name] = [in_sample_score, out_of_sample_score, elapsed_time]
    trained_models[name] = model

# Displaying results using PrettyTable
table = PrettyTable()
table.field_names = ["Model", "In-sample Accuracy", "Out-of-sample Accuracy", "Training Time (s)"]

for name, metrics in results.items():
    table.add_row([name] + metrics)

print(table)


Training Logistic Regression...
Logistic Regression trained in 0:m,0s
Training SVM...
SVM trained in 0:m,0s
Training CART...
CART trained in 0:m,0s
Training Random Forest...
Random Forest trained in 0:m,0s
Training Neural Network...
Neural Network trained in 0:m,44s
+---------------------+--------------------+------------------------+----------------------+
|        Model        | In-sample Accuracy | Out-of-sample Accuracy |  Training Time (s)   |
+---------------------+--------------------+------------------------+----------------------+
| Logistic Regression |  0.91538764266037  |   0.9201101928374655   | 0.29344844818115234  |
|         SVM         | 0.8890200708382526 |   0.898989898989899    | 0.09829306602478027  |
|         CART        |        1.0         |   0.8631772268135904   | 0.017052412033081055 |
|    Random Forest    |        1.0         |   0.9090909090909091   | 0.31484508514404297  |
|    Neural Network   | 0.9807162534435262 |   0.8943985307621671   |  44.87282037

Note that KNN was skipped because calling `predict` to compute the accuracy metric raised an error.

We can see that, judging by accuracy, all models did very well in a short amount of time. Note that the dataset is somewhat unbalanced, so we should check other metrics as well, depending on the model, e.g. FP, FN, precision, recall, F1-score, confusion matrices, etc.

Below I save the results in a CSV file for later use.

In [26]:
save_results = results
# Make sure the 'results' directory exists; exist_ok=True makes this a no-op
# when it is already there, with no separate existence check needed.
os.makedirs('results', exist_ok=True)

# Convert the results dictionary to a pandas DataFrame (one row per model) and save it to a CSV file
results_df = pd.DataFrame.from_dict(save_results, orient='index', columns=["In-sample Accuracy", "Out-of-sample Accuracy", "Training Time (s)"])
results_df.to_csv('results/model_results.csv')


Next step: also try gradient boosting methods like LightGBM and XGBoost (eXtreme Gradient Boosting).
Next step: Make more performance metrics as listed in earlier comment.
Next step: Make plots or otherwise quantify variable importance.