# Imports

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# From our Data Directory
from pathlib import Path
data_dir = Path('data')
outputs_dir = Path('outputs')

# Uploading and Cleaning Data

In [2]:
student_data = pd.read_csv(data_dir/'student_data.csv', sep = ';' )

In [3]:
print(student_data.columns)

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd

#### Clean Student Data

In [13]:
# We are going to do an EDA on age at enrollment, debtor (whether the student is going into debt) and admission grade. 
# We will then examine their relationship to our target variable (whether the student dropped out or graduated).

clean_sd = student_data[['Previous qualification (grade)','Debtor', 'Age at enrollment', 
                         'Admission grade', 'Target', 'Tuition fees up to date', "Mother's qualification", "Father's qualification"]]

# Removing the 794 students that are marked as enrolled
clean_sd = clean_sd[clean_sd["Target"] != "Enrolled"]

## Exploring 'Mother's occupation' and 'Father's qualification' for categorization

In [17]:
mothers_quals = sorted(clean_sd["Mother's qualification"].unique())
fathers_quals = sorted(clean_sd["Father's qualification"].unique())

# No 13, 20, 25, 31, 33
print(mothers_quals)

print(fathers_quals)

len(fathers_quals)
# shared_values = set(mothers_quals) & set(fathers_quals)
# shared_values

[1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 18, 19, 22, 26, 27, 29, 30, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]
[1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 13, 14, 18, 19, 20, 22, 25, 26, 27, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]


34

In [18]:
allowed_values = {1, 2, 3, 4, 5, 18, 22, 35, 39, 41, 42}

clean_sd_trial = clean_sd[clean_sd["Father's qualification"].isin(allowed_values) & clean_sd["Mother's qualification"].isin(allowed_values)]

In [19]:
clean_sd_trial

Unnamed: 0,Previous qualification (grade),Debtor,Age at enrollment,Admission grade,Target,Tuition fees up to date,Mother's qualification,Father's qualification,Mother_edu_category,Mother_edu_code,Father_edu_category,Father_edu_code
1,160.0,0,19,142.5,Graduate,0,1,3,High School (Completed),5,Completed Bachelors,7
8,137.0,0,21,129.3,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
13,110.0,0,21,111.8,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
20,122.0,0,21,120.3,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
31,125.0,0,20,130.0,Graduate,1,42,3,Professional Course,2,Completed Bachelors,7
...,...,...,...,...,...,...,...,...,...,...,...,...
4403,137.0,0,19,124.8,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
4404,133.1,1,28,120.0,Dropout,0,2,1,Completed Bachelors,7,High School (Completed),5
4417,132.0,0,20,133.8,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
4419,125.0,0,19,122.2,Graduate,1,1,1,High School (Completed),5,High School (Completed),5


In [5]:
# 1 - Secondary Education - 12th Year of Schooling or Eq. 
# 2 - Higher Education - Bachelor's Degree 
# 3 - Higher Education - Degree 
# 4 - Higher Education - Master's 
# 5 - Higher Education - Doctorate 
# 6 - Frequency of Higher Education 
# 9 - 12th Year of Schooling - Not Completed 
# 10 - 11th Year of Schooling - Not Completed 
# 11 - 7th Year (Old) 
# 12 - Other - 11th Year of Schooling 
# 14 - 10th Year of Schooling 
# 18 - General commerce course 
# 19 - Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv. 
# 22 - Technical-professional course 
# 26 - 7th year of schooling 
# 27 - 2nd cycle of the general high school course 
# 29 - 9th Year of Schooling - Not Completed 
# 30 - 8th year of schooling 
# 34 - Unknown 
# 35 - Can't read or write 
# 36 - Can read without having a 4th year of schooling 
# 37 - Basic education 1st cycle (4th/5th year) or equiv. 
# 38 - Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv. 
# 39 - Technological specialization course 
# 40 - Higher education - degree (1st cycle) 
# 41 - Specialized higher studies course 
# 42 - Professional higher technical course 
# 43 - Higher Education - Master (2nd cycle) 
# 44 - Higher Education - Doctorate (3rd cycle)

# -------

# Unknown - 0
# Illiterate - 1
# Some Level of Elementary School Education -  2
# Elemenatary School (Completed) - 3
# Middle School Education (Completed) - 4
# High School (Completed) - 5
# Process of Getting Bachelors - 6
# Completed Bachelors - 7
# Process of Getting Masters - 8
# Completed Masters - 9
# Process of Getting Doctorate - 10
# Completed Doctorate - 11

Categorizing the 34 previous categories into 12 categories to better understand the data.

In [20]:
# Complete data including new codes
data = {
    "Code": [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 18, 19, 22, 26, 27, 29, 30, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 34,
             13, 20, 25, 31, 33],
    "Original Description": [
        "Secondary Education - 12th Year of Schooling or Eq.",
        "Higher Education - Bachelor's Degree",
        "Higher Education - Degree",
        "Higher Education - Master's",
        "Higher Education - Doctorate",
        "Frequency of Higher Education",
        "12th Year of Schooling - Not Completed",
        "11th Year of Schooling - Not Completed",
        "7th Year (Old)",
        "Other - 11th Year of Schooling",
        "10th Year of Schooling",
        "General commerce course",
        "Basic Education 3rd Cycle (9th/10th/11th Year) or Equiv.",
        "Technical-professional course",
        "7th year of schooling",
        "2nd cycle of the general high school course",
        "9th Year of Schooling - Not Completed",
        "8th year of schooling",
        "Can't read or write",
        "Can read without having a 4th year of schooling",
        "Basic education 1st cycle (4th/5th year) or equiv.",
        "Basic Education 2nd Cycle (6th/7th/8th Year) or Equiv.",
        "Technological specialization course",
        "Higher education - degree (1st cycle)",
        "Specialized higher studies course",
        "Professional higher technical course",
        "Higher Education - Master (2nd cycle)",
        "Higher Education - Doctorate (3rd cycle)",
        "Unknown",
        "2nd year complementary high school course",
        "Complementary High School Course",
        "Complementary High School Course – not concluded",
        "General Course of Administration and Commerce",
        "Supplementary Accounting and Administration"
    ],
    "Category": [
        "High School (Completed)",
        "Completed Bachelors",
        "Completed Bachelors",
        "Completed Masters",
        "Completed Doctorate",
        "Process of Getting Bachelors",
        "Middle School Education (Completed)",
        "Middle School Education (Completed)",
        "Elementary School (Completed)",
        "Middle School Education (Completed)",
        "Middle School Education (Completed)",
        "Professional Course",
        "Middle School Education (Completed)",
        "Professional Course",
        "Elementary School (Completed)",
        "High School (Completed)",
        "Middle School Education (Completed)",
        "Middle School Education (Completed)",
        "Illiterate",
        "Some Level of Elementary School Education",
        "Elementary School (Completed)",
        "Some Level of Elementary School Education",
        "Professional Course",
        "Process of Getting Bachelors",
        "Process of Getting Masters",
        "Professional Course",
        "Completed Masters",
        "Completed Doctorate",
        "Unknown",
        "Process of Getting Bachelors",
        "Process of Getting Bachelors",
        "Process of Getting Bachelors",
        "High School (Completed)",
        "Professional Course"
    ]
}

# Mapping categories to numeric codes
category_to_code = {
    "Unknown": 0,
    "Illiterate": 1,
    "Some Level of Elementary School Education": 2,
    "Elementary School (Completed)": 3,
    "Middle School Education (Completed)": 4,
    "High School (Completed)": 5,
    "Process of Getting Bachelors": 6,
    "Completed Bachelors": 7,
    "Process of Getting Masters": 8,
    "Completed Masters": 9,
    "Process of Getting Doctorate": 10,
    "Completed Doctorate": 11,
    "Professional Course": 2  # same as Some Level of Elementary or adjust if desired
}

# Create DataFrame
education_df = pd.DataFrame(data)

# Add numeric category codes
education_df["Category_Code"] = education_df["Category"].map(category_to_code)

# Display
education_df

Unnamed: 0,Code,Original Description,Category,Category_Code
0,1,Secondary Education - 12th Year of Schooling o...,High School (Completed),5
1,2,Higher Education - Bachelor's Degree,Completed Bachelors,7
2,3,Higher Education - Degree,Completed Bachelors,7
3,4,Higher Education - Master's,Completed Masters,9
4,5,Higher Education - Doctorate,Completed Doctorate,11
5,6,Frequency of Higher Education,Process of Getting Bachelors,6
6,9,12th Year of Schooling - Not Completed,Middle School Education (Completed),4
7,10,11th Year of Schooling - Not Completed,Middle School Education (Completed),4
8,11,7th Year (Old),Elementary School (Completed),3
9,12,Other - 11th Year of Schooling,Middle School Education (Completed),4


Now mapping the codes and new categories to our old clean_sd dataframe.

In [15]:
# Create mapping dictionaries
code_to_category = dict(zip(education_df["Code"], education_df["Category"]))
code_to_code = dict(zip(education_df["Code"], education_df["Category_Code"]))

# Map Mother's qualification
clean_sd["Mother_edu_category"] = clean_sd["Mother's qualification"].map(code_to_category)
clean_sd["Mother_edu_code"] = clean_sd["Mother's qualification"].map(code_to_code).fillna(0).astype(int)

# Map Father's qualification
clean_sd["Father_edu_category"] = clean_sd["Father's qualification"].map(code_to_category)
clean_sd["Father_edu_code"] = clean_sd["Father's qualification"].map(code_to_code).fillna(0).astype(int)

# New DF
clean_sd

Unnamed: 0,Previous qualification (grade),Debtor,Age at enrollment,Admission grade,Target,Tuition fees up to date,Mother's qualification,Father's qualification,Mother_edu_category,Mother_edu_code,Father_edu_category,Father_edu_code
0,122.0,0,20,127.3,Dropout,1,19,12,Middle School Education (Completed),4,Middle School Education (Completed),4
1,160.0,0,19,142.5,Graduate,0,1,3,High School (Completed),5,Completed Bachelors,7
2,122.0,0,19,124.8,Dropout,0,37,37,Elementary School (Completed),3,Elementary School (Completed),3
3,122.0,0,20,119.6,Graduate,1,38,37,Some Level of Elementary School Education,2,Elementary School (Completed),3
4,100.0,0,45,141.5,Graduate,1,37,38,Elementary School (Completed),3,Some Level of Elementary School Education,2
...,...,...,...,...,...,...,...,...,...,...,...,...
4419,125.0,0,19,122.2,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
4420,120.0,1,18,119.0,Dropout,0,1,1,High School (Completed),5,High School (Completed),5
4421,154.0,0,30,149.5,Dropout,1,37,37,Elementary School (Completed),3,Elementary School (Completed),3
4422,180.0,0,20,153.8,Graduate,1,37,37,Elementary School (Completed),3,Elementary School (Completed),3


### Exploring Data

In [21]:
# 139 parents have an unknown level of education
# Students "enrolled" ~ 800

# Dataframe without the 'unknown' category
clean_sd_education = clean_sd[
    (clean_sd["Mother's qualification"] != 34) &
    (clean_sd["Father's qualification"] != 34)
]

clean_sd_education

Unnamed: 0,Previous qualification (grade),Debtor,Age at enrollment,Admission grade,Target,Tuition fees up to date,Mother's qualification,Father's qualification,Mother_edu_category,Mother_edu_code,Father_edu_category,Father_edu_code
0,122.0,0,20,127.3,Dropout,1,19,12,Middle School Education (Completed),4,Middle School Education (Completed),4
1,160.0,0,19,142.5,Graduate,0,1,3,High School (Completed),5,Completed Bachelors,7
2,122.0,0,19,124.8,Dropout,0,37,37,Elementary School (Completed),3,Elementary School (Completed),3
3,122.0,0,20,119.6,Graduate,1,38,37,Some Level of Elementary School Education,2,Elementary School (Completed),3
4,100.0,0,45,141.5,Graduate,1,37,38,Elementary School (Completed),3,Some Level of Elementary School Education,2
...,...,...,...,...,...,...,...,...,...,...,...,...
4419,125.0,0,19,122.2,Graduate,1,1,1,High School (Completed),5,High School (Completed),5
4420,120.0,1,18,119.0,Dropout,0,1,1,High School (Completed),5,High School (Completed),5
4421,154.0,0,30,149.5,Dropout,1,37,37,Elementary School (Completed),3,Elementary School (Completed),3
4422,180.0,0,20,153.8,Graduate,1,37,37,Elementary School (Completed),3,Elementary School (Completed),3


Previous qualification (grade) & Admission grade is from a scale of 0 - 200



Debtor = 1 means that yes they going into debt



Tuition fees up to date = 1 means that yes they are up to date