In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [4]:
# Load the math-course records; the file is semicolon-delimited.
df_mat = pd.read_csv("data/student-mat.csv", sep=";")
df_mat.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [7]:
# Load the Portuguese-course records; the file is semicolon-delimited.
df_por = pd.read_csv("data/student-por.csv", sep=";")
df_por.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


# EDA

## 📄 UCI Student Performance Dataset

### 🔢 Attributes for both `student-mat.csv` (Math course) and `student-por.csv` (Portuguese course):

1. **school** - student's school (binary): `"GP"` (Gabriel Pereira) or `"MS"` (Mousinho da Silveira)  
2. **sex** - student's sex (binary): `"F"` (female) or `"M"` (male)  
3. **age** - student's age (numeric): from 15 to 22  
4. **address** - home address type (binary): `"U"` (urban) or `"R"` (rural)  
5. **famsize** - family size (binary): `"LE3"` (≤3) or `"GT3"` (>3)  
6. **Pstatus** - parents' cohabitation status (binary): `"T"` (together) or `"A"` (apart)  
7. **Medu** - mother's education (numeric):  
   - 0: none  
   - 1: primary education (4th grade)  
   - 2: 5th to 9th grade  
   - 3: secondary education  
   - 4: higher education  
8. **Fedu** - father's education (same scale as Medu)  
9. **Mjob** - mother's job (nominal): `"teacher"`, `"health"`, `"services"`, `"at_home"`, `"other"`  
10. **Fjob** - father's job (same as Mjob)  
11. **reason** - reason for choosing this school (nominal): `"home"`, `"reputation"`, `"course"`, `"other"`  
12. **guardian** - student's guardian (nominal): `"mother"`, `"father"`, `"other"`  
13. **traveltime** - home to school travel time (numeric):  
    - 1: <15 min  
    - 2: 15–30 min  
    - 3: 30 min–1 hour  
    - 4: >1 hour  
14. **studytime** - weekly study time (numeric):  
    - 1: <2 hours  
    - 2: 2–5 hours  
    - 3: 5–10 hours  
    - 4: >10 hours  
15. **failures** - number of past class failures (numeric): 0–3, 4 = 4+ failures  
16. **schoolsup** - extra educational support (binary): `"yes"` or `"no"`  
17. **famsup** - family educational support (binary): `"yes"` or `"no"`  
18. **paid** - extra paid classes (binary): `"yes"` or `"no"`  
19. **activities** - extra-curricular activities (binary): `"yes"` or `"no"`  
20. **nursery** - attended nursery school (binary): `"yes"` or `"no"`  
21. **higher** - wants to pursue higher education (binary): `"yes"` or `"no"`  
22. **internet** - internet access at home (binary): `"yes"` or `"no"`  
23. **romantic** - in a romantic relationship (binary): `"yes"` or `"no"`  
24. **famrel** - family relationship quality (numeric): 1 (very bad) to 5 (excellent)  
25. **freetime** - free time after school (numeric): 1 (very low) to 5 (very high)  
26. **goout** - going out with friends (numeric): 1 (very low) to 5 (very high)  
27. **Dalc** - workday alcohol consumption (numeric): 1 (very low) to 5 (very high)  
28. **Walc** - weekend alcohol consumption (numeric): 1 (very low) to 5 (very high)  
29. **health** - current health status (numeric): 1 (very bad) to 5 (very good)  
30. **absences** - number of school absences (numeric): from 0 to 93  

### 🎯 Grade Attributes:
31. **G1** - First period grade (0–20)  
32. **G2** - Second period grade (0–20)  
33. **G3** - Final grade (0–20) → **Target variable**

---

⚠️ Note:  
There are several (382) students common to both datasets.  
These can be identified via identical attributes, as outlined in the dataset's accompanying R script.



In [13]:
# Column names, non-null counts and dtypes of the math dataset.
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [14]:
# Column names, non-null counts and dtypes of the Portuguese dataset.
df_por.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649 entries, 0 to 648
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      649 non-null    object
 1   sex         649 non-null    object
 2   age         649 non-null    int64 
 3   address     649 non-null    object
 4   famsize     649 non-null    object
 5   Pstatus     649 non-null    object
 6   Medu        649 non-null    int64 
 7   Fedu        649 non-null    int64 
 8   Mjob        649 non-null    object
 9   Fjob        649 non-null    object
 10  reason      649 non-null    object
 11  guardian    649 non-null    object
 12  traveltime  649 non-null    int64 
 13  studytime   649 non-null    int64 
 14  failures    649 non-null    int64 
 15  schoolsup   649 non-null    object
 16  famsup      649 non-null    object
 17  paid        649 non-null    object
 18  activities  649 non-null    object
 19  nursery     649 non-null    object
 20  higher    

In [15]:
# Compare (rows, columns) of the math and Portuguese frames side by side.
(df_mat.shape, df_por.shape)

((395, 33), (649, 33))

# For now, let's use only the math dataset and predict the final math grade (G3)

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_mat.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [20]:
# NOTE(review): with string (object) columns still in the frame this call raises
# ValueError (see the output below). pandas' `numeric_only=True` option would
# restrict the correlation to the numeric columns — confirm against the
# installed pandas version before relying on it.
df_mat.corr()

ValueError: could not convert string to float: 'GP'

# `corr()` fails because of the string columns, so let's first convert the strings to numeric codes using pandas categoricals

In [22]:
# Report which columns are still stored as raw strings (object dtype).
for label, content in df_mat.items():
    if not pd.api.types.is_object_dtype(content):
        continue  # already numeric (or otherwise non-object): nothing to report
    print(label)

school
sex
address
famsize
Pstatus
Mjob
Fjob
reason
guardian
schoolsup
famsup
paid
activities
nursery
higher
internet
romantic


In [23]:
# Re-encode every string (object-dtype) column of df_mat as an ordered pandas
# categorical, so each label is backed by an integer code (see `.cat.codes`).
# The frame is updated in place, column by column.
for label, content in df_mat.items():
    if not pd.api.types.is_object_dtype(content):
        continue  # leave non-string columns untouched
    as_category = content.astype("category")
    df_mat[label] = as_category.cat.as_ordered()


In [24]:
# The preview looks unchanged: categorical columns still display their string labels.
df_mat.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [25]:
# Integer codes backing the `school` categorical (one code per distinct label).
df_mat["school"].cat.codes

0      0
1      0
2      0
3      0
4      0
      ..
390    1
391    1
392    1
393    1
394    1
Length: 395, dtype: int8

In [27]:
# Integer codes backing the `activities` categorical (one code per distinct label).
df_mat["activities"].cat.codes

0      0
1      0
2      0
3      1
4      0
      ..
390    0
391    0
392    0
393    0
394    0
Length: 395, dtype: int8

In [28]:
# Re-check the dtypes now that the string columns have been converted to categoricals.
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   school      395 non-null    category
 1   sex         395 non-null    category
 2   age         395 non-null    int64   
 3   address     395 non-null    category
 4   famsize     395 non-null    category
 5   Pstatus     395 non-null    category
 6   Medu        395 non-null    int64   
 7   Fedu        395 non-null    int64   
 8   Mjob        395 non-null    category
 9   Fjob        395 non-null    category
 10  reason      395 non-null    category
 11  guardian    395 non-null    category
 12  traveltime  395 non-null    int64   
 13  studytime   395 non-null    int64   
 14  failures    395 non-null    int64   
 15  schoolsup   395 non-null    category
 16  famsup      395 non-null    category
 17  paid        395 non-null    category
 18  activities  395 non-null    category
 19  nursery 

# Note that the `object` columns have now changed to the `category` dtype

In [29]:
# Count missing values per column.
df_mat.isna().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64