# Objectives :
* Predict whether or not a student will pass the final exam based on the information provided about them
* Find out what most affects student achievement

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Math and Portuguese course tables; both files are read with ';' as
# the field separator.
df_math, df_por = (
    pd.read_csv(csv_name, sep=';')
    for csv_name in ('student-mat.csv', 'student-por.csv')
)

In [3]:
# Preview the first five rows of the Math dataset (5 is head()'s default).
df_math.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# Report the size of the Math DataFrame as a (rows, columns) tuple
n_rows, n_cols = df_math.shape
print((n_rows, n_cols))

(395, 33)


# Math dataset Analysis For 'Fedu' column

In [5]:
# Math dataset: dtype and non-null count of the father's-education column
fedu_math = df_math['Fedu']
fedu_math.info()
# The non-null count equals the number of entries, i.e. no nulls

<class 'pandas.core.series.Series'>
RangeIndex: 395 entries, 0 to 394
Series name: Fedu
Non-Null Count  Dtype
--------------  -----
395 non-null    int64
dtypes: int64(1)
memory usage: 3.2 KB


In [6]:
# List the distinct codes that occur in the 'Fedu' column
fedu_levels = df_math['Fedu'].unique()
print(fedu_levels)

[4 1 2 3 0]


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for 'Fedu'
fedu_summary = df_math['Fedu'].describe()
print(fedu_summary)

count    395.000000
mean       2.521519
std        1.088201
min        0.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        4.000000
Name: Fedu, dtype: float64


# Visualization

In [8]:
 # plot to visualize the count of students for each level of father's education:
import plotly.express as px

father_education_count = df_math['Fedu'].value_counts().sort_index()
fig = px.bar(x=father_education_count.index, y=father_education_count.values, labels={'x': "Father's Education Level", 'y': 'Count'},
             title="Count of Students based on Father's Education Level")
fig.show()

# Inference:
* The majority of students have fathers with a 'secondary education'. Overall, the bar plot provides a clear visual representation of the distribution of students based on their father's education level. It highlights the prevalence of students with fathers having higher education levels.

In [9]:
# Bar Plot: students per father's-education level, with levels in ascending order
father_education_count = (
    df_math['Fedu']
    .value_counts()
    .sort_index()
)

fig = px.bar(
    x=father_education_count.index,
    y=father_education_count.values,
    title="Count of Students based on Father's Education Level",
    labels={'x': "Father's Education Level", 'y': 'Count'},
)
fig.show()

In [10]:
# Box Plot: Math Final Grade vs. Father's Education Level
# When a data frame and column names are passed, `labels` must be keyed by the
# column names ('Fedu', 'G3'); keys 'x'/'y' would be ignored and the axes would
# show the raw column names instead.
fig = px.box(
    df_math,
    x='Fedu',
    y='G3',
    labels={'Fedu': "Father's Education Level", 'G3': 'Math Final Grade'},
    title="Math Final Grade vs. Father's Education Level",
)
fig.show()

# Inference:
* The median Math grade tends to be higher for students whose fathers have a higher education level (e.g., 'higher education') compared to those with lower education levels.

In [11]:
# Violin Plot: Distribution of Math Final Grade based on Father's Education Level
# box=True overlays a box plot inside each violin; points='all' also draws
# every individual student.
# `labels` is keyed by the column names so the axis titles are actually applied
# (keys 'x'/'y' are ignored when columns are referenced by name).
fig = px.violin(
    df_math,
    x='Fedu',
    y='G3',
    box=True,
    points='all',
    labels={'Fedu': "Father's Education Level", 'G3': 'Math Final Grade'},
    title="Distribution of Math Final Grade based on Father's Education Level",
)
fig.show()

# Inference:
* The width of the violins represents the density of the data points. Wider sections indicate higher data density, while narrower sections indicate lower data density.

* Outliers, represented by individual points outside the violins, can be observed in several groups, indicating students with exceptional or underperforming Math grades compared to their respective groups.

In [12]:
# Grouped Bar Plot: Average Math Final Grade based on Father's Education Level
# One row per 'Fedu' level holding the mean final grade; reset_index() turns
# the group key back into a regular column so it can be referenced by name.
father_education_grouped = df_math.groupby('Fedu')['G3'].mean().reset_index()

# `labels` is keyed by the column names ('Fedu', 'G3'); with keys 'x'/'y' the
# axis titles would be silently ignored.
fig = px.bar(
    father_education_grouped,
    x='Fedu',
    y='G3',
    labels={'Fedu': "Father's Education Level", 'G3': 'Average Math Final Grade'},
    title="Average Math Final Grade based on Father's Education Level",
)
fig.show()

# Inference:
* As the father's education level increases from the 'primary education' level, compare the bar heights to see how the average Math final grade changes.

In [13]:
#Create a sunburst chart to visualize the distribution of students' father's education level ('Fedu') based on their mother's education level ('Medu'):

In [14]:
# Sunburst: inner ring = mother's education ('Medu'), outer ring = father's
# education ('Fedu') within each mother's level.
# No `values` column is given, so each sector is sized by the number of
# students in it. Sizing by 'G3' would instead show summed grade points
# (dropping students whose G3 is 0), which is not the distribution the title
# describes.
fig = px.sunburst(
    df_math,
    path=['Medu', 'Fedu'],
    labels={'Medu': "Mother's Education Level", 'Fedu': "Father's Education Level"},
    title="Distribution of Father's Education Level based on Mother's Education Level",
)
fig.show()

In [15]:
# Scatter matrix of the pairwise relationships between 'studytime', 'G1', 'G2'
# and 'G3', with every point coloured by the father's education level.

fig = px.scatter_matrix(
    df_math,
    dimensions=['studytime', 'G1', 'G2', 'G3'],
    color='Fedu',
    title="Scatter Matrix of Study Time and Grades based on Father's Education Level",
    labels={'Fedu': "Father's Education Level"},
)
fig.show()

# Inference:
* 'G1' vs. 'G3': There is a clear positive correlation between the grades in the first period ('G1') and the final Math grades ('G3').

* 'G2' vs. 'G3': Similar to the previous plot, there is a strong positive correlation between the grades in the second period ('G2') and the final Math grades ('G3').

* 'studytime' vs. 'G1', 'G2', and 'G3': There are no clear linear relationships between study time and the grades in the first period ('G1'), second period ('G2'), or the final Math grades ('G3').

In [16]:
#Create a stacked bar plot to visualize the proportion of students who have extra educational support from the school ('schoolsup') and family educational support ('famsup') for each father's education level:

In [17]:
# Stacked bars of the final grade per father's-education level, split by
# school support and then by family support.
# `labels` must be keyed by column name ('Fedu', 'G3', ...); keys 'x'/'y' are
# ignored when columns are referenced by name, leaving the axes untitled.
# NOTE(review): the raw rows are plotted, so each bar appears to stack the G3
# of every student in that level (bar height = summed grades), not a proportion
# of students - confirm this is the intended measure.
fig = px.bar(
    df_math,
    x='Fedu',
    y='G3',
    color='schoolsup',
    barmode='stack',
    labels={'Fedu': "Father's Education Level", 'G3': 'Math Final Grade', 'schoolsup': 'School Support'},
    title="Math Final Grade based on Father's Education Level with School Support",
)
fig.show()

fig = px.bar(
    df_math,
    x='Fedu',
    y='G3',
    color='famsup',
    barmode='stack',
    labels={'Fedu': "Father's Education Level", 'G3': 'Math Final Grade', 'famsup': 'Family Support'},
    title="Math Final Grade based on Father's Education Level with Family Support",
)
fig.show()


# Inference:
* Similar to the school support trend, the proportion of students with family support tends to decrease as the father's education level increases.

In [18]:
# 3D scatter: absences (x) against father's education (y) and final grade (z);
# the father's education level also drives the marker colour.
fig = px.scatter_3d(
    df_math,
    x='absences',
    y='Fedu',
    z='G3',
    color='Fedu',
    title="3D Scatter Plot: Math Final Grade vs. School Absences and Father's Education Level",
    labels={
        'absences': 'School Absences',
        'Fedu': "Father's Education Level",
        'G3': 'Math Final Grade',
    },
)
fig.show()

# Inference:
* The relationship between school absences and Math final grades may vary across different levels of father's education. Within each level of father's education, we can identify specific patterns between school absences and Math final grades. Some clusters may show a more evident relationship between school absences and grades, while others may not exhibit a clear trend.

In [19]:
#3D Scatter Plot with Color Scale:
#Create a 3D scatter plot to visualize the relationship between the weekly study time ('studytime'), the final Math grade ('G3'), and the age of the students ('age'). Use a color scale to represent the age of the students.

In [20]:
import plotly.graph_objects as go

# One marker per student: study time on x, final grade on y, age on z.
# Age is also used for the marker colour and as the hover text.
fig = go.Figure(
    data=go.Scatter3d(
        x=df_math['studytime'],
        y=df_math['G3'],
        z=df_math['age'],
        mode='markers',
        marker={
            'size': 5,
            'color': df_math['age'],
            'colorscale': 'Viridis',  # any other Plotly colorscale name works here
            'opacity': 0.8,
        },
        text=df_math['age'],
        hoverinfo='text',
    )
)

# Axis titles live under the 3D `scene`; the figure title is a top-level layout key.
fig.update_layout(
    title='3D Scatter Plot: Weekly Study Time vs. Math Final Grade vs. Age',
    scene={
        'xaxis_title': 'Weekly Study Time',
        'yaxis_title': 'Math Final Grade',
        'zaxis_title': 'Age',
    },
)

fig.show()


# Inference:
* The density of data points is reflected in the plot. Higher data density in certain regions indicates a larger number of students with similar study time, Math grades, and age.

In [21]:
#Create a 3D surface plot to visualize the relationship between the number of school absences ('absences'), the weekly study time ('studytime'), and the Math final grades ('G3').

In [22]:
# go.Surface expects `z` as a 2D grid (rows follow y, columns follow x).
# The per-student columns are 1D, so aggregate them first: one row per
# study-time level, one column per absence count, each cell holding the mean
# final grade of the students with that combination. Combinations with no
# students are NaN in the grid.
grade_grid = df_math.pivot_table(
    index='studytime',
    columns='absences',
    values='G3',
    aggfunc='mean',
)

fig = go.Figure(data=[go.Surface(z=grade_grid.to_numpy(),
                                 x=grade_grid.columns.to_numpy(),
                                 y=grade_grid.index.to_numpy())])

fig.update_layout(
    scene=dict(
        xaxis_title='School Absences',
        yaxis_title='Weekly Study Time',
        zaxis_title='Math Final Grade',
    ),
    title='3D Surface Plot: Math Final Grade vs. School Absences vs. Weekly Study Time',
)

fig.show()


# Inference:
The range of Math final grades is visualized on the surface plot. Areas with higher peaks represent higher Math grades, while areas with lower peaks represent lower Math grades.