In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [33]:
# Load the dataset from a CSV given by a relative path (resolved against the kernel's working directory)
df = pd.read_csv('Final-data.csv')

In [5]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0.1,Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Subject
0,0,GP,F,18,U,GT3,A,4,4,at_home,...,3,4,1,1,3,6,5,6,6,1
1,1,GP,F,17,U,GT3,T,1,1,at_home,...,3,3,1,1,3,4,5,5,6,1
2,2,GP,F,15,U,LE3,T,1,1,at_home,...,3,2,2,3,3,10,7,8,10,1
3,3,GP,F,15,U,GT3,T,4,2,health,...,2,2,1,1,5,2,15,14,15,1
4,4,GP,F,16,U,GT3,T,3,3,other,...,3,2,1,2,5,4,6,10,10,1


In [9]:
# (number of rows, number of columns)
df.shape


(1044, 35)

In [10]:
# Column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1044 entries, 0 to 1043
Data columns (total 35 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1044 non-null   int64 
 1   school      1044 non-null   object
 2   sex         1044 non-null   object
 3   age         1044 non-null   int64 
 4   address     1044 non-null   object
 5   famsize     1044 non-null   object
 6   Pstatus     1044 non-null   object
 7   Medu        1044 non-null   int64 
 8   Fedu        1044 non-null   int64 
 9   Mjob        1044 non-null   object
 10  Fjob        1044 non-null   object
 11  reason      1044 non-null   object
 12  guardian    1044 non-null   object
 13  traveltime  1044 non-null   int64 
 14  studytime   1044 non-null   int64 
 15  failures    1044 non-null   int64 
 16  schoolsup   1044 non-null   object
 17  famsup      1044 non-null   object
 18  paid        1044 non-null   object
 19  activities  1044 non-null   object
 20  nursery 

In [12]:
# Missing-value count for every column
print(df.isnull().sum())

Unnamed: 0    0
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
Subject       0
dtype: int64


# EDA for features 13 - 23

In [69]:
# Histogram of the final grade (G3) overlaid with its density curve
fig = ff.create_distplot(
    hist_data=[df["G3"]],
    group_labels=['Final Grade'],
    bin_size=0.5,
    colors=['indianred'],
    show_hist=True,
    show_curve=True,
)

# Chart title and axis captions
fig.update_layout(
    title='Distribution of Final Score',
    xaxis_title='Final Period Grades',
    yaxis_title='Density',
)

# Render the figure
fig.show()

In [78]:
# Mean final grade, marked on the histogram below
mean = df['G3'].mean()

# Histogram of final grades
fig = px.histogram(df, x='G3', nbins=20, opacity=0.75)

# Vertical line at the mean.
# yref='paper' makes y0/y1 fractions of the plot height (0 = bottom, 1 = top),
# so the line spans the whole plot. In the default data coordinates y1=1 would
# stop at a count of 1 and the marker would be practically invisible.
fig.add_shape(type='line', x0=mean, x1=mean, y0=0, y1=1, yref='paper',
              line=dict(color='red', width=10))

# Chart title and axis captions
fig.update_layout(title='Distribution of Final Score',
                  xaxis_title='Final Period Grades',
                  yaxis_title='Count')

# Render the figure
fig.show()

In [98]:
# Boxplot of the final grade (G3) to spot outliers
fig = px.box(df, y='G3')

# Chart title and axis captions (x axis intentionally left untitled)
fig.update_layout(
    yaxis_title='Final Period Grades',
    xaxis_title='',
    title='Distribution of Final Score',
)

# Render the figure
fig.show()

In [99]:
# Smallest final grade in the data.
# Stored under a name that says what it holds, so it does not overwrite the
# `mean` variable with a minimum.
min_grade = df['G3'].min()
min_grade

0

In [87]:
# Pairwise correlations between the three period grades
df_corr = pd.DataFrame(df.loc[:, ["G1", "G2", "G3"]])
corr_matrix = df_corr.corr()

# Heatmap of the correlation matrix; `text` carries the rounded values
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        text=corr_matrix.round(2).values,
        colorscale='Viridis',
    )
)

# Title, axis placement and value annotations in a single layout update.
# Annotations are placed by category position and only for cells whose column
# position is smaller than the row position (each pair once, diagonal skipped).
fig.update_layout(
    title='Correlation Heatmap for Grades',
    xaxis=dict(side='top'),
    yaxis=dict(title='Features'),
    annotations=[
        dict(
            x=col_pos,
            y=row_pos,
            text=str(corr_matrix.loc[row_name, col_name].round(2)),
            font=dict(color='white'),
            showarrow=False,
        )
        for row_pos, row_name in enumerate(corr_matrix.index)
        for col_pos, col_name in enumerate(corr_matrix.columns)
        if col_pos < row_pos
    ],
)

# Render the figure
fig.show()

In [13]:
# Number of rows per distinct traveltime value
df['traveltime'].value_counts()

1    623
2    320
3     77
4     24
Name: traveltime, dtype: int64

In [51]:
# Share of students (in percent) for each traveltime value
counts = df['traveltime'].value_counts(normalize=True).mul(100)
fig = px.bar(
    counts,
    x=counts.index,
    y=counts.values,
    text=counts.values,
    title='Distribution of Students by Travel Time',
    labels={'x': 'Travel Time', 'y': 'Percentage'},
)

# Print each percentage above its bar, two decimals
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')

# Caption for the x axis
fig.update_xaxes(title_font={'size': 14}, title_text='Travel Time')

# Render the figure
fig.show()

In [34]:
# Human-readable labels for the traveltime codes, written in ordinal order;
# that order is reused for the x axis below.
time_labels = {
    1: '<15 min.',
    2: '15 to 30 min.',
    3: '30 min. to 1 hour',
    4: '>1 hour'
}
# Relabel only while the column still holds the numeric codes. Mapping the
# already-labelled strings through the dict again would turn every value into
# NaN, so an unguarded re-run of this cell would wipe the column.
if pd.api.types.is_numeric_dtype(df['traveltime']):
    df['traveltime'] = df['traveltime'].map(time_labels)

# Mean final grade (G3) per travel-time category.
# groupby sorts the string labels alphabetically; put them back into the
# ordinal order of `time_labels` (skipping labels absent from the data).
mean = df.groupby('traveltime')['G3'].mean()
mean = mean.reindex([label for label in time_labels.values() if label in mean.index])

# Bar chart: traveltime vs. mean G3
fig = px.bar(mean,
             labels={'traveltime': 'Travel Time', 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text='Average Final Period Math Grade vs. Travel time',
                  yaxis_title="Average")

# Render the figure
fig.show()

In [52]:
# Share of students (in percent) for each studytime value
counts = df['studytime'].value_counts(normalize=True).mul(100)
fig = px.bar(
    counts,
    x=counts.index,
    y=counts.values,
    text=counts.values,
    title='Distribution of Students by Study Time',
    labels={'x': 'Study Time', 'y': 'Percentage'},
)

# Print each percentage above its bar, two decimals
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')

# Caption for the x axis
fig.update_xaxes(title_font={'size': 14}, title_text='Study Time')

# Render the figure
fig.show()

In [35]:
# Human-readable labels for the studytime codes, written in ordinal order;
# that order is reused for the x axis below.
studytime_labels = {
    1: '<2 hours',
    2: '2 to 5 hours',
    3: '5 to 10 hours',
    4: '>10 hours'
}
# Relabel only while the column still holds the numeric codes. Mapping the
# already-labelled strings through the dict again would turn every value into
# NaN, so an unguarded re-run of this cell would wipe the column.
if pd.api.types.is_numeric_dtype(df['studytime']):
    df['studytime'] = df['studytime'].map(studytime_labels)

# Mean final grade (G3) per study-time category.
# groupby sorts the string labels alphabetically; put them back into the
# ordinal order of `studytime_labels` (skipping labels absent from the data).
mean = df.groupby('studytime')['G3'].mean()
mean = mean.reindex([label for label in studytime_labels.values() if label in mean.index])

# Bar chart: studytime vs. mean G3
fig = px.bar(mean,
             labels={'studytime': 'Study Time', 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text='Average Final Period Math Grade vs. Study time',
                  yaxis_title="Average Final Score")

# Render the figure
fig.show()

In [31]:
# Number of rows per distinct studytime value
df["studytime"].value_counts()

2 to 5 hours     503
<2 hours         317
5 to 10 hours    162
>10 hours         62
Name: studytime, dtype: int64

In [53]:
# Share of students (in percent) for each value of 'failures'
counts = df['failures'].value_counts(normalize=True).mul(100)
fig = px.bar(
    counts,
    x=counts.index,
    y=counts.values,
    text=counts.values,
    title='Distribution of Students by Failures',
    labels={'x': 'Failures', 'y': 'Percentage'},
)

# Print each percentage above its bar, two decimals
fig.update_traces(textposition='outside', texttemplate='%{text:.2f}%')

# Caption for the x axis
fig.update_xaxes(title_font={'size': 14}, title_text='Failures')

# Render the figure
fig.show()

In [37]:
# Human-readable labels for the Fedu codes
fedu_labels = {
    0: 'None',
    1: '- primary education (4th grade)',
    2: '5th to 9th grade',
    3: 'secondary education',
    4: 'higher education'
}
# NOTE(review): `ses_df` is not created in the visible part of this notebook,
# so an unguarded relabel fails with NameError when the cell runs without it.
# Relabel only if the frame exists and Fedu still holds numeric codes (a second
# mapping of the labelled strings would turn them into NaN).
# TODO confirm whether this relabel belongs in this cell at all; it is
# unrelated to the failures chart below.
if 'ses_df' in globals() and pd.api.types.is_numeric_dtype(ses_df['Fedu']):
    ses_df['Fedu'] = ses_df['Fedu'].map(fedu_labels)

# Mean final grade (G3) for each value of 'failures'
mean = df.groupby('failures')['G3'].mean()

# Bar chart: failures vs. mean G3
fig = px.bar(mean,
             labels={'failures': 'Failures', 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text='Final Period Math Grade vs. Failures',
                  yaxis_title="Average Final Grade")

# Render the figure
fig.show()

In [41]:
# How many students fall into each 'schoolsup' value
counts = df['schoolsup'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students with extra educational support",
)
fig.show()

In [39]:
# Mean final grade (G3) for each 'schoolsup' value
mean = df.groupby('schoolsup')['G3'].mean()

# Bar chart: schoolsup vs. mean G3.
# The labels dict is keyed by names that exist in the data ('schoolsup', 'G3');
# the frame has no 'Final_G' column, and the other cells use 'G3'.
fig = px.bar(mean,
             labels={'schoolsup': "extra educational support", 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text="Mean Final Period Grade vs. extra educational support",
                  yaxis_title="Average Final Grade")

# Render the figure
fig.show()

In [44]:
# How many students fall into each 'famsup' value
counts = df['famsup'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students with family educational support",
)
fig.show()

In [43]:
# Mean final grade (G3) for each 'famsup' value
mean = df.groupby('famsup')['G3'].mean()

# Bar chart: famsup vs. mean G3.
# The labels dict is keyed by names that exist in the data ('famsup', 'G3');
# the frame has no 'Final_G' column, and the other cells use 'G3'.
fig = px.bar(mean,
             labels={'famsup': "family educational support", 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text="Mean Final Period Grade vs. family educational support",
                  yaxis_title="Average Final Grade")

# Render the figure
fig.show()

In [46]:
# How many students fall into each 'paid' value
counts = df['paid'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students with extra paid classes within the course subject",
)
fig.show()

In [45]:
# Mean final grade (G3) for each 'paid' value
mean = df.groupby('paid')['G3'].mean()

# Bar chart: paid vs. mean G3.
# The labels dict is keyed by names that exist in the data ('paid', 'G3');
# the frame has no 'Final_G' column, and the other cells use 'G3'.
fig = px.bar(mean,
             labels={'paid': "extra paid classes within the course subject ", 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text="Mean Final Period Grade vs. extra paid classes within the course subject ",
                  yaxis_title="Average Final Grade")

# Render the figure
fig.show()

In [48]:
# How many students fall into each 'activities' value
counts = df['activities'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students with extra-curricular activities",
)
fig.show()

In [47]:
# Mean final grade (G3) for each 'activities' value
mean = df.groupby('activities')['G3'].mean()

# Bar chart: activities vs. mean G3.
# The labels dict is keyed by names that exist in the data ('activities', 'G3');
# the frame has no 'Final_G' column, and the other cells use 'G3'.
fig = px.bar(mean,
             labels={'activities': "extra-curricular activities", 'G3': 'Final Period Grade'})

# Chart title and y-axis caption
fig.update_layout(title_text="Mean Final Period Grade vs. extra-curricular activities",
                  yaxis_title="Average Final Grade")

# Render the figure
fig.show()

In [55]:
# How many students fall into each 'nursery' value
counts = df['nursery'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students who attended nursery school",
)
fig.show()

In [56]:
# Mean final grade (G3) for each 'nursery' value
mean = df.groupby('nursery')['G3'].agg('mean')

# Bar chart: nursery vs. mean G3
fig = px.bar(
    mean,
    labels={'nursery': 'Attended nursery school ', 'G3': 'Final Period Grade'},
)

# Chart title and y-axis caption
fig.update_layout(
    yaxis_title="Average Final Grade",
    title_text='Average Final Period Math Grade vs. Attended nursery school',
)

# Render the figure
fig.show()

In [57]:
# How many students fall into each 'higher' value
counts = df['higher'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students who want to take higher education",
)
fig.show()

In [58]:
# Mean final grade (G3) for each 'higher' value
mean = df.groupby('higher')['G3'].agg('mean')

# Bar chart: higher vs. mean G3
fig = px.bar(
    mean,
    labels={'higher': 'Students who want to take higher education', 'G3': 'Final Period Grade'},
)

# Chart title and y-axis caption
fig.update_layout(
    yaxis_title="Average Final Grade",
    title_text='Average Final Period Math Grade vs. Students who want to take higher education',
)

# Render the figure
fig.show()

In [60]:
# How many students fall into each 'internet' value
counts = df['internet'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students with internet",
)
fig.show()

In [61]:
# Mean final grade (G3) for each 'internet' value
mean = df.groupby('internet')['G3'].agg('mean')

# Bar chart: internet vs. mean G3
fig = px.bar(
    mean,
    labels={'internet': 'Internet', 'G3': 'Final Period Grade'},
)

# Chart title and y-axis caption
fig.update_layout(
    yaxis_title="Average Final Grade",
    title_text='Average Final Period Math Grade vs. Students with internet',
)

# Render the figure
fig.show()

In [63]:
# How many students fall into each 'romantic' value
counts = df['romantic'].value_counts()

# Pie chart of those counts, with title and legend
fig = px.pie(counts, names=counts.index, values=counts.values).update_layout(
    showlegend=True,
    title="Distribution of Students in romantic relationship",
)
fig.show()

In [64]:
# Mean final grade (G3) for each 'romantic' value
mean = df.groupby('romantic')['G3'].agg('mean')

# Bar chart: romantic vs. mean G3
fig = px.bar(
    mean,
    labels={'romantic': 'Romantic', 'G3': 'Final Period Grade'},
)

# Chart title and y-axis caption
fig.update_layout(
    yaxis_title="Average Final Grade",
    title_text='Average Final Period Math Grade vs. Students in romantic relationships',
)

# Render the figure
fig.show()

In [89]:
# Summary statistics for age and the three grade columns
df[["age", "G1", "G2", "G3"]].describe()

Unnamed: 0,age,G1,G2,G3
count,1044.0,1044.0,1044.0,1044.0
mean,16.726054,11.213602,11.246169,11.341954
std,1.239975,2.983394,3.285071,3.864796
min,15.0,0.0,0.0,0.0
25%,16.0,9.0,9.0,10.0
50%,17.0,11.0,11.0,11.0
75%,18.0,13.0,13.0,14.0
max,22.0,19.0,19.0,20.0


In [93]:
# 3D scatter of the three grade columns, one point per row
fig = px.scatter_3d(
    df,
    x="G1",
    y="G2",
    z="G3",
    width=800,
    height=700,
    labels={
        "G1": "first period grade",
        "G2": "second period grade",
        "G3": "third period grade",
    },
)

# Four corners of a flat plane at the mean of G3, spanning the G1 and G2 ranges
plane_x = [pick(df['G1']) for pick in (min, max, max, min)]
plane_y = [pick(df['G2']) for pick in (min, min, max, max)]
plane_z = [df['G3'].mean() for _ in range(4)]

# Draw the plane as a semi-transparent red mesh
fig.add_trace(go.Mesh3d(x=plane_x, y=plane_y, z=plane_z, opacity=0.5, color='red'))

# Marker styling, applied only to traces drawn in "markers" mode
fig.update_traces(
    selector=dict(mode="markers"),
    marker=dict(size=4, line=dict(width=2, color="DarkSlateGrey")),
)

# Render the figure
fig.show()