In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Load the two CSV exports; the analysis cells below all read from `df`.
cl_dt = pd.read_csv("lpor_explorer.csv")
df = pd.read_csv("lpor_classification.csv")

# Per-gender subsets. The later cells filter "Gender" on the labels
# "Female"/"Male", so the same labels are used here instead of the string
# codes "1"/"0", which those cells never use.
# NOTE(review): assumes "Gender" holds the labels "Male"/"Female" as in the
# later cells - confirm against the CSV.
male_df = df.loc[df["Gender"] == "Male"]
female_df = df.loc[df["Gender"] == "Female"]

The dataset is rich in interesting information, so I asked myself several questions and looked for the answers in the data.

#What is the age of high school students?

In [175]:
# Notched box plot of age per gender; points="all" also draws every
# observation as an individual point.
fig1 = px.box(
    df,
    x="Gender",
    y="Age",
    color="Gender",
    points="all",
    notched=True,
)

# Style the markers of the individual observation points.
fig1.update_traces(marker=dict(size=4, opacity=0.7))

# Figure-level appearance.
fig1.update_layout(
    template='plotly_white',  # white layout template
    width=800,
    height=500,
    boxmode='group',  # boxes side by side
    showlegend=False,  # colour repeats what the x axis already shows
    title="Distribution of Age by Gender",
    xaxis_title="Gender",
    yaxis_title="Age",
)

# Render the figure.
fig1.show()

Not living in Portugal, I didn't know at what age Portuguese kids attend high school. The plot shows that most students start high school at 15 and finish at 18 or 19, while some boys and girls, presumably after repeating one or more years, finish as late as 21 or 22.

#Do families live more in urban or rural areas? Does family size influence the choice?

In [177]:
# Single-panel figure holding both housing-type histograms.
fig2 = make_subplots(rows=1, cols=1)

# Housing type for rows with "Family Size" == 1 (labelled as above 3 members).
fig2.add_trace(
    go.Histogram(
        name="Family Size Above 3",
        x=df.loc[df["Family Size"] == 1, "Housing Type"],
    ),
    row=1,
    col=1,
)

# Housing type for rows with "Family Size" == 0 (labelled as up to 3 members).
fig2.add_trace(
    go.Histogram(
        name="Family Size Up to 3",
        x=df.loc[df["Family Size"] == 0, "Housing Type"],
    ),
    row=1,
    col=1,
)

# Fix the left-to-right order of the x-axis categories.
fig2.update_xaxes(categoryarray=['Urban', 'Rural'], categoryorder='array')

# Titles, size and grouped bars.
fig2.update_layout(
    title="Housing Type Distribution by Family Size",
    legend_title="Family Size",
    xaxis_title="Areas of the city",
    yaxis_title="Count",
    barmode='group',
    width=700,
    height=600,
)

# Render the figure.
fig2.show()

As we can see, most families have more than 3 members and, in general, more families live in urban areas than in rural ones.

#Does the relationship between parents influence the size of the family?

In [179]:
# Single-panel figure comparing family size across the parents' situation.
fig3 = make_subplots(rows=1, cols=1)

# add_trace replaces the deprecated append_trace and matches the cells that
# already use add_trace.
# NOTE(review): here code 1 is labelled "Separated" and 0 "Living together";
# the daycare cell (fig10) labels the same codes the other way round -
# confirm the encoding of "Parents' Situation".
fig3.add_trace(
    go.Histogram(x=df.loc[df["Parents' Situation"] == 1]["Family Size"], name="Separated"),
    row=1, col=1,
)
fig3.add_trace(
    go.Histogram(x=df.loc[df["Parents' Situation"] == 0]["Family Size"], name="Living together"),
    row=1, col=1,
)

fig3.update_layout(
    height=600,
    width=700,
    barmode='group',
    title="Family Size by Parents' Situation",  # figure should stand alone when skimmed
    xaxis_title="Family size",
    yaxis_title="Count",
    legend_title="Relationship between parents",
)
# Fix the order of the two family-size codes on the x axis.
fig3.update_xaxes(categoryorder='array', categoryarray=[0, 1])
fig3.show()

From the graph we can see that 87.6% of students live with both parents. 74.5% of the families where the parents live together have at least 4 members, whereas separated families are split roughly evenly between large and small.

#What is the average level of education of the parents?

In [180]:
# Single-panel figure with one percent-normalised histogram per parent.
fig4 = make_subplots(rows=1, cols=1)

# add_trace replaces the deprecated append_trace and matches the cells that
# already use add_trace.
fig4.add_trace(
    go.Histogram(x=df["Mother's Education"], name="Mothers", histnorm="percent"),
    row=1, col=1,
)
fig4.add_trace(
    go.Histogram(x=df["Father's Education"], name="Fathers", histnorm="percent"),
    row=1, col=1,
)

# Updating layout
fig4.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Level of Education",
    yaxis_title="Percentage of Count",
    legend_title="Parents",
    title_text="Distribution of Parents' Education",
    showlegend=True,
    legend=dict(x=0.7, y=1),
)

# Customizing x-axis categories order
fig4.update_xaxes(
    categoryorder='array',
    categoryarray=['Primary School', 'Lower Secondary School', 'High School', 'Higher Education'],
)

# NOTE(review): neither histogram references a color axis, so this setting
# likely has no visible effect on the bars - confirm or remove.
fig4.update_layout(coloraxis=dict(colorscale='Viridis'))

# Show the plot
fig4.show()


As can be seen, the parents' educational level varies widely, although the largest group (32.2% of fathers and 28.6% of mothers) stopped at lower secondary school.

#Are there more educated mothers or fathers in families?

In [162]:
def level_of_education(level):
    """Map an education value to a numeric code from 1 to 4.

    Values equal to 1, 2 or 3 map to that code; every other value maps to 4.
    """
    for code in (1, 2, 3):
        if level == code:
            return code
    # Anything that is not 1, 2 or 3 falls into the top bucket.
    return 4

def more_smart(parent):
    """Map a parent's education value to a numeric code from 1 to 4.

    Values equal to 1, 2 or 3 map to that code; every other value maps to 4.

    The body previously compared an undefined name ``level`` instead of the
    ``parent`` argument, so any call raised NameError.
    """
    if parent == 1:
        return 1
    elif parent == 2:
        return 2
    elif parent == 3:
        return 3
    else:
        return 4

# Numeric education codes (1-4) for each parent, stored as new columns on df.
df["M_E_num"] = df["Mother's Education"].apply(level_of_education)
df["F_E_num"] = df["Father's Education"].apply(level_of_education)

def more_educated_parent(row):
    """Return which parent has the higher education code in ``row``.

    Reads ``row['M_E_num']`` and ``row['F_E_num']`` and returns 'Same' when
    they are equal, 'Mother' when the mother's code is greater, and 'Father'
    otherwise.
    """
    mother_code, father_code = row['M_E_num'], row['F_E_num']

    if mother_code == father_code:
        return 'Same'
    return 'Mother' if mother_code > father_code else 'Father'

# Create the new feature: for each row, which parent has the higher code.
df['More_Educated_Parent'] = df.apply(more_educated_parent, axis=1)

fig5 = px.histogram(df, x="More_Educated_Parent", histnorm="percent")
fig5.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="More educated parent",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
)
fig5.show()

I was curious to know which parent, mother or father, is the more educated one in each family. As I expected, most students (53%) have equally educated parents. Could this be because many parents met at school?

#Which parent is responsible for the child?

In [163]:
# Percent-normalised distribution of the "Legal Guardian" column.
fig6 = px.histogram(df, x="Legal Guardian", histnorm="percent")
fig6.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Parent responsible for child",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
)
fig6.show()


As expected, the majority (70%) of students have their mother as their legal guardian. Strangely, however, 41 students do not have either parent as their guardian. Could this be because the parents are separated?

In [164]:
# Students whose "Legal Guardian" value is "Other".
no_parent_as_legal = df.loc[df["Legal Guardian"] == "Other"]
# Share of each "Parents' Situation" value within that subset;
# normalize=True divides the counts by the subset size.
no_parent_as_legal["Parents' Situation"].value_counts(normalize=True)

Series([], Name: Parents' Situation, dtype: float64)

No. As we can see above, the majority (70%) of the students whose legal guardian is not a parent have parents who live together, so it remains unclear why neither parent is the legal guardian.

#Why did they choose this school? Are there gender differences on this?

In [165]:
# Single-panel figure: reason for choosing the school, one trace per gender,
# each normalised to percentages within its own group.
fig7 = make_subplots(rows=1, cols=1)

# add_trace replaces the deprecated append_trace and matches the cells that
# already use add_trace.
fig7.add_trace(
    go.Histogram(
        x=df.loc[df["Gender"] == "Female"]["Reason for Choosing School"],
        name="Female",
        histnorm="percent",
    ),
    row=1, col=1,
)
fig7.add_trace(
    go.Histogram(
        x=df.loc[df["Gender"] == "Male"]["Reason for Choosing School"],
        name="Male",
        histnorm="percent",
    ),
    row=1, col=1,
)

fig7.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Motivation",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
    legend_title="Gender",
)
# Fix the order of the reasons on the x axis.
fig7.update_xaxes(
    categoryorder='array',
    categoryarray=['Course Preference', 'Near Home', 'Reputation', 'Other'],
)
fig7.show()

High school is an important stage in a student's education and a first step towards work, so it is natural that the main reason students chose one school over another is course preference.

#Do girls or boys study more?

In [166]:
# Single-panel figure: weekly study time, one trace per gender, each
# normalised to percentages within its own group.
fig8 = make_subplots(rows=1, cols=1)

# add_trace replaces the deprecated append_trace and matches the cells that
# already use add_trace.
fig8.add_trace(
    go.Histogram(
        x=df.loc[df["Gender"] == "Female"]["Weekly Study Time"],
        name="Female",
        histnorm="percent",
    ),
    row=1, col=1,
)
fig8.add_trace(
    go.Histogram(
        x=df.loc[df["Gender"] == "Male"]["Weekly Study Time"],
        name="Male",
        histnorm="percent",
    ),
    row=1, col=1,
)

fig8.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Weekly study time",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
    legend_title="Gender",
)
# Order the study-time buckets from least to most hours.
fig8.update_xaxes(
    categoryorder='array',
    categoryarray=["Up to 2h", "2 to 5h", "5 to 10h", "More than 10h"],
)
fig8.show()

As with many things, it is tempting to ask which of the two genders studies more. In my personal experience, boys who study a lot are looked down on by their peers more than girls are, and as the graph shows, the data is consistent with that impression. Some interesting figures:

- 86% of male students study up to 5 hours a week, and 53% of them study at most 2 hours a week;
- among girls, 50% study from 2 to 5 hours and 20% study 5 to 10 hours a week.

#Does extra educational support push people to study more?

In [167]:
# Group sizes for the two values of "Extra Educational Support".
studentsWithExtraEducationalSupport = len(df.loc[df["Extra Educational Support"] == 1])
studentsWithoutExtraEducationalSupport = len(df.loc[df["Extra Educational Support"] == 0])
print(f"There are {studentsWithExtraEducationalSupport} students with extra educationl support and {studentsWithoutExtraEducationalSupport} who do not use the support")

# Create a subplot with a single row and a single column.
fig9 = make_subplots(rows=1, cols=1)

# Add one percent-normalised histogram per support group.
fig9.add_trace(
    go.Histogram(
        x=df.loc[df["Extra Educational Support"] == 1]["Weekly Study Time"],
        name="Support",
        histnorm="percent",
    ),
    row=1, col=1,
)
fig9.add_trace(
    go.Histogram(
        x=df.loc[df["Extra Educational Support"] == 0]["Weekly Study Time"],
        name="Not supported",
        histnorm="percent",
    ),
    row=1, col=1,
)

fig9.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Weekly study time",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
    # The traces split by support status, not by gender.
    legend_title="Extra educational support",
)

# Order the study-time buckets from least to most hours.
fig9.update_xaxes(
    categoryorder='array',
    categoryarray=["Up to 2h", "2 to 5h", "5 to 10h", "More than 10h"],
)
# Show the figure.
fig9.show()

There are 68 students with extra educationl support and 581 who do not use the support


There are 68 students with extra educationl support and 581 who do not use the support

We can see a marked difference between the first two groups of bars, which suggests that extra educational support helps many students study a few more hours a week.

#Is daycare also needed when parents live together?

In [174]:
# Single-panel figure: preschool attendance, one trace per parents' situation,
# each normalised to percentages within its own group.
fig10 = make_subplots(rows=1, cols=1)

# add_trace replaces the deprecated append_trace and matches the cells that
# already use add_trace.
# NOTE(review): here code 1 is labelled "Living Together" and 0 "Separated";
# the family-size cell (fig3) labels the same codes the other way round -
# confirm the encoding of "Parents' Situation" and align the two cells.
fig10.add_trace(
    go.Histogram(
        x=df.loc[df["Parents' Situation"] == 1]["Attended Preschool"],
        name="Living Together",
        histnorm="percent",
    ),
    row=1, col=1,
)
fig10.add_trace(
    go.Histogram(
        x=df.loc[df["Parents' Situation"] == 0]["Attended Preschool"],
        name="Separated",
        histnorm="percent",
    ),
    row=1, col=1,
)

fig10.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Have attended daycare? ",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
    legend_title="Relationship between parents",
)
# Show "Yes" before "No" on the x axis.
fig10.update_xaxes(categoryorder='array', categoryarray=['Yes', 'No'])
fig10.show()


We can see that the majority of students attended nursery school, and that attendance is slightly higher (4%) among students with separated parents.

#Does having internet at home increase study hours?

In [169]:
# Group sizes for the two values of "Has Internet Access".
studentsWithInternetAtHome = len(df.loc[df["Has Internet Access"] == 1])
studentsWithoutInternetAtHome = len(df.loc[df["Has Internet Access"] == 0])
print(f"There are {studentsWithInternetAtHome} students with internet at home and {studentsWithoutInternetAtHome} who do not have internet at home")

# Create a subplot with a single row and a single column.
fig11 = make_subplots(rows=1, cols=1)

# Add one percent-normalised histogram per internet-access group.
fig11.add_trace(
    go.Histogram(
        x=df.loc[df["Has Internet Access"] == 1]["Weekly Study Time"],
        name="Yes",
        histnorm="percent",
    ),
    row=1, col=1,
)
fig11.add_trace(
    go.Histogram(
        x=df.loc[df["Has Internet Access"] == 0]["Weekly Study Time"],
        name="No",
        histnorm="percent",
    ),
    row=1, col=1,
)

fig11.update_layout(
    height=600,
    width=700,
    barmode='group',
    xaxis_title="Weekly study time",
    # histnorm="percent" makes the bars percentages, not raw counts.
    yaxis_title="Percentage",
    legend_title="Do have internet at home?",
)
# Order the study-time buckets from least to most hours.
fig11.update_xaxes(
    categoryorder='array',
    categoryarray=["Up to 2h", "2 to 5h", "5 to 10h", "More than 10h"],
)
# Show the figure.
fig11.show()

There are 498 students with internet at home and 151 who do not have internet at home


There are 498 students with internet at home and 151 who do not have internet at home