# **Let's Dive into Student Performance 📈**

![picture](https://generationstudy.com/wp-content/uploads/2019/09/Study-Exams.jpeg)

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as figure_factory

# from dataprep.eda import plot
# from dataprep.eda.missing import plot_missing

In [None]:
# Notebook-wide seaborn look: 'darkgrid' style and an 11.7 x 8.27 inch default figure.
sns.set(style='darkgrid', rc={'figure.figsize': (11.7, 8.27)})

In [None]:
# Relative path of the exam-results CSV that the whole notebook works on.
DATA_PATH = r'../input/students-performance-in-exams/StudentsPerformance.csv'
students_data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded table.
students_data.head(n=5)

In [None]:
# List the column labels of the table.
students_data.columns

Check for missing values

In [None]:
# Number of missing entries in every column.
students_data.isna().sum()

In [None]:
# Turn the multi-word column labels into snake_case so they are easier to reference.
spaced_labels = ['parental level of education',
                 'test preparation course',
                 'math score',
                 'reading score',
                 'writing score']
snake_case_labels = {label: label.replace(' ', '_') for label in spaced_labels}

students_data.rename(columns=snake_case_labels, inplace=True)

In [None]:
# Preview the first five rows to check the column labels.
students_data.head(n=5)

In [None]:
# Overall result per student: the three subject scores added together.
math_plus_reading = students_data['math_score'].add(students_data['reading_score'])
students_data['total_score'] = math_plus_reading.add(students_data['writing_score'])

In [None]:
# Per-gender mean / average / median of every subject score.
# 'mean' and 'median' are given as string aliases: handing np.mean / np.median
# to GroupBy.agg is deprecated in pandas 2.x and emits a FutureWarning.
# np.average (unweighted, so numerically equal to the mean) is kept so the
# resulting table still has the same three sub-columns per subject.
score_summaries = ['mean', np.average, 'median']
subject_columns = ['math_score', 'reading_score', 'writing_score']

students_data.groupby(['gender'], as_index=False).agg({subject: score_summaries
                                                       for subject in subject_columns})

In [None]:
# Only the numeric score columns take part in the summary table below.
scores_frame = students_data[['math_score', 'reading_score', 'writing_score', 'total_score']]

# Every statistic is computed on scores_frame rather than on the whole table:
# students_data also holds text columns (gender, race/ethnicity, ...), and
# pandas >= 2.0 raises a TypeError when asked for their mean/median instead of
# silently dropping them.
pd.DataFrame({'Mean': scores_frame.mean(),
              'Median': scores_frame.median(),
              'Average': scores_frame.apply(np.average)})

Let us look at the basic statistics of the data, such as min, max, mean and standard deviation.

In [None]:
# Summary statistics of the numeric columns, reporting the three quartiles.
quartiles = [0.25, 0.5, 0.75]
students_data.describe(percentiles=quartiles)

Interquartile range

In [None]:
# Interquartile range (Q3 - Q1) of every numeric column.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# False, and the text columns of students_data would make quantile() raise.
quartile_bounds = students_data.quantile([0.25, 0.75], numeric_only=True)
quartile_bounds.loc[0.75] - quartile_bounds.loc[0.25]

In [None]:
# Preview the first five rows of the table.
students_data.head(n=5)

Let's start our EDA to learn some things about this data that will make you think :)

In [None]:
# Count the students per gender as a two-column frame: gender, value_counts.
gender = (students_data['gender']
          .value_counts()
          .rename_axis('gender')
          .reset_index(name='value_counts'))

# Share of each gender among all students.
px.pie(gender, names='gender', values='value_counts', title='Male and female percentage')

In [None]:
# Count the students per race/ethnicity group as a frame: groups, value_counts.
# The column names are set explicitly through rename_axis / reset_index(name=...)
# instead of renaming 'index' and 'race/ethnicity': value_counts().reset_index()
# names its columns differently before and after pandas 2.0, so a rename
# mapping tied to the old names leaves px.bar without the columns it asks for.
groups = (students_data['race/ethnicity']
          .value_counts()
          .rename_axis('groups')
          .reset_index(name='value_counts'))

px.bar(groups, x='groups', y='value_counts', color='value_counts', title='Number of students in groups')

In [None]:
# Demographic columns used for the head-count below.
students_data_info = students_data[['gender', 'race/ethnicity', 'parental_level_of_education']]

# Head-count for every (gender, group, parental education) combination.
# size() counts the rows of each combination directly, so no duplicated
# 'gender' column has to be written into a slice of students_data (which
# triggered SettingWithCopyWarning) just to have something to aggregate.
# Resulting columns: gender, group_name, parental_level_of_education, value_counts.
students_dist = (students_data_info
                 .groupby(['gender', 'race/ethnicity', 'parental_level_of_education'])
                 .size()
                 .reset_index(name='value_counts')
                 .rename(columns={'race/ethnicity': 'group_name'}))

fig = px.bar(students_dist, x='parental_level_of_education', y='value_counts', color='group_name', barmode='group', hover_data=students_dist.columns)
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>How many persons are in each group?</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

In [None]:
# Head-counts from students_dist again, this time with each bar coloured by
# its own value_counts instead of by group.
fig = px.bar(students_dist, x='parental_level_of_education', y='value_counts', color='value_counts', barmode='group', hover_data=students_dist.columns)
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>How many persons are in each group? (Second)</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

# And finally..

In [None]:
# Hierarchy chart: group -> gender -> parental level of education.
fig = px.sunburst(students_data, path=['race/ethnicity', 'gender', 'parental_level_of_education'])
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Parental level of education hierarchy</b>', title_font={'size': 24, 'family': 'Serif'})
fig.show()

In [None]:
# Columns needed to relate each group to its test-preparation status.
group_test_data = students_data[['race/ethnicity', 'test_preparation_course']]

# Head-count for every (group, test-preparation status) pair.
# size() counts rows directly, so no helper column has to be written into a
# slice of students_data (which triggered SettingWithCopyWarning).
# Resulting columns: group, test_preparation_course, value_counts.
students_test_info = (group_test_data
                      .groupby(['race/ethnicity', 'test_preparation_course'])
                      .size()
                      .reset_index(name='value_counts')
                      .rename(columns={'race/ethnicity': 'group'}))

# The chart is titled as a count of students who completed the course, so only
# those rows may feed it; summing every status per group would merely redraw
# the overall group sizes.
# NOTE(review): assumes finished courses are labelled 'completed' in the data - confirm.
completed_by_group = students_test_info[students_test_info['test_preparation_course'] == 'completed']

fig = px.pie(completed_by_group,
             names='group',
             values='value_counts',
             color='group',
             title='How many persons are completed a test in each group?')

fig.show()

## Distribution analysis

In [None]:
# Box plot of total_score for each parental education level, split by gender;
# points='all' draws every student next to the boxes.
fig = px.box(students_data,
             x='parental_level_of_education',
             y='total_score',
             color='gender',
             points='all',
             hover_data=students_data.columns)

# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Total_score & parental_level_of_education distribution</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

In [None]:
# Violin plot (with an inner box) of total_score for each parental education
# level, split by gender; points='all' draws every student as well.
fig = px.violin(students_data,
                x='parental_level_of_education',
                y='total_score',
                color='gender',
                box=True,
                points='all',
                hover_data=students_data.columns)

# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Total score & parental_level_of_education density</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

In [None]:
# Box plot of total_score per race/ethnicity group, split by gender.
box_options = {
    'x': 'race/ethnicity',
    'y': 'total_score',
    'color': 'gender',
    'points': 'all',  # draw every student next to the boxes
    'hover_data': students_data.columns,
    'title': 'total_score & groups distribution',
    'color_discrete_sequence': px.colors.sequential.Rainbow,
}
fig = px.box(students_data, **box_options)

fig.show()

As we can see, the groups can be ranked in descending order of total score (judging by Q1 and Q3 in each plot).

Therefore, by total score, E > D > C > B > A. We will confirm this later with the average scores per group.

In [None]:
# Violin plot (with an inner box) of total_score per race/ethnicity group, split by gender.
violin_options = {
    'x': 'race/ethnicity',
    'y': 'total_score',
    'color': 'gender',
    'box': True,
    'points': 'all',  # draw every student next to the violins
    'hover_data': students_data.columns,
    'title': 'total_score & groups density',
    'color_discrete_sequence': px.colors.sequential.Rainbow,
}
fig = px.violin(students_data, **violin_options)

fig.show()

In [None]:
# Every column whose label contains 'score' (the subject scores and the total).
scores = list(filter(lambda label: 'score' in label, students_data.columns))

# One histogram panel per score column, laid out side by side.
fig, axes = plt.subplots(1, len(scores), figsize=(20, 7))
fig.suptitle('Scores distribution')

# Each panel overlays the two genders and adds a KDE curve.
for axis, score in zip(axes, scores):
    sns.histplot(data=students_data, x=score, hue='gender', kde=True, ax=axis)

# Correlation

Since we will investigate the dependence of the scores on the other variables, and the scores can be treated as ordinal variables, it makes sense to use the `Spearman` correlation coefficient.

In [None]:
# Spearman rank correlation between the numeric (score) columns.
# The text columns are dropped first: since pandas 2.0 corr() no longer skips
# them silently and raises when it meets values such as 'female'.
students_data.select_dtypes(include='number').corr(method='spearman')

In [None]:
# Spearman rank correlation between the numeric (score) columns.
# The text columns are dropped first: since pandas 2.0 corr() no longer skips
# them silently and raises when it meets values such as 'female'.
correlation_matrix = students_data.select_dtypes(include='number').corr(method='spearman')
scores_names = [scores]  # not used by this cell; kept so the name stays defined

# Annotated heatmap: z rows follow the matrix index (y axis), z columns follow
# the matrix columns (x axis). The matrix is symmetric, so both label lists match.
fig = figure_factory.create_annotated_heatmap(x=list(correlation_matrix.columns.values),
                                              y=list(correlation_matrix.index.values),
                                              z=correlation_matrix.to_numpy().round(3))

# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Correlation matrix</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

In [None]:
# Pairwise scatter plots of the score columns, with histograms on the diagonal.
score_dataframe = students_data[scores]

fig = figure_factory.create_scatterplotmatrix(score_dataframe, diag='histogram', colormap='Viridis', height=700, width=1150)

# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Scatterplot matrix of scores</b>', title_font={'size': 24, 'family': 'Serif'})
fig.show()

Видим, в основном, сильную положительную корреляцию. Можем отметить наименьшую корреляцию предметов гуманитарных наук с точными (math). Оно то и понятно, без разъяснений :) \
\
В основном, можно сказать, рост любой из рассматриваемых переменных сказывается на росте любой другой. \
Из этого можем отметить, что если студент хорошо сдал один предмет, с большой вероятностью он хорошо сдаст и остальные два. Если рассматривать любое высшее учебное заведение, где предметов гораздо больше, то там ситуация будет интереснее. Там можно с большей уверенностью судить о предрасположенностях каждого. \
\
В нашем случае, высокая корреляция, полагаю, связана с тем, что студенты сдавали какое-то испытание, возможно, государственный экзамен, который завалить никак нельзя. Именно поэтому результаты по трем дисциплинам либо отличные, либо среднестатистические, либо низкие (студент был не готов к испытаниям -> видно из последнего графика). А как мы знаем, либо ты готов ко всему, либо не готов :)

--`English`-- \
We mainly see a strong positive correlation. The weakest correlation is between the humanities subjects and the exact one (math). That is understandable without explanation :) \
\
Basically, it can be said that an increase in any one of the considered variables goes along with an increase in any other. \
From this we can note that if a student did well in one subject, they most likely did well in the other two. If we considered a higher-education institution, where there are many more subjects, the situation would be more interesting: there one could judge each student's aptitudes with more confidence. \
\
In our case, the high correlation, I believe, is due to the fact that the students took some kind of test, perhaps a state exam, which they cannot afford to fail. That is why the results in the three disciplines are either excellent, or statistically average, or low (the student was not ready for the tests -> can be seen from the very last pie plot). And as we know, either you are ready for anything, or you are not ready :)

In [None]:
# Mean of every score column per race/ethnicity group.
# GroupBy.mean() is used instead of agg(np.average): the unweighted average is
# the mean, and the built-in reducer avoids handing a numpy callable to agg.
score_columns = ['math_score', 'reading_score', 'writing_score', 'total_score']
average_scores_by_group = students_data.groupby('race/ethnicity', as_index=False)[score_columns].mean()

# Reshape to long form (one row per group and score name) so the score name can drive the bar colour.
average_scores_by_group = average_scores_by_group.melt(id_vars=['race/ethnicity'],
                                                       value_vars=score_columns,
                                                       var_name='score_name', value_name='average_score')

fig = px.bar(average_scores_by_group, x='race/ethnicity', y='average_score', color='score_name')
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the title typo ('Averege') and the unclosed bold tag are fixed.
fig.update_layout(title_text='<b>Average students scores by groups</b>', title_font={'size': 24, 'family': 'Serif'})

fig.show()

You can see that the average result in every discipline is highest in group E and then decreases group by group: in every discipline, each group's average_score is lower than that of the group ranked just above it.

In [None]:
# The 12 rows with the highest math_score, ordered by math, reading, then writing score (descending).
m_largest = students_data.nlargest(12, 'math_score').sort_values(['math_score', 'reading_score', 'writing_score'], ascending=False)


fig = px.sunburst(m_largest, path=['race/ethnicity', 'gender', 'parental_level_of_education'])
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Parental level of education of 12 `successful` students</b>', title_font={'size': 24, 'family': 'Serif'})
fig.show()

In [None]:
# The 12 rows with the lowest math_score, ordered by math, reading, then writing score (descending).
m_smallest = students_data.nsmallest(12, 'math_score').sort_values(['math_score', 'reading_score', 'writing_score'], ascending=False)

fig = px.sunburst(m_smallest, path=['race/ethnicity', 'gender', 'parental_level_of_education'])
# title_font replaces the legacy layout.titlefont attribute, which recent
# plotly releases reject; the bold tag is now properly closed.
fig.update_layout(title_text='<b>Parental level of education of 12 `unsuccessful` students</b>', title_font={'size': 24, 'family': 'Serif'})
fig.show()