In [29]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import inflection
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.DataFrame(
    {"a":[4,5,6],
     "b":[7,8,9],
     "c":[10,11,12]},
    index = [1,2,3])

In [5]:
df

Unnamed: 0,a,b,c
1,4,7,10
2,5,8,11
3,6,9,12


In [19]:
df = pd.DataFrame(
    [[4,5,6],
     [7,8,9],
     [10,11,12]],
    index = [1,2,3],
    columns = ['a','b','c'])

In [20]:
df

Unnamed: 0,a,b,c
0,4,5,6
1,7,8,9
2,10,11,12


In [27]:
df = pd.DataFrame(
    {"a":[4,5,6],
     "b":[7,8,9],
     "c":[10,11,12]},
    index = pd.MultiIndex.from_tuples(
    [('d',1),('D',2),('e',2)], names = ['n','v']))

In [28]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
n,v,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
d,1,4,7,10
D,2,5,8,11
e,2,6,9,12


In [30]:
df_original = pd.read_csv("data/student_performance_data.csv")

In [31]:
df = df_original.copy(deep = True)

In [32]:
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [34]:
df[df.Age < 16]

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0
6,1007,15,0,1,1,15.601680,10,0,3,0,1,0,0,2.748237,2.0
7,1008,15,1,1,4,15.424496,22,1,1,1,0,0,0,1.360143,4.0
15,1016,15,0,0,2,9.728101,17,1,0,0,1,0,0,1.341521,4.0
21,1022,15,0,0,2,15.323142,25,0,1,1,0,0,0,0.346894,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367,3368,15,1,3,1,9.381207,18,0,3,0,0,1,0,1.454723,1.0
2368,3369,15,0,0,4,6.682728,3,0,2,0,0,1,1,3.158592,1.0
2378,3379,15,1,0,2,12.905555,26,0,2,0,0,1,1,0.709353,3.0
2381,3382,15,0,2,0,10.095086,5,0,3,0,0,0,0,2.956255,0.0


In [35]:
df.drop_duplicates()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150,4.0
2389,3390,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0


In [36]:
df.sample(frac=.5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
765,1766,17,0,0,1,12.820121,10,1,1,0,1,0,0,2.465306,3.0
1488,2489,15,1,1,0,11.083394,24,0,4,0,0,0,0,0.952675,4.0
2024,3025,15,0,0,1,18.912594,10,0,2,0,1,1,0,2.735673,2.0
1231,2232,18,1,2,2,4.830125,17,0,3,1,0,0,0,1.636403,4.0
1093,2094,17,0,1,2,13.920628,23,1,2,1,0,0,0,1.354327,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2150,3151,15,1,0,3,15.101364,15,0,2,1,1,0,1,2.525086,2.0
2147,3148,15,1,0,0,8.574503,16,0,3,0,0,0,0,1.718182,4.0
1567,2568,18,1,0,2,1.078652,8,0,1,0,0,0,0,1.663637,4.0
670,1671,15,1,1,2,16.643249,16,0,2,1,0,0,0,1.581931,4.0


In [37]:
df.sample(123)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
1891,2892,16,1,0,3,10.100561,14,1,1,0,0,0,0,1.914602,4.0
942,1943,18,0,2,1,3.434200,19,0,1,1,0,1,0,1.141639,4.0
1574,2575,18,0,0,3,12.194164,0,0,1,0,0,0,0,2.773484,2.0
1646,2647,17,1,1,2,2.543649,26,0,1,1,0,0,0,0.236497,4.0
1478,2479,16,1,0,1,7.429267,15,1,3,1,0,0,0,2.158610,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,1332,15,0,0,2,6.538842,8,0,3,0,1,0,0,2.384691,3.0
451,1452,16,0,3,0,8.808691,3,0,0,0,0,0,0,2.878927,2.0
1516,2517,17,0,1,2,5.276121,15,1,4,1,0,0,0,2.432896,3.0
459,1460,17,0,0,3,0.876937,3,1,2,0,1,1,0,3.004233,1.0


In [41]:
df.nlargest(10, "Absences")

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
18,1019,18,0,1,3,16.254658,29,0,2,1,0,0,1,0.469553,4.0
22,1023,16,1,1,0,18.64888,29,1,1,0,0,0,0,0.312546,4.0
54,1055,17,0,0,0,0.186206,29,1,2,1,1,0,0,0.756259,4.0
57,1058,15,0,0,4,10.877558,29,0,3,1,0,0,0,0.286571,4.0
95,1096,16,1,2,3,19.27832,29,0,3,0,0,0,1,0.732908,4.0
96,1097,16,1,1,1,12.831594,29,0,4,0,0,1,0,0.965482,4.0
104,1105,16,1,1,0,11.40625,29,0,3,1,0,0,0,0.648274,4.0
215,1216,15,0,0,2,0.493946,29,1,1,1,0,0,0,0.125933,4.0
322,1323,18,1,0,1,4.257002,29,1,3,1,1,0,0,0.912416,4.0
333,1334,17,1,0,2,18.608972,29,0,2,0,1,0,1,0.534836,4.0


In [43]:
df.nsmallest(10, "Absences")

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
5,1006,18,0,0,1,8.191219,0,0,1,1,0,0,0,3.084184,1.0
9,1010,16,1,0,1,18.444466,0,0,3,1,0,0,0,3.573474,0.0
91,1092,18,1,0,2,7.724728,0,0,2,1,1,0,0,3.50692,0.0
162,1163,18,0,0,2,13.084716,0,1,3,0,0,1,1,3.718587,0.0
196,1197,16,1,0,4,1.989925,0,0,1,1,0,0,1,3.117354,1.0
236,1237,16,1,0,2,5.082946,0,0,3,0,0,1,0,2.98392,2.0
256,1257,17,0,0,2,3.868945,0,1,3,0,0,0,0,3.156508,1.0
259,1260,17,0,0,2,17.934931,0,0,1,0,0,0,0,3.468581,1.0
363,1364,16,0,0,3,7.873638,0,1,4,0,0,1,0,3.88064,0.0


In [44]:
df.tail()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.27915,4.0
2389,3390,16,1,0,2,6.8055,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0
2391,3392,16,1,0,2,17.819907,13,0,2,0,0,0,1,2.140014,1.0


In [45]:
df[['Gender', 'Ethnicity','Tutoring']]

Unnamed: 0,Gender,Ethnicity,Tutoring
0,1,0,1
1,0,0,0
2,0,2,0
3,1,0,0
4,1,0,1
...,...,...,...
2387,1,0,0
2388,0,0,1
2389,1,0,0
2390,1,1,0


In [51]:
df.query('GPA > 3.5' and 'Tutoring == 1' and 'Music == 1') 

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
19,1020,17,0,0,1,10.835206,9,0,2,0,0,1,0,2.395784,3.0
25,1026,16,1,0,3,2.710337,5,0,4,0,0,1,0,2.977852,2.0
27,1028,16,1,0,3,2.252185,8,0,3,0,0,1,0,2.145205,3.0
45,1046,18,0,0,2,4.894312,7,0,4,0,0,1,0,2.515300,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2367,3368,15,1,3,1,9.381207,18,0,3,0,0,1,0,1.454723,1.0
2368,3369,15,0,0,4,6.682728,3,0,2,0,0,1,1,3.158592,1.0
2375,3376,18,0,0,2,18.925290,24,0,1,1,0,1,1,1.164539,0.0
2378,3379,15,1,0,2,12.905555,26,0,2,0,0,1,1,0.709353,3.0


In [52]:
df.shape

(2392, 15)

In [53]:
df.columns

Index(['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
       'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
       'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA',
       'GradeClass'],
      dtype='object')

In [54]:
df.describe()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [56]:
df.isnull().sum()

StudentID            0
Age                  0
Gender               0
Ethnicity            0
ParentalEducation    0
StudyTimeWeekly      0
Absences             0
Tutoring             0
ParentalSupport      0
Extracurricular      0
Sports               0
Music                0
Volunteering         0
GPA                  0
GradeClass           0
dtype: int64

In [57]:
df.corr()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
StudentID,1.0,-0.042255,-0.014625,-0.01299,-0.002307,0.026976,0.014841,-0.007834,0.003016,-0.003611,-0.020703,-0.005468,0.008011,-0.002697,-0.0985
Age,-0.042255,1.0,0.044895,-0.028473,0.025099,-0.0068,-0.011511,-0.012076,0.033197,-0.025061,-0.04632,-0.003492,0.013074,0.000275,-0.00625
Gender,-0.014625,0.044895,1.0,0.01601,0.006771,0.011469,0.021479,-0.031597,0.008065,-0.005964,-0.008897,0.007109,-0.0002,-0.01336,0.022998
Ethnicity,-0.01299,-0.028473,0.01601,1.0,0.033595,0.007184,-0.025712,-0.01744,0.020922,-0.008927,-0.004484,-0.014627,0.013468,0.02776,-0.023326
ParentalEducation,-0.002307,0.025099,0.006771,0.033595,1.0,-0.011051,0.036518,-0.01734,-0.017463,0.007479,0.002029,0.039439,0.01196,-0.035854,0.041031
StudyTimeWeekly,0.026976,-0.0068,0.011469,0.007184,-0.011051,1.0,0.009326,0.02893,0.0358,-0.02286,0.006836,0.007791,-0.016604,0.179275,-0.134131
Absences,0.014841,-0.011511,0.021479,-0.025712,0.036518,0.009326,1.0,-0.015534,0.002108,0.00036,0.041454,-0.008692,-0.018528,-0.919314,0.728633
Tutoring,-0.007834,-0.012076,-0.031597,-0.01744,-0.01734,0.02893,-0.015534,1.0,-0.000824,0.004865,0.006278,-0.011385,-0.050898,0.145119,-0.111695
ParentalSupport,0.003016,0.033197,0.008065,0.020922,-0.017463,0.0358,0.002108,-0.000824,1.0,-0.008381,-0.006176,0.035122,-0.006036,0.190774,-0.136823
Extracurricular,-0.003611,-0.025061,-0.005964,-0.008927,0.007479,-0.02286,0.00036,0.004865,-0.008381,1.0,-0.01182,-0.014191,-0.007427,0.094078,-0.069733


In [65]:
def highlight_above_threshold(val):
    moderate = .1
    strong = .6
    
    if val == 1:
        color = 'background-color: black'
    elif val > strong: 
        color = 'background-color: gold' 
    elif val > moderate: 
        color = 'background-color: yellow' 
    elif val < (strong * -1):
        color = 'background-color: orange'
    elif val < (moderate * -1): 
        color = 'background-color: levender'     
    else:
        color = ''
    return color


In [66]:
styled_correlation_matrix = df.corr().style.applymap(highlight_above_threshold)

In [67]:
styled_correlation_matrix

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
StudentID,1.0,-0.042255,-0.014625,-0.01299,-0.002307,0.026976,0.014841,-0.007834,0.003016,-0.003611,-0.020703,-0.005468,0.008011,-0.002697,-0.0985
Age,-0.042255,1.0,0.044895,-0.028473,0.025099,-0.0068,-0.011511,-0.012076,0.033197,-0.025061,-0.04632,-0.003492,0.013074,0.000275,-0.00625
Gender,-0.014625,0.044895,1.0,0.01601,0.006771,0.011469,0.021479,-0.031597,0.008065,-0.005964,-0.008897,0.007109,-0.0002,-0.01336,0.022998
Ethnicity,-0.01299,-0.028473,0.01601,1.0,0.033595,0.007184,-0.025712,-0.01744,0.020922,-0.008927,-0.004484,-0.014627,0.013468,0.02776,-0.023326
ParentalEducation,-0.002307,0.025099,0.006771,0.033595,1.0,-0.011051,0.036518,-0.01734,-0.017463,0.007479,0.002029,0.039439,0.01196,-0.035854,0.041031
StudyTimeWeekly,0.026976,-0.0068,0.011469,0.007184,-0.011051,1.0,0.009326,0.02893,0.0358,-0.02286,0.006836,0.007791,-0.016604,0.179275,-0.134131
Absences,0.014841,-0.011511,0.021479,-0.025712,0.036518,0.009326,1.0,-0.015534,0.002108,0.00036,0.041454,-0.008692,-0.018528,-0.919314,0.728633
Tutoring,-0.007834,-0.012076,-0.031597,-0.01744,-0.01734,0.02893,-0.015534,1.0,-0.000824,0.004865,0.006278,-0.011385,-0.050898,0.145119,-0.111695
ParentalSupport,0.003016,0.033197,0.008065,0.020922,-0.017463,0.0358,0.002108,-0.000824,1.0,-0.008381,-0.006176,0.035122,-0.006036,0.190774,-0.136823
Extracurricular,-0.003611,-0.025061,-0.005964,-0.008927,0.007479,-0.02286,0.00036,0.004865,-0.008381,1.0,-0.01182,-0.014191,-0.007427,0.094078,-0.069733


In [69]:
df.Age.value_counts()

15    630
16    593
17    587
18    582
Name: Age, dtype: int64