In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import plotly.express as px
import warnings
import plotly.graph_objects as go
from scipy.stats import chi2_contingency

In [2]:
warnings.filterwarnings("ignore")

data = pd.read_csv('student-matA.csv', sep=';')

In [3]:
data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [4]:
data.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [5]:
data['G3'].value_counts()

10    56
11    47
0     38
15    33
8     32
13    31
12    31
9     28
14    27
16    16
6     15
18    12
7      9
5      7
17     6
19     5
20     1
4      1
Name: G3, dtype: int64

In [6]:
fig = px.bar(data['G3'].value_counts())

fig.update_layout(
    title="Liczebności zmiennej celu",
    xaxis_title="Ocena",
    yaxis_title="Ilość uczniów"
)
fig.show()

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [8]:
variable_to_remove = ['G1', 'G2']

binary_variables = ['schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

for variable in binary_variables:
    data[variable] = data[variable].map({'yes': 1, 'no': 0})

data['sex'] = data['sex'].map({'F': 0, 'M': 1})
data['famsize'] = data['famsize'].map({'LE3': 0, 'GT3': 1})
data['Pstatus'] = data['Pstatus'].map({'T': 0, 'A': 1})
data['address'] = data['address'].map({'U': 0, 'R': 1})
data['school'] = data['school'].map({'GP': 0, 'MS': 1})
data['guardian'] = data['guardian'].map({'mother': 0, 'father': 1, 'other': 2})
data['Mjob'] = data['Mjob'].map({'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4})
data['Fjob'] = data['Fjob'].map({'teacher': 0, 'health': 1, 'services': 2, 'at_home': 3, 'other': 4})
data['reason'] = data['reason'].map({'home': 0, 'reputation': 1, 'course': 2, 'other': 3})

In [9]:
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = data.corr().columns,
        y = data.corr().index,
        z = np.array(data.corr()),
        text=data.corr().values,
        texttemplate='%{text:.2f}'
    )
)
fig.show()

In [10]:
crosstab = pd.crosstab(data['Walc'], data['Dalc'])
crosstab1 = pd.crosstab(data['Fedu'], data['Medu'])

test_chi = chi2_contingency(crosstab)[1]
test_chi1 = chi2_contingency(crosstab1)[1]

print(test_chi, test_chi1)

1.2457721336029066e-51 8.014562451932377e-34


In [11]:
for column in data.columns:
    index = np.where(abs(data.corr()[column]) >= 0.6)[0]
    if len(index) > 2:
        print(f'Zmienna {column} jest skorelowana z {data.columns[index]}')

Zmienna G1 jest skorelowana z Index(['G1', 'G2', 'G3'], dtype='object')
Zmienna G2 jest skorelowana z Index(['G1', 'G2', 'G3'], dtype='object')
Zmienna G3 jest skorelowana z Index(['G1', 'G2', 'G3'], dtype='object')


In [12]:
abs(data.corr()["G3"]).sort_values(ascending=False)

G3            1.000000
G2            0.904868
G1            0.801468
failures      0.360415
Medu          0.217147
higher        0.182465
age           0.161579
Fedu          0.152457
Mjob          0.145827
goout         0.132791
romantic      0.129970
traveltime    0.117142
address       0.105756
sex           0.103456
paid          0.101996
internet      0.098483
studytime     0.097820
Fjob          0.091128
schoolsup     0.082788
famsize       0.081407
health        0.061335
Pstatus       0.058009
Dalc          0.054660
guardian      0.054193
Walc          0.051939
nursery       0.051568
famrel        0.051363
school        0.045017
famsup        0.039157
absences      0.034247
activities    0.016100
freetime      0.011307
reason        0.008502
Name: G3, dtype: float64

In [13]:
from sklearn.feature_selection import mutual_info_classif
from operator import itemgetter

info = mutual_info_classif(
    X=data.drop(['G1', 'G2', 'G3'], axis=1),
    y=data.G3, 
    random_state=308289
).tolist()

sorted(zip(info, data.columns), key=itemgetter(0), reverse=True)

[(0.24628557129011686, 'higher'),
 (0.20403295154085122, 'internet'),
 (0.20076136648761356, 'nursery'),
 (0.1892907945574711, 'Walc'),
 (0.17501816440279594, 'Dalc'),
 (0.17382969808549698, 'absences'),
 (0.15455698298556042, 'famsup'),
 (0.1174285636006771, 'famsize'),
 (0.10358211083568136, 'traveltime'),
 (0.08841220681924833, 'paid'),
 (0.08493822041745247, 'failures'),
 (0.07188048863927143, 'Mjob'),
 (0.06150357021941, 'activities'),
 (0.05802008135420067, 'romantic'),
 (0.052954237304925655, 'freetime'),
 (0.04290897526539883, 'schoolsup'),
 (0.03742479229763651, 'reason'),
 (0.0367195714351114, 'sex'),
 (0.030595255230388307, 'Fjob'),
 (0.0230140081578063, 'guardian'),
 (0.016546468328907604, 'school'),
 (0.01234683474217002, 'goout'),
 (0.00998683734336181, 'Fedu'),
 (0.006155414489574351, 'age'),
 (0.004168873417390806, 'studytime'),
 (0.0, 'address'),
 (0.0, 'Pstatus'),
 (0.0, 'Medu'),
 (0.0, 'famrel'),
 (0.0, 'health')]

In [14]:
data.groupby(['school'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
school,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.489971,4.625397,349
1,9.847826,4.237229,46


In [15]:
px.box(data, x='school', y='G3')

In [16]:
variable_to_remove.append('school')

In [17]:
data.groupby(['sex'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,9.966346,4.622338,208
1,10.914439,4.495297,187


In [18]:
px.box(data, x='sex', y='G3')

In [19]:
data.groupby(['age'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
15,11.256098,4.596861,82
16,11.028846,4.282402,104
17,10.27551,4.290437,98
18,9.54878,4.9942,82
19,8.208333,4.606037,24
20,14.0,4.582576,3
21,7.0,,1
22,8.0,,1


In [20]:
data['age'].replace({22:19, 21:19, 20:19}, inplace=True)

In [21]:
data.groupby(['age'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
15,11.256098,4.596861,82
16,11.028846,4.282402,104
17,10.27551,4.290437,98
18,9.54878,4.9942,82
19,8.758621,4.718165,29


In [22]:
px.box(data, x='age', y='G3')

In [23]:
data.groupby(['address'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
address,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.674267,4.563075,307
1,9.511364,4.556149,88


In [24]:
px.box(data, x='address', y='G3')

In [25]:
data.groupby(['famsize'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
famsize,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,11.0,4.225921,114
1,10.177936,4.70452,281


In [26]:
px.box(data, x='famsize', y='G3')

In [27]:
variable_to_remove.append('famsize')

In [28]:
data.groupby(['Pstatus'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
Pstatus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.324859,4.611121,354
1,11.195122,4.290801,41


In [29]:
px.box(data, x='Pstatus', y='G3')

In [30]:
variable_to_remove.append('Pstatus')

In [31]:
data.groupby(['Fedu'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
Fedu,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,13.0,4.242641,2
1,9.158537,4.563596,82
2,10.26087,4.733396,115
3,10.66,4.149285,100
4,11.364583,4.665934,96


In [32]:
data['Fedu'].replace({0:4}, inplace=True)

In [33]:
px.box(data, x='Fedu', y='G3')

In [34]:
data.groupby(['Mjob'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
Mjob,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,11.051724,4.398654,58
1,12.147059,4.22928,34
2,11.019417,4.758853,103
3,9.152542,4.733642,59
4,9.822695,4.359364,141


In [35]:
px.box(data, x='Mjob', y='G3')

In [36]:
data.groupby(['Fjob'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
Fjob,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,11.965517,5.486885,29
1,11.611111,3.238353,18
2,10.297297,4.477407,111
3,10.15,5.323978,20
4,10.193548,4.511236,217


In [37]:
px.box(data, x='Fjob', y='G3')

In [38]:
data.groupby(['reason'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
reason,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.256881,4.643601,109
1,11.142857,4.2617,105
2,9.82069,4.878623,145
3,11.166667,3.760699,36


In [39]:
px.box(data, x='reason', y='G3')

In [40]:
data.groupby(['guardian'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
guardian,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.483516,4.597064,273
1,10.688889,4.463697,90
2,9.0625,4.689986,32


In [41]:
px.box(data, x='guardian', y='G3')

In [42]:
data['guardian'].replace({1:0, 2:1}, inplace=True)

In [43]:
data.groupby(['traveltime'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
traveltime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,10.782101,4.523289,257
2,9.906542,4.600108,107
3,9.26087,5.074154,23
4,8.75,3.918819,8


In [44]:
px.box(data, x='traveltime', y='G3')

In [45]:
data['traveltime'].replace({4:3}, inplace=True)

In [46]:
data.groupby(['traveltime'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
traveltime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,10.782101,4.523289,257
2,9.906542,4.600108,107
3,9.129032,4.745116,31


In [47]:
data.groupby(['studytime'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
studytime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,10.047619,4.956311,105
2,10.171717,4.217537,198
3,11.4,4.639504,65
4,11.259259,5.281263,27


In [48]:
px.box(data, x='studytime', y='G3')

In [49]:
data['studytime'].replace({2:1, 3:0, 4:0}, inplace=True)

In [50]:
data.groupby(['studytime'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
studytime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,11.358696,4.807461,92
1,10.128713,4.479527,303


In [51]:
data.groupby(['failures'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
failures,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,11.253205,4.169633,312
1,8.12,4.710561,50
2,6.235294,4.841609,17
3,5.6875,4.190764,16


In [52]:
px.box(data, x='failures', y='G3')

In [53]:
data.groupby(['schoolsup'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
schoolsup,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.561047,4.769533,344
1,9.431373,2.865344,51


In [54]:
px.box(data, x='schoolsup', y='G3')

In [55]:
data.groupby(['famsup'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
famsup,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.640523,4.636262,153
1,10.272727,4.550318,242


In [56]:
px.box(data, x='famsup', y='G3')

In [57]:
variable_to_remove.append('famsup')

In [58]:
data.groupby(['paid'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
paid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,9.985981,5.12609,214
1,10.922652,3.791011,181


In [59]:
px.box(data, x='paid', y='G3')

In [60]:
data.groupby(['activities'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
activities,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.340206,4.488065,194
1,10.487562,4.679861,201


In [61]:
px.box(data, x='activities', y='G3')

In [62]:
variable_to_remove.append('activities')

In [63]:
data.groupby(['nursery'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
nursery,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,9.950617,4.560431,81
1,10.535032,4.586449,314


In [64]:
px.box(data, x='nursery', y='G3')

In [65]:
variable_to_remove.append('nursery')

In [66]:
data.groupby(['higher'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
higher,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,6.8,4.829732,20
1,10.608,4.493422,375


In [67]:
px.box(data, x='higher', y='G3')

In [68]:
data.groupby(['internet'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
internet,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,9.409091,4.485797,66
1,10.617021,4.580494,329


In [69]:
px.box(data, x='internet', y='G3')

In [70]:
data.groupby(['romantic'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
romantic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.836502,4.385946,263
1,9.575758,4.856916,132


In [71]:
px.box(data, x='romantic', y='G3')

In [72]:
data.groupby(['famrel'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
famrel,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,10.625,4.838462,8
2,9.888889,5.550717,18
3,10.044118,4.647046,68
4,10.358974,4.395916,195
5,10.830189,4.733813,106


In [73]:
px.box(data, x='famrel', y='G3')

In [74]:
variable_to_remove.append('famrel')

In [75]:
data.groupby(['freetime'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
freetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,9.842105,4.752346,19
2,11.5625,4.219663,64
3,9.783439,4.79492,157
4,10.426087,4.330757,115
5,11.3,4.619912,40


In [76]:
px.box(data, x='freetime', y='G3')

In [77]:
variable_to_remove.append('freetime')

In [78]:
data.groupby(['goout'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
goout,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,9.869565,5.336873,23
2,11.194175,4.535391,103
3,10.961538,4.210367,130
4,9.651163,4.421252,86
5,9.037736,5.072408,53


In [79]:
px.box(data, x='goout', y='G3')

In [80]:
data.groupby(['Walc'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
Walc,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,10.735099,5.133812,151
2,10.082353,4.950257,85
3,10.725,3.700753,80
4,9.686275,3.619338,51
5,10.142857,4.12503,28


In [81]:
px.box(data, x='Walc', y='G3')

In [82]:
variable_to_remove.append('Walc')

In [83]:
data.groupby(['health'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
health,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,11.87234,4.351996,47
2,10.222222,5.497474,45
3,10.010989,4.183286,91
4,10.106061,4.871041,66
5,10.39726,4.41702,146


In [84]:
px.box(data, x='health', y='G3')

In [85]:
data['health'].replace({1:0, 2:1, 3:1, 4:2, 5:2}, inplace=True)

In [86]:
data['absences'].value_counts()

0     115
2      65
4      53
6      31
8      22
10     17
14     12
12     12
3       8
16      7
7       7
5       5
18      5
20      4
11      3
9       3
13      3
15      3
22      3
1       3
38      1
30      1
40      1
23      1
19      1
28      1
75      1
21      1
24      1
56      1
26      1
54      1
25      1
17      1
Name: absences, dtype: int64

In [87]:
from collections import Counter

df = pd.DataFrame(
    Counter(data['absences']).items(), 
    columns=[
        'Liczba nieobecności', 
        'Liczba nieobecnych'
    ]
)

In [88]:
px.histogram(df, x='Liczba nieobecności', y='Liczba nieobecnych')

In [89]:
absences= data["absences"]
data["absences_bin"] = pd.qcut(absences, 2, labels=[0, 1])
data['absences_cat'] = pd.qcut(absences, 3, labels=[0, 1, 2])

In [90]:
data.groupby(['absences_bin'], as_index=True).agg({'G3': ['mean', 'std', 'count']})

Unnamed: 0_level_0,G3,G3,G3
Unnamed: 0_level_1,mean,std,count
absences_bin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,10.139344,5.194672,244
1,10.860927,3.330745,151


In [91]:
px.box(data, x='absences_bin', y='G3')

In [92]:
px.box(data, x='absences_cat', y='G3')

In [93]:
variable_to_remove.extend(['Unnamed: 0','absences', 'absences_bin', 'Dalc', 'Medu', 'absences_cat'])

In [94]:
data = data[list(set(data.columns) - set(variable_to_remove))]

#data.to_csv('uczniowie.csv', sep=';', index=False)

In [95]:
variable_to_remove

['G1',
 'G2',
 'school',
 'famsize',
 'Pstatus',
 'famsup',
 'activities',
 'nursery',
 'famrel',
 'freetime',
 'Walc',
 'Unnamed: 0',
 'absences',
 'absences_bin',
 'Dalc',
 'Medu',
 'absences_cat']