In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import chi2_contingency
import numpy as np
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from prettytable import PrettyTable

In [21]:
data = pd.read_csv('uczniowie.csv', sep=';')

X = data.drop('G3', axis=1)
y = data.G3

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.33, 
    random_state=308289
)

In [23]:
X_train.columns

Index(['failures', 'romantic', 'address', 'age', 'higher', 'goout',
       'traveltime', 'schoolsup', 'internet', 'Mjob', 'reason', 'guardian',
       'Fjob', 'sex', 'paid', 'studytime', 'health', 'Fedu'],
      dtype='object')

In [24]:
X_train['G3'] = y_train

In [25]:
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x = X_train.corr().columns,
        y = X_train.corr().index,
        z = np.array(X_train.corr()),
        text=X_train.corr().values,
        texttemplate='%{text:.2f}'
    )
)
fig.update_layout(title='Korelacja Pearsona')
fig.show()

In [26]:
fig1 = go.Figure()
fig1.add_trace(
    go.Heatmap(
        x = X_train.corr(method='spearman').columns,
        y = X_train.corr(method='spearman').index,
        z = np.array(X_train.corr()),
        text=X_train.corr(method='spearman').values,
        texttemplate='%{text:.2f}'
    )
)
fig1.update_layout(title='Korelacja Spearmana')
fig1.show()

In [27]:
X_train.drop('G3', axis=1, inplace=True)

In [28]:
info_class = mutual_info_classif(
    X_train,
    y_train, 
    random_state=308289
)

fig2 = go.Bar(name='Klasyfikacja',x=X_train.columns, y=info_class)

info_reg = mutual_info_regression(
    X_train,
    y_train, 
    random_state=308289
)

fig3 = go.Bar(name='Regresja', x=X_train.columns, y=info_reg)

In [29]:
fig4 = go.Bar(
    name='Średnia', 
    x=X_train.columns,
    y=np.mean(np.array([info_reg, info_class]), axis=0)
)

In [30]:
fig5 = go.Figure(
    data = [
        fig2,
        fig3,
        fig4
    ]
)
fig5.show()

In [31]:
def cramers_V(var1, var2):
  crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
  stat = chi2_contingency(crosstab)[0]
  obs = np.sum(crosstab)
  mini = min(crosstab.shape) - 1 
  return (stat / (obs * mini))

In [32]:
c = cramers_V(data['Fjob'], data['Fedu'])
c1 = cramers_V(data['age'], data['guardian'])
c2 = cramers_V(data['guardian'], data['failures'])
c3 = cramers_V(data['Fjob'], data['Mjob'])

In [33]:
test_chi = chi2_contingency(pd.crosstab(data['Fjob'], data['Fedu']))[1]
test_chi1 = chi2_contingency(pd.crosstab(data['age'], data['guardian']))[1]
test_chi2 = chi2_contingency(pd.crosstab(data['guardian'], data['failures']))[1]
test_chi3 = chi2_contingency(pd.crosstab(data['Fjob'], data['Mjob']))[1]

In [34]:
table = PrettyTable()

table.field_names = ["Para", "Korelacja Cramera V", "p-value testu niezależności"]

table.add_row(["Fjob i Fedu", c, test_chi])
table.add_row(["Age i Guardian", c1, test_chi1])
table.add_row(["Failures i Guardian", c2, test_chi2])
table.add_row(["Fjob i Mjob", c3, test_chi3])

In [35]:
print(table)

+---------------------+----------------------+-----------------------------+
|         Para        | Korelacja Cramera V  | p-value testu niezależności |
+---------------------+----------------------+-----------------------------+
|     Fjob i Fedu     | 0.08744933086034527  |    1.0808166541983247e-16   |
|    Age i Guardian   | 0.49139665481309813  |    6.963175358356027e-41    |
| Failures i Guardian | 0.12446677120599246  |    1.2034708134316784e-10   |
|     Fjob i Mjob     | 0.046443604603197894 |    2.533576541234461e-09    |
+---------------------+----------------------+-----------------------------+


In [36]:
data.drop(['schoolsup', 'sex', 'traveltime', 'health', 'guardian'], axis=1, inplace=True)

In [37]:
data

Unnamed: 0,failures,romantic,address,age,higher,goout,internet,Mjob,reason,Fjob,paid,studytime,G3,Fedu
0,0,0,0,18,1,4,0,3,2,0,0,1,6,4
1,0,0,0,17,1,3,1,3,2,4,0,1,6,1
2,3,0,0,15,1,2,1,3,3,4,1,1,10,1
3,0,1,0,15,1,2,1,1,0,2,1,0,15,2
4,0,0,0,16,1,2,0,4,0,4,1,1,10,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,2,0,0,19,1,4,0,2,2,2,1,1,9,2
391,0,0,0,17,1,5,1,2,2,2,0,1,16,1
392,3,0,1,19,1,3,0,4,2,4,0,1,7,1
393,0,0,1,18,1,1,1,2,2,4,0,1,10,2


In [38]:
#data.to_csv('dane_do_modelu.csv', sep=';', index=False)