# Consult IT - preliminary elimination
## Team members (*Fratelli*):
1. Wiktor Jakubowski
2. Luca Nowosielski

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, KBinsDiscretizer
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Load the student database and discard the 'Imię' and 'L.P.' columns, which
# are not fed to the preprocessing pipeline. The drop is chained and
# non-mutating, so `data` is always built fresh from the file when this cell
# runs and no intermediate frame is modified in place.
data = (
    pd.read_excel('Baza_uczniów_CIT_2024.xlsx')
    .drop(columns=['Imię', 'L.P.'])
)

In [3]:
# Count missing values per column - there are none (every count is 0).
data.isnull().sum(axis=0)

Płeć                     0
Wiek                     0
Język                    0
Poziom                   0
Hobby                    0
Typ prowadzenia zajęć    0
dtype: int64

In [4]:
### Ordered / binary categorical columns: encode categories as integer codes.
### Categories not seen during fit are mapped to -1 instead of raising.
# NOTE(review): no explicit `categories` are passed, so OrdinalEncoder uses its
# default category ordering - confirm this matches the intended order of 'Poziom'.
ord_pipeline = Pipeline([
    ('encode', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
])

### Unordered categorical columns: one-hot encode; categories not seen during
### fit are ignored rather than raising.
cat_pipeline = Pipeline([
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

### Numerical columns: discretize into 3 bins ('uniform' strategy) and return
### the bin index as an ordinal value.
num_pipeline = Pipeline([
    ('discretize', KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=3)),
])

# Column transformer: route each group of columns to its pipeline.
# Columns that are not listed here are dropped from the output.
col_trans = ColumnTransformer(
    [
        ('ord_pipeline', ord_pipeline, ['Płeć', 'Typ prowadzenia zajęć', 'Poziom']),
        ('cat_pipeline', cat_pipeline, ['Język', 'Hobby']),
        ('num_pipeline', num_pipeline, ['Wiek']),
    ],
    n_jobs=-1,
    remainder='drop',
)

# Model pipeline. Only the preprocessing step is active for now; the
# hierarchical clustering step intended for the end of the pipeline (limit as
# desired number of people in the group) is still commented out.
model_pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    # ('clustering', AgglomerativeClustering(metric='euclidean', linkage='ward'))
])

In [5]:
# Fit the preprocessing pipeline on the student data and transform it.
data_preprocessed = model_pipeline.fit_transform(data)

# Convert the result to a dense numpy array.
# ColumnTransformer only returns a scipy sparse matrix when the stacked output
# is sparse enough (controlled by its `sparse_threshold`, 0.3 by default);
# otherwise it already returns a dense ndarray, which has no `.toarray()`.
# Densify only when needed so this cell works in both cases.
decompressed_data = (
    data_preprocessed.toarray()
    if hasattr(data_preprocessed, 'toarray')
    else data_preprocessed
)
