# Pre-processing
## Variabili numeriche
* Pulizia dei missing mediante valore medio
* Standardizzazione
* PCA (si mantengono le componenti che spiegano l'80% della varianza totale)



## Variabili categoriche
* Pulizia dei missing mediante valore più probabile
* One-hot encoding

# Selezione delle 6 feature più importanti (tra componenti PCA e colonne one-hot) mediante un'ANOVA (F-test)

In [1]:
import pandas as pd

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
from sklearn.decomposition import PCA

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

In [9]:
from sklearn.feature_selection import SelectKBest, f_classif

In [6]:
# Load the sample dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [7]:
# Separate the label column from the feature matrix.
y = df['target']
X = df.drop(columns='target')

In [17]:
# Dense one-hot encoder that works across scikit-learn versions: the `sparse`
# keyword was renamed to `sparse_output` in 1.2 and the old name was removed
# in 1.4, where `OneHotEncoder(sparse=False)` raises a TypeError.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 only accepts the old keyword
    one_hot_encoder = OneHotEncoder(sparse=False)

# Columns with these dtypes go to the categorical branch; every other column
# is handled by the numeric branch.
categorical_dtypes = ['object','category','bool']

# Numeric branch: fill missing values with the column mean, standardize,
# then apply PCA with n_components=0.8 (80% of total variance, per the
# notebook header).
numeric_pipeline = Pipeline([
    ('missing', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.8)),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
categorical_pipeline = Pipeline([
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoder', one_hot_encoder),
])

# Apply each branch to its own column subset and concatenate the results.
ct = ColumnTransformer([
    (
        'numeriche',
        numeric_pipeline,
        make_column_selector(dtype_exclude=categorical_dtypes),
    ),
    (
        'categoriche',
        categorical_pipeline,
        make_column_selector(dtype_include=categorical_dtypes),
    ),
])

In [18]:
# Keep the 6 features with the highest ANOVA F-score against the target.
feature_selector = SelectKBest(score_func=f_classif, k=6)

In [19]:
# Chain the per-dtype preprocessing with the feature selection step.
preprocessing_step = ('column_transformer', ct)
selection_step = ('selector', feature_selector)
pipeline = Pipeline(steps=[preprocessing_step, selection_step])

In [20]:
# Fit every step on (X, y) and show the selected features as the cell output.
pipeline.fit_transform(X, y=y)

array([[ 8.53523674e+00,  2.61250626e+00, -1.49226559e+00,
        -3.42792659e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.73321855e+00, -3.71166122e+00,  1.65927031e-04,
        -1.63242149e+00,  1.00000000e+00,  0.00000000e+00],
       [ 4.63667835e+00, -1.19132058e+00, -2.86584909e-01,
        -9.80911922e-01,  1.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.08873892e+00, -2.15427891e+00,  1.16899456e+00,
         2.09989618e+00,  1.00000000e+00,  0.00000000e+00],
       [ 1.01797333e+01,  5.22505519e-01, -2.43102774e+00,
         1.08556305e+00,  1.00000000e+00,  0.00000000e+00],
       [-5.31084528e+00, -2.63343718e-01,  1.39742089e+00,
         1.49304346e+00,  1.00000000e+00,  0.00000000e+00]])