Per il preprocessing serve costruire una pipeline in cui:
- vengono gestiti i valori NaN;
- viene fatto l'encoding o la standardizzazione.

Pipeline diverse per tipi di dato diversi

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pandas.api.types import CategoricalDtype
from scipy.stats import chi2_contingency
import seaborn as sns


# Load the dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')

In [5]:
# Columns treated as binary: each one is recoded to 0/1 by `to_binary` below.
binary_cols = [
    'HLTHPLN1', 'MEDCOST', 'BLOODCHO', 'TOLDHI2', 'CVDINFR4',
    'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCSCNCR', 'CHCOCNCR',
    'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2', 'CHCKIDNY', 'VETERAN3',
    'INTERNET', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE',
    'DIFFWALK', 'DIFFDRES', 'DIFFALON', 'SMOKE100', 'EXERANY2',
    'FLUSHOT6', 'PNEUVAC3', 'HIVTST6', 'PERSDOC2', 'BPHIGH4',
]

# Columns treated as nominal: cast to an unordered 'category' dtype later on.
nominal_cols = ['_STATE', 'MARITAL', 'EMPLOY1', '_RACE', '_BMI5CAT']

# Columns treated as numeric: coerced with pd.to_numeric later on.
numeric_cols = [
    'PHYSHLTH', 'MENTHLTH', 'CHILDREN', 'NUMADULT_2',
    'ALCDAY5', 'FRUITJU1', 'NutritionScore', 'STRENGTH',
]
def to_binary(series, yes_value=1, no_value=2):
    """
    Recode a yes/no pandas Series into a nullable 0/1 integer Series.

    - entries equal to `yes_value` become 1
    - entries equal to `no_value` become 0
    - every other entry (including missing ones) becomes <NA>

    The result uses the nullable 'Int64' dtype so that missing entries can
    coexist with integer 0/1 values.
    """
    recode = {yes_value: 1, no_value: 0}
    # Series.map leaves anything absent from the mapping as missing.
    encoded = series.map(recode)
    return encoded.astype('Int64')

# Recode every binary column in place to 0/1 (nullable Int64).
for col in binary_cols:
    if col not in df.columns:
        print(f"Colonna `{col}` non trovata nel DataFrame. Operazione saltata.")
        continue
    # Idempotency guard: a column that already has the nullable Int64 dtype was
    # recoded by a previous run of this cell. Mapping it again with
    # {1: 1, 2: 0} would turn every 0 ("no") into <NA>, silently dropping data.
    # NOTE(review): assumes freshly loaded columns never arrive as Int64
    # (pd.read_csv yields int64/float64/object by default) - confirm.
    if isinstance(df[col].dtype, pd.Int64Dtype):
        continue
    df[col] = to_binary(df[col])

# Ordinal columns whose levels are listed in their raw code order.
ordinal_asc = {
    'GENHLTH': list(range(1, 6)),    # 1=Excellent ... 5=Poor
    'CHECKUP1': list(range(1, 9)),   # codes 1..8
    'CHOLCHK': list(range(1, 5)),    # codes 1..4
    '_AGE_G': list(range(1, 7)),     # 1=18-24 ... 6=65+
    '_PACAT1': list(range(1, 5)),    # 1=High active ... 4=Inactive
}

# Ordinal columns whose raw coding is "1 = worst, higher = better"; their
# levels are listed in reverse so the resulting category order is inverted.
ordinal_desc = {
    'EDUCA': list(range(6, 0, -1)),     # 1=None ... 6=Post-grad (reversed)
    # 1=Less than $10K ... top code=$75K or more. The original note named code 6
    # as the top level, but 8 levels are listed - TODO confirm against the codebook.
    'INCOME2': list(range(8, 0, -1)),
    '_SMOKER3': list(range(4, 0, -1)),  # 1=Current every day ... 4=Never
}

# 2) Nominal columns -> unordered 'category' dtype (the numeric codes are kept
#    as the category labels).
for col in nominal_cols:
    df[col] = df[col].astype('category')

# 3) Ordinal columns -> ordered 'category' dtype. The category order is exactly
#    the order of the listed levels; values not listed become missing.
#    ordinal_asc is processed before ordinal_desc, as two passes of one loop.
for level_map in (ordinal_asc, ordinal_desc):
    for col, cats in level_map.items():
        cat_type = CategoricalDtype(categories=cats, ordered=True)
        df[col] = df[col].astype(cat_type)

# Numeric columns: anything that cannot be parsed as a number becomes NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Print per-column dtypes and non-null counts to check the casts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321457 entries, 0 to 321456
Data columns (total 52 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   _STATE          321457 non-null  category
 1   GENHLTH         320707 non-null  category
 2   PHYSHLTH        315181 non-null  float64 
 3   MENTHLTH        316891 non-null  float64 
 4   HLTHPLN1        304968 non-null  Int64   
 5   PERSDOC2        287776 non-null  Int64   
 6   MEDCOST         27015 non-null   Int64   
 7   CHECKUP1        318370 non-null  category
 8   BPHIGH4         148310 non-null  Int64   
 9   BLOODCHO        321457 non-null  Int64   
 10  CHOLCHK         317268 non-null  category
 11  TOLDHI2         137499 non-null  Int64   
 12  CVDINFR4        21101 non-null   Int64   
 13  CVDCRHD4        21460 non-null   Int64   
 14  CVDSTRK3        14649 non-null   Int64   
 15  ASTHMA3         43177 non-null   Int64   
 16  CHCSCNCR        35369 non-null   Int64