# Operare su dati qualitativi

In [21]:
import pandas as pd

CSV_URL = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/shirts.csv"

shirts = pd.read_csv(CSV_URL, index_col=0)
shirts.head()

Unnamed: 0,taglia,colore,prezzo
0,S,bianco,4.99
1,M,bianco,19.99
2,XL,bianco,12.49
3,XL,bianco,14.99
4,S,bianco,14.99


## Ordinal encoding delle variabili ordinali

#### Pandas

In [3]:
size_mapping = {"S":0,"M":1,"L":2,"XL":3} #dizionario che ordina le misure
shirts["taglia"] = shirts["taglia"].map(size_mapping) #mappiamo la misura con il numero corrispondente
shirts.head()

Unnamed: 0,taglia,colore,prezzo
0,0,bianco,4.99
1,1,bianco,19.99
2,3,bianco,12.49
3,3,bianco,14.99
4,0,bianco,14.99


#### Numpy

In [4]:
import numpy as np

shirts = pd.read_csv(CSV_URL,index_col=0)
X = shirts.values

size_mapping = {"S":0,"M":1,"L":2,"XL":3} #dizionario che ordina le misure
fmap = np.vectorize(lambda t:size_mapping[t])
X[:,0] = fmap(X[:,0])
X[:5]

array([[0, 'bianco', 4.99],
       [1, 'bianco', 19.99],
       [3, 'bianco', 12.49],
       [3, 'bianco', 14.99],
       [0, 'bianco', 14.99]], dtype=object)

## One-hot encoding

#### Scikit-learn

In [10]:
from sklearn.preprocessing import OneHotEncoder

X = [["bianco"], ["rosso"], ["bianco"], ["blu"], ["rosso"], ["verde"]]

enc = OneHotEncoder()
X_sparse = enc.fit_transform(X)
X = X_sparse.toarray()
X

array([[1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [15]:
enc.categories_

[array(['bianco', 'blu', 'rosso', 'verde'], dtype=object)]

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

X = shirts.values 
transf = ColumnTransformer([('ohe', OneHotEncoder(), [1])], remainder="passthrough")

X = transf.fit_transform(X)
X 

array([[1.0, 0.0, 0.0, 'S', 4.99],
       [1.0, 0.0, 0.0, 'M', 19.99],
       [1.0, 0.0, 0.0, 'XL', 12.49],
       [1.0, 0.0, 0.0, 'XL', 14.99],
       [1.0, 0.0, 0.0, 'S', 14.99],
       [0.0, 0.0, 1.0, 'S', 7.99],
       [0.0, 0.0, 1.0, 'M', 4.99],
       [0.0, 0.0, 1.0, 'L', 12.49],
       [1.0, 0.0, 0.0, 'XL', 12.49],
       [0.0, 0.0, 1.0, 'M', 19.99],
       [1.0, 0.0, 0.0, 'L', 14.99],
       [1.0, 0.0, 0.0, 'XL', 19.99],
       [1.0, 0.0, 0.0, 'M', 4.99],
       [1.0, 0.0, 0.0, 'L', 7.99],
       [1.0, 0.0, 0.0, 'M', 14.99],
       [0.0, 1.0, 0.0, 'XL', 9.99],
       [0.0, 1.0, 0.0, 'S', 12.49],
       [1.0, 0.0, 0.0, 'L', 7.99],
       [1.0, 0.0, 0.0, 'XL', 4.99],
       [0.0, 0.0, 1.0, 'M', 14.99],
       [0.0, 0.0, 1.0, 'S', 14.99],
       [1.0, 0.0, 0.0, 'XL', 7.99],
       [0.0, 0.0, 1.0, 'S', 9.99],
       [1.0, 0.0, 0.0, 'XL', 14.99],
       [0.0, 1.0, 0.0, 'S', 14.99],
       [1.0, 0.0, 0.0, 'XL', 9.99],
       [0.0, 0.0, 1.0, 'M', 7.99],
       [1.0, 0.0, 0.0, 'XL', 4.

### Pandas

In [22]:
shirts = pd.get_dummies(shirts,columns=["colore"]) # prefix="col",  prefix_sep='-''
shirts.head()

Unnamed: 0,taglia,prezzo,colore_bianco,colore_rosso,colore_verde
0,S,4.99,1,0,0
1,M,19.99,1,0,0
2,XL,12.49,1,0,0
3,XL,14.99,1,0,0
4,S,14.99,1,0,0


## Label encoding per la variabile target

In [45]:
CSV_URL = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/main/datasets/shirts_sold.csv"

shirts = pd.read_csv(CSV_URL, index_col=0)
shirts.head()

Unnamed: 0,taglia,colore,prezzo,venduta
0,S,bianco,4.99,NO
1,M,bianco,19.99,SI
2,XL,bianco,12.49,NO
3,XL,bianco,14.99,NO
4,S,bianco,14.99,SI


In [48]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
shirts["venduta"] = le.fit_transform(shirts["venduta"])
shirts.head()

Unnamed: 0,taglia,colore,prezzo,venduta
0,S,bianco,4.99,0
1,M,bianco,19.99,1
2,XL,bianco,12.49,0
3,XL,bianco,14.99,0
4,S,bianco,14.99,1
