## Handling Numerical Missing Values

In [1]:
# Parse a small in-memory CSV into a pandas DataFrame; the two empty
# fields in the text are read back as NaN (see the printed frame).

from io import StringIO

import pandas as pd

csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# On Python 2.7, uncomment the next line:
# csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
print(df)

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN


In [2]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

Note: scikit-learn accepts both pandas DataFrames and NumPy arrays as input, but its support for NumPy arrays is more mature, so NumPy arrays are the recommended choice. If you are more comfortable with pandas, you can use DataFrames as well; the underlying NumPy array of a DataFrame is always available through its `values` attribute.

### Deleting Missing Values

In [3]:
# Drop every row that contains at least one NaN value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [4]:
# Drop every column that contains at least one NaN value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


### Imputing Missing Values

In [5]:
# Mean imputation with scikit-learn: every NaN is replaced by the mean
# of the column it sits in.

import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(strategy='mean', missing_values=np.nan)
imr.fit(df.values)  # fit() returns the estimator itself, so no rebinding is needed
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [6]:
# pandas-only alternative: fill each NaN with the mean of its column.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


## Handling Categorical Data

In [7]:
# Build a small example DataFrame that mixes a nominal feature (color),
# an ordinal feature (size), a numeric feature (price) and a class label.
import pandas as pd

df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


In [8]:
# Encode the ordinal 'size' feature as integers that preserve its order
# (M < L < XL).
size_mapping = dict(XL=3, L=2, M=1)
df = df.assign(size=df['size'].map(size_mapping))
print(df)

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


In [9]:
# Build the inverse lookup (integer -> size string) and use it to recover
# the original size categories.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [10]:
# Assign each distinct class label an integer, numbered in the sorted
# order returned by np.unique.
class_mapping = dict(
    (label, idx)
    for idx, label in enumerate(np.unique(df['classlabel']))
)
print(class_mapping)

{'class1': 0, 'class2': 1}


In [11]:
# Replace the string class labels with their integer codes.
df = df.assign(classlabel=df['classlabel'].map(class_mapping))
print(df)

   color  size  price  classlabel
0  green     1   10.1           1
1    red     2   13.5           0
2   blue     3   15.3           1


In [12]:
# Invert the class mapping (integer -> label string) and restore the
# original string labels in the DataFrame.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df = df.assign(classlabel=df['classlabel'].map(inv_class_mapping))
print(df)

   color  size  price classlabel
0  green     1   10.1     class2
1    red     2   13.5     class1
2   blue     3   15.3     class2


In [13]:
# scikit-learn's LabelEncoder performs the same label -> integer encoding.
from sklearn.preprocessing import LabelEncoder

class_labels = df['classlabel'].values
class_le = LabelEncoder().fit(class_labels)
y = class_le.transform(class_labels)
print(y)

[1 0 1]


In [14]:
# Reverse the transformation: map the integer codes in y back to the
# original string class labels using the fitted encoder.
class_le.inverse_transform(y)

array(['class2', 'class1', 'class2'], dtype=object)

### One-Hot Encoding on Nominal Features

In [15]:
# Step one: turn the 'color' strings in column 0 into integer codes with
# LabelEncoder, writing the codes back into the feature array.
from sklearn.preprocessing import LabelEncoder

X = df.loc[:, ['color', 'size', 'price']].values
color_le = LabelEncoder()
color_le.fit(X[:, 0])
X[:, 0] = color_le.transform(X[:, 0])
print(X)

[[1 1 10.1]
 [2 2 13.5]
 [0 3 15.3]]


In [16]:
# One-hot encode the 'color' column on its own: OneHotEncoder expects a
# 2-D input, so column 0 is selected as an (n_samples, 1) array, and the
# encoder's result is converted to a dense array for printing.

from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
one_hot_encoded = (
    color_ohe
    .fit_transform(X[:, [0]])
    .toarray()
)
print(one_hot_encoded)

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [17]:
# Build a ColumnTransformer that one-hot encodes column 0 ('color') and
# forwards columns 1 and 2 ('size', 'price') untouched.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values

c_transf = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(), [0]),    # 'color' -> one-hot columns
    ('nothing', 'passthrough', [1, 2]),  # 'size' and 'price' kept as is
])

transformed_X = (
    c_transf
    .fit_transform(X)
    .astype(float)
)
print(transformed_X)

[[ 0.   1.   0.   1.  10.1]
 [ 0.   0.   1.   2.  13.5]
 [ 1.   0.   0.   3.  15.3]]


In [20]:
# Same ColumnTransformer setup, but the OneHotEncoder drops the first
# category of 'color' to avoid multicollinearity; 'size' and 'price'
# are again passed through unchanged.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

color_ohe = OneHotEncoder(drop='first', categories='auto')

c_transf = ColumnTransformer(transformers=[
    ('onehot', color_ohe, [0]),          # 'color' -> one-hot, first category dropped
    ('nothing', 'passthrough', [1, 2]),  # 'size' and 'price' kept as is
])

transformed_X = (
    c_transf
    .fit_transform(X)
    .astype(float)
)
print(transformed_X)

[[ 1.   0.   1.  10.1]
 [ 0.   1.   2.  13.5]
 [ 0.   0.   3.  15.3]]


In [18]:
# Use pandas get_dummies to one-hot encode the string-valued 'color' column.
# Only 'color' is encoded: 'price' is numeric and 'size' holds the integer
# codes assigned earlier, so both pass through unchanged (get_dummies would
# skip them by default anyway; `columns` just makes the intent explicit).

import pandas as pd

one_hot_df = pd.get_dummies(df[['price', 'color', 'size']], columns=['color'])
print(one_hot_df)

   price  size  color_blue  color_green  color_red
0   10.1     1       False         True      False
1   13.5     2       False        False       True
2   15.3     3        True        False      False


In [19]:
# Use pandas get_dummies to one-hot encode the string-valued 'color' column,
# dropping its first category ('blue') to avoid multicollinearity
# (the dummy variable trap). The numeric 'price' column and the
# integer-coded 'size' column are not encoded and pass through unchanged.

import pandas as pd

one_hot_df = pd.get_dummies(
    df[['price', 'color', 'size']],
    columns=['color'],
    drop_first=True,
)
print(one_hot_df)

   price  size  color_green  color_red
0   10.1     1         True      False
1   13.5     2        False       True
2   15.3     3        False      False


### Encoding Ordinal Features

In [21]:
# Recreate the example DataFrame (string-valued 'size' column included)
# with its categorical features, numeric price and class labels.

import pandas as pd

df = pd.DataFrame({
    'color': ['green', 'red', 'blue'],
    'size': ['M', 'L', 'XL'],
    'price': [10.1, 13.5, 15.3],
    'classlabel': ['class2', 'class1', 'class2'],
})
print(df)

   color size  price classlabel
0  green    M   10.1     class2
1    red    L   13.5     class1
2   blue   XL   15.3     class2


In [22]:
# Threshold-encode the ordinal 'size' feature as two binary indicator columns:
#   'x > M' -> 1 for sizes L and XL, 0 otherwise
#   'x > L' -> 1 for size XL only, 0 otherwise
# The original 'size' column is removed afterwards.

df['x > M'] = df['size'].isin(['L', 'XL']).astype('int64')
df['x > L'] = df['size'].eq('XL').astype('int64')
df = df.drop(columns=['size'])

print(df)

   color  price classlabel  x > M  x > L
0  green   10.1     class2      0      0
1    red   13.5     class1      1      0
2   blue   15.3     class2      1      1


## Partitioning Dataset into Training and Test Sets

In [24]:
# Load the Wine dataset from the UCI Machine Learning Repository.
# The raw file has no header row, so the column names are supplied here.

import numpy as np
import pandas as pd

df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol', 'Malic acid', 'Ash',
        'Alcalinity of ash', 'Magnesium', 'Total phenols',
        'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
        'Color intensity', 'Hue',
        'OD280/OD315 of diluted wines', 'Proline',
    ],
)

print('Class labels:', np.unique(df_wine['Class label']))
print(df_wine.head())


Class labels: [1 2 3]
   Class label  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0            1    14.23        1.71  2.43               15.6        127   
1            1    13.20        1.78  2.14               11.2        100   
2            1    13.16        2.36  2.67               18.6        101   
3            1    14.37        1.95  2.50               16.8        113   
4            1    13.24        2.59  2.87               21.0        118   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color intensity   Hue  OD280/OD315 of diluted wines  Proline  
0             5.64  1.04          

In [25]:
# Hold out 30% of the Wine samples as a test set. Stratifying on y keeps
# the class proportions the same in the training and test partitions.

from sklearn.model_selection import train_test_split

X = df_wine.iloc[:, 1:].values  # every column except the class label
y = df_wine.iloc[:, 0].values   # first column: the class label

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y,
)

## Bringing Features onto Same Scale (Normalization/Standardization)

In [None]:
# Min-max normalization: learn the scaling parameters from the training
# set only, then apply that same scaling to both partitions.

from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
mms.fit(X_train)                       # parameters come from training data only
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)    # test data reuses the training scaling

In [None]:
# Standardization: learn the scaling parameters from the training set
# only, then apply that same scaling to both partitions.

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
stdsc.fit(X_train)                      # parameters come from training data only
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)    # test data reuses the training scaling

In [27]:
# Standardization vs. normalization, computed by hand on a toy array.

import numpy as np

ex = np.arange(6)

# Standardization: subtract the mean and divide by the standard
# deviation, giving mean = 0 and std = 1.
print(
    'standardized:',
    (ex - ex.mean()) / ex.std(),
)

# Normalization (min-max scaling): rescale the values into [0, 1].
print(
    'normalized:',
    (ex - ex.min()) / (ex.max() - ex.min()),
)

standardized: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
normalized: [0.  0.2 0.4 0.6 0.8 1. ]


## Selecting Meaningful Features