# Chapter 4: Handling Numerical Data

In [1]:
import numpy as np
from sklearn import preprocessing

## 4.1 Rescaling a Feature

In [2]:
# Single feature stored as a column vector (5 observations x 1 feature),
# the 2-D layout that scikit-learn transformers expect.
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

Create scaler:

In [3]:
# Min-max scaler targeting the [0, 1] interval: after fitting, the smallest
# observed value maps to 0 and the largest to 1.
minmaxScale = preprocessing.MinMaxScaler(feature_range=(0,1))

In [4]:
type(minmaxScale)

sklearn.preprocessing._data.MinMaxScaler

In [5]:
# Learn the min and max of `feature` and rescale it in a single call.
scaledFeature = minmaxScale.fit_transform(feature)

In [6]:
scaledFeature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [7]:
# Manual check, step 1: the feature's range, max - min = 900.9 - (-500.5).
900.9+500.5

1401.4

In [8]:
# Manual check, step 2: distance of the second value from the minimum,
# x - min = -100.1 - (-500.5).
500.5-100.1

400.4

In [9]:
# Manual check, step 3: (x - min) / (max - min); this should equal the second
# entry of scaledFeature.
400.4/1401.4

0.2857142857142857

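Got it: min-max scaling computes $x' = \frac{x - \min(x)}{\max(x) - \min(x)}$. For the second observation this is $(-100.1 + 500.5) / (900.9 + 500.5) = 400.4 / 1401.4 \approx 0.2857$, which matches the scaler's output.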

## 4.2 Standardizing a Feature

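Rescale the feature to have a mean of 0 and a standard deviation of 1 (the *z-score*). Standardizing only shifts and rescales the values; it does not change the shape of the distribution, so it does not make the data normally distributed.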

In [10]:
# Column vector of five observations; the last value is far larger than the rest.
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

Create scaler:

In [11]:
scaler = preprocessing.StandardScaler()

Transform the feature:

In [12]:
# Fit the scaler on x and return the standardized (z-score) values.
standardized = scaler.fit_transform(x)

In [13]:
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [14]:
# Sanity check: the standardized feature should have mean ~0 and std dev 1.
# The mean is rounded because floating-point error leaves it slightly off zero.
print("Mean:", round(np.mean(standardized)))
print("Standard deviation:", np.std(standardized))

Mean: 0
Standard deviation: 1.0


In [15]:
# Alternative to StandardScaler when outliers are present: with default
# settings RobustScaler centres on the median and scales by the interquartile
# range, so the extreme value has less influence on the other observations.
robustScaler = preprocessing.RobustScaler()

In [16]:
robustScaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## 4.3 Normalizing Observations

In [17]:
from sklearn.preprocessing import Normalizer

In [18]:
# Feature matrix: 5 observations (rows) x 2 features (columns),
# assembled column by column.
features = np.column_stack((
    [0.5, 1.1, 1.5, 1.63, 10.9],
    [0.5, 3.4, 20.2, 34.4, 3.3],
))

In [19]:
normalizer = Normalizer(norm="l2")

In [20]:
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

Transform feature matrix:

In [21]:
# Rescale every observation (row) to unit Euclidean (L2) length.
featuresl2Norm = Normalizer(norm="l2").transform(features)

In [22]:
featuresl2Norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [23]:
# Rescale every observation (row) so that its absolute values sum to 1 (L1 norm).
featuresl1Norm = Normalizer(norm="l1").transform(features)

In [24]:
featuresl1Norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [25]:
# With the L1 norm the values of each observation add up to 1; check row 0.
print("Sum of the first observation\'s values:", featuresl1Norm[0].sum())

Sum of the first observation's values: 1.0


## 4.4 Generating Polynomial and Interaction Features

In [26]:
from sklearn.preprocessing import PolynomialFeatures

In [27]:
# Three identical observations with feature values (2, 3).
features = np.tile([2, 3], (3, 1))

Create PolynomialFeatures object:

In [28]:
# Degree-2 expansion without the constant (bias) column: for features x1, x2
# this yields x1, x2, x1^2, x1*x2, x2^2.
polynomialInteraction = PolynomialFeatures(degree=2, include_bias=False)

In [29]:
polynomialInteraction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [30]:
# Interaction terms only: keep the original features and their product
# (x1, x2, x1*x2) and drop the squared terms and the bias column.
interaction = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)

In [31]:
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

## 4.5 Transforming Features

In [32]:
from sklearn.preprocessing import FunctionTransformer

In [33]:
# Three identical observations with feature values (2, 3).
features = np.array([[2, 3]] * 3)

Define a simple function:

In [34]:
def addTen(x):
    """Return ``x`` increased by ten.

    Works on anything that supports ``+ 10``: scalars, NumPy arrays
    (element-wise) or pandas columns.
    """
    shifted = x + 10
    return shifted

Create transformer:

In [35]:
# Wrap addTen in a transformer object so it can be applied through the
# usual .transform() interface.
tenTransformer = FunctionTransformer(addTen)

Transform feature matrix:

In [36]:
tenTransformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

Do the same thing in pandas using `.apply()`:

In [37]:
import pandas as pd

In [38]:
df = pd.DataFrame(features, columns=["feature1", "feature2"])

In [39]:
df.apply(addTen)

Unnamed: 0,feature1,feature2
0,12,13
1,12,13
2,12,13


## 4.6 Detecting Outliers

In [40]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

Create simulated data:

In [41]:
# Ten simulated observations with two features drawn around a single centre;
# the labels returned as the second value are not needed and are discarded.
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

Replace the first observation's values with extreme values:

In [42]:
# Overwrite both feature values of the first observation with an extreme value
# so that it becomes an obvious outlier.
features[0, :2] = 10000

Create detector:

In [43]:
# Detector that assumes the inliers follow a Gaussian distribution and flags
# points outside the fitted ellipse.
# contamination=.1 states that roughly 10% of the observations are expected
# to be outliers (1 of the 10 simulated points).
# random_state is fixed because the underlying robust covariance fit is
# randomised; without a seed the flagged points can differ between runs.
outlierDetector = EllipticEnvelope(contamination=.1, random_state=0)

Fit detector:

In [44]:
outlierDetector.fit(features)

EllipticEnvelope()

In [45]:
outlierDetector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

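Take a single feature (the first column) to demonstrate outlier detection on one variable using the interquartile range (IQR):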

In [46]:
# First column of the simulated data (row 0 holds the injected value 10000).
# NOTE: this rebinds `feature`, which section 4.1 used for a different array.
feature = features[:,0]

In [47]:
def indicesOfOutliers(x, iqrMultiplier=1.5):
    """Return the positions of outliers in ``x`` using the IQR rule.

    A value is flagged when it lies more than ``iqrMultiplier`` interquartile
    ranges below the first quartile or above the third quartile.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect.
    iqrMultiplier : float, default 1.5
        How many IQRs beyond the quartiles a value may lie before it is
        treated as an outlier. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask: one index array per
        dimension of ``x`` (a 1-tuple for 1-D input).
    """
    # Coerce once so lists and other array-likes get element-wise comparisons.
    x = np.asarray(x)
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lowerBound = q1 - (iqr * iqrMultiplier)
    upperBound = q3 + (iqr * iqrMultiplier)
    return np.where((x > upperBound) | (x < lowerBound))

In [48]:
indicesOfOutliers(feature)

(array([0], dtype=int64),)

## 4.7 Handling Outliers

In [49]:
# Small housing data set; the last row is deliberately implausible
# (116 bathrooms, 48000 sq ft) and serves as the outlier in this section.
houses = pd.DataFrame({
    'Price': [534433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'SqFeet': [1500, 2500, 1500, 48000],
})

In [50]:
# Option 1: drop the outlier by keeping only houses with fewer than 20 bathrooms.
houses.loc[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,SqFeet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [51]:
# Option 2: keep every row but mark outliers. Rows with fewer than
# 20 bathrooms get 0; every other row gets 1.
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)

In [52]:
houses

Unnamed: 0,Price,Bathrooms,SqFeet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


Transform the feature to dampen the effect of the outlier:

In [53]:
# Option 3: compress the scale of the feature. The natural log is applied to
# the whole column at once (vectorised) instead of looping over the values in
# Python, then rounded to one decimal place.
houses['LogOfSqFeet'] = np.log(houses['SqFeet']).round(1)

In [54]:
houses

Unnamed: 0,Price,Bathrooms,SqFeet,Outlier,LogOfSqFeet
0,534433,2.0,1500,0,7.3
1,392333,3.5,2500,0,7.8
2,293222,2.0,1500,0,7.3
3,4322032,116.0,48000,1,10.8


## 4.8 Discretizing Features

In [55]:
from sklearn.preprocessing import Binarizer

In [56]:
# Ages as a column vector (5 observations x 1 feature).
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)

In [57]:
# Two-way split at 18: values strictly greater than the threshold become 1,
# everything else becomes 0.
binarizer = Binarizer(threshold=18)

In [58]:
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [59]:
# Multi-way split. Each bin includes its left edge:
# 0: age < 20, 1: 20 <= age < 30, 2: 30 <= age < 64, 3: age >= 64.
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [60]:
# right=True makes each bin include its right edge instead:
# 0: age <= 20, 1: 20 < age <= 30, 2: 30 < age <= 64, 3: age > 64.
# The age of exactly 20 therefore moves from bin 1 to bin 0.
np.digitize(age, bins=[20, 30, 64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [61]:
# With a single bin edge, digitize acts as a binarizer: 0 below 18, 1 at or above 18.
# Unlike Binarizer(threshold=18), an age of exactly 18 would land in bin 1 here;
# the sample data contains no such value, so both outputs agree.
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

## 4.9 Grouping Observations Using Clustering

In [62]:
from sklearn.cluster import KMeans

In [63]:
# Fifty simulated observations with two features drawn around three centres;
# the labels returned as the second value are discarded.
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)

In [64]:
df = pd.DataFrame(features, columns=['feature1', 'feature2'])

Make k-means clusterer:

In [65]:
# k-means with three clusters, matching the three centres used to simulate the data.
# n_clusters is passed by keyword for readability. n_init is pinned to 10
# because its default differs between scikit-learn releases; leaving it
# implicit can change the clustering (and emit a FutureWarning) after an upgrade.
# random_state fixes the centroid initialisation so group labels are reproducible.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=0)

In [66]:
clusterer.fit(features)

KMeans(n_clusters=3, random_state=0)

In [67]:
# Store each observation's predicted cluster label as a new column, so that
# group membership can be used as a feature.
df['group'] = clusterer.predict(features)

In [68]:
df.head()

Unnamed: 0,feature1,feature2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


## 4.10 Deleting Observations with Missing Values

In [69]:
# Feature matrix whose last observation has a missing (NaN) first value.
features = np.array([
    (1.1, 11.1),
    (2.2, 22.2),
    (3.3, 33.3),
    (4.4, 44.4),
    (np.nan, 55),
])

Keep only the observations (rows) that contain no missing values; `~` negates the boolean mask:

In [70]:
# Build a per-row flag that is True when any value in the row is NaN,
# then keep the rows where that flag is False.
features[~np.any(np.isnan(features), axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

Drop using pandas:

In [71]:
df = pd.DataFrame(features, columns=['feat1', 'feat2'])

In [72]:
df.dropna()

Unnamed: 0,feat1,feat2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## 4.11 Imputing Missing Values

In [73]:
# conda install ecos  

In [74]:
# conda install CVXcanon  

In [75]:
# pip install fancyimpute

In [76]:
# pip uninstall numpy

In [77]:
# pip install numpy

In [78]:
# pip install numpy==1.19.2

In [80]:
# import cvxpy

In [81]:
# from cvxpy.utilities.power_tools import (fracify, decompose, approx_error, lower_bound, over_bound, prettydict)

In [82]:
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

ImportError: numpy.core.multiarray failed to import