# Feature Extraction
- Dictionary Vectorizer (`DictVectorizer`) - Converts a list of mappings from feature name to feature value into a numeric matrix


In [1]:
# Toy records: one dict per sample, mapping feature name -> feature value.
# Ages are ints, heights are floats.
data = [
    {'age': age, "height": height}
    for age, height in ((4, 96.0), (6, 108.0), (9, 173.0), (10, 54.0))
]

In [5]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False asks for a dense numpy array rather than a scipy sparse matrix.
dv = DictVectorizer(sparse=False)
# Learn the feature names from `data`, then encode each dict as one matrix row.
vectorized_data = dv.fit(data).transform(data)

In [11]:
# Show the encoded matrix and the original records, each followed by its type.
print(vectorized_data, type(vectorized_data), data, type(data), sep="\n")

[[  4.  96.]
 [  6. 108.]
 [  9. 173.]
 [ 10.  54.]]
<class 'numpy.ndarray'>
[{'age': 4, 'height': 96.0}, {'age': 6, 'height': 108.0}, {'age': 9, 'height': 173.0}, {'age': 10, 'height': 54.0}]
<class 'list'>


In [12]:
# Map every matrix row back to a {feature name: value} dict.
# The values come back as numpy floats, not the original Python ints/floats.
data_back = dv.inverse_transform(X=vectorized_data)
print(data_back, type(data_back), sep="\n")

[{'age': np.float64(4.0), 'height': np.float64(96.0)}, {'age': np.float64(6.0), 'height': np.float64(108.0)}, {'age': np.float64(9.0), 'height': np.float64(173.0)}, {'age': np.float64(10.0), 'height': np.float64(54.0)}]
<class 'list'>


# Cleaning Data
- Simple Imputer - Fills missing values using one of the following strategies:
  - mean
  - median
  - most_frequent
  - constant

In [14]:
import numpy as np

In [16]:
# 4x2 toy matrix with exactly one missing entry (NaN) in each column.
matrix = np.array(
    [
        [7, 1],
        [np.nan, 8],
        [2, np.nan],
        [9, 6],
    ]
)
print(matrix)

[[ 7.  1.]
 [nan  8.]
 [ 2. nan]
 [ 9.  6.]]


In [18]:
from sklearn.impute import SimpleImputer

# One imputer per supported strategy.
# NOTE: si_constant keeps the default fill_value and is not used by the cells shown here.
si_mean, si_median, si_most_frequent, si_constant = (
    SimpleImputer(strategy=strategy_name)
    for strategy_name in ("mean", "median", "most_frequent", "constant")
)

In [19]:
# Each NaN is replaced by the mean of the observed values in its column.
matrix_mean = si_mean.fit(matrix).transform(matrix)
print(matrix_mean)

[[7. 1.]
 [6. 8.]
 [2. 5.]
 [9. 6.]]


In [20]:
# Each NaN is replaced by the median of the observed values in its column.
matrix_median = si_median.fit(matrix).transform(matrix)
print(matrix_median)

[[7. 1.]
 [7. 8.]
 [2. 6.]
 [9. 6.]]


In [23]:
# Each NaN is replaced by the most frequent observed value in its column.
# Every value here occurs once, so the tie is resolved to the smallest value.
matrix_most_frequent = si_most_frequent.fit(matrix).transform(matrix)
print(matrix_most_frequent)

[[7. 1.]
 [2. 8.]
 [2. 1.]
 [9. 6.]]


In [24]:
from sklearn.impute import KNNImputer


In [26]:
# 4x3 toy matrix: row 0 is missing its last feature, row 2 its first.
matrix_KNN = np.array(
    [
        [1, 2, np.nan],
        [3, 4, 3],
        [np.nan, 6, 5],
        [8, 8, 7],
    ]
)
print(matrix_KNN)

[[ 1.  2. nan]
 [ 3.  4.  3.]
 [nan  6.  5.]
 [ 8.  8.  7.]]


In [27]:
# Default settings (n_neighbors=5, uniform weights). Only four rows exist, so
# each gap is filled from every row that has that feature; the filled values
# below equal the plain column means.
knni = KNNImputer()
matrix_knn_transformed = knni.fit(matrix_KNN).transform(matrix_KNN)
print(matrix_knn_transformed)

[[1. 2. 5.]
 [3. 4. 3.]
 [4. 6. 5.]
 [8. 8. 7.]]


In [28]:
# Restrict the imputation to the two nearest rows, averaged with equal weight.
knni2 = KNNImputer(weights="uniform", n_neighbors=2)
matrix_knn_transformed2 = knni2.fit(matrix_KNN).transform(matrix_KNN)
print(matrix_knn_transformed2)

[[1.  2.  4. ]
 [3.  4.  3. ]
 [5.5 6.  5. ]
 [8.  8.  7. ]]


# Feature Scaling

## Numeric Transformer

In [29]:
from sklearn.preprocessing import StandardScaler

In [42]:
# A single feature as a column vector: five samples, one column.
X_S = np.array([4, 3, 2, 5, 6]).reshape(-1, 1)
print(X_S)

[[4]
 [3]
 [2]
 [5]
 [6]]


In [43]:
# Standardise the feature: subtract the column mean, then divide by the
# column standard deviation.
ss = StandardScaler()
new_X_SS = ss.fit(X_S).transform(X_S)
print(new_X_SS)

[[ 0.        ]
 [-0.70710678]
 [-1.41421356]
 [ 0.70710678]
 [ 1.41421356]]


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the feature linearly so that its minimum maps to 0 and its maximum
# to 1 (the default feature_range).
mms = MinMaxScaler()

# Apply the scaler to the same column vector used for StandardScaler so the
# two scaling methods can be compared side by side.
new_X_MMS = mms.fit_transform(X_S)
print(new_X_MMS)

