-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess.py
76 lines (61 loc) · 2.15 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
def preprocess(X):
    """
    Impute, scale and one-hot encode a tabular dataset.

    The input is converted to a pandas DataFrame and its column types are
    auto-detected. Numeric columns are imputed with ``SimpleImputer``
    defaults and standardized with ``StandardScaler``. ``object`` columns
    have missing values replaced by an empty string and are then one-hot
    encoded. Columns of any other dtype are not passed to either
    transformer.

    Parameters
    ----------
    X : Array-like of shape (n_samples, n_features)
        Unprocessed data on which to apply preprocessing steps, can be
        in form of DataFrame or Array (anything ``pd.DataFrame`` accepts).

    Returns
    -------
    Array of shape (n_samples, n_features_new)
        The data after the preprocessing steps are applied, as returned
        by the fitted column transformer's ``fit_transform``.

    Raises
    ------
    TypeError
        If ``X`` cannot be converted to a DataFrame.
    ValueError
        If ``X`` has no rows, or if any column contains only null values.

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> X, _ = make_blobs(n_samples=10, centers=3, n_features=2)
    >>> processed_data = preprocess(X)
    """
    # Convert first: this validates that the input is array-like and gives
    # every later step (row count, null check, fitting) one representation.
    try:
        df = pd.DataFrame(X)
    except (ValueError, TypeError) as err:
        raise TypeError("Input format not accepted") from err

    # Throw error for empty dataframe, alternative is to return empty
    if len(df) < 1:
        raise ValueError(
            "Please provide a dataframe X with at least one row as input"
        )

    missing = df.isna()
    # A column with no observed value cannot be imputed or scaled.
    if missing.all().any():
        raise ValueError(
            "Please provide at least one non-null value in each column"
        )

    # SimpleImputer takes a single missing-value marker, not a list, so
    # rewrite every null (None, NaN, ...) as NaN before building the
    # pipeline. ``mask`` returns a new frame; the caller's data is untouched.
    df = df.mask(missing, np.nan)

    # auto-detect feature type
    numeric_features = df.select_dtypes("number").columns
    categorical_features = df.select_dtypes("object").columns

    # impute and scale numeric features
    numeric_transformer = make_pipeline(
        SimpleImputer(),
        StandardScaler(),
    )

    # impute missing categories with "" and one-hot encode object features
    categorical_transformer = make_pipeline(
        SimpleImputer(
            missing_values=np.nan,
            strategy="constant",
            fill_value="",
        ),
        OneHotEncoder(handle_unknown="ignore"),
    )

    preprocessor = make_column_transformer(
        (numeric_transformer, numeric_features),
        (categorical_transformer, categorical_features),
    )

    # Fit on the DataFrame the column lists were derived from, not on the
    # raw input, so the selectors always match the data being transformed.
    return preprocessor.fit_transform(df)