# 数据预处理

## 使用下采样处理不平衡类

在下采样中，我们从多数类（即具有更多观测值的类）中不放回地随机抽样，来创建一个观测数量与少数类相等的新观测子集。

In [8]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
# Load the iris dataset through scikit-learn's load_iris; the following cell
# reads its `data` and `target` attributes
iris = load_iris()

In [10]:
# Feature matrix and target vector of the iris dataset
X = iris.data
y = iris.target

# Number of observations per class
pd.Series(y).value_counts()

2    50
1    50
0    50
dtype: int64

In [11]:

# Drop the first 40 observations
X = X[40:]
y = y[40:]
# Collapse the labels to binary: class 0 stays 0, every other class becomes 1
y = np.where(y != 0, 1, 0)
# Show the imbalanced target vector
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
# Positional indices of the observations belonging to each class
i_class0 = np.flatnonzero(y == 0)
i_class1 = np.flatnonzero(y == 1)

In [15]:
# Draw as many class-1 indices as there are class-0 observations,
# sampling without replacement so that no index is picked twice
i_class1_downsampled = np.random.choice(
    i_class1,
    size=i_class0.size,
    replace=False,
)

In [16]:
# Target vector of the balanced subset: all of class 0 followed by the
# down-sampled class 1
np.concatenate((y[i_class0], y[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## 使用上采样处理不平衡类别

在上采样中，我们从少数类中有放回地随机抽取观测，最终使来自少数类和多数类的观测数量相同。

In [17]:
# Draw class-0 indices with replacement until there are as many of them
# as there are class-1 observations
i_class0_upsample = np.random.choice(
    i_class0,
    size=i_class1.size,
    replace=True,
)
i_class0_upsample

array([6, 3, 3, 6, 4, 1, 0, 7, 0, 4, 3, 7, 4, 2, 7, 8, 2, 8, 0, 0, 6, 7,
       5, 9, 2, 8, 8, 8, 8, 2, 2, 9, 4, 0, 8, 3, 0, 9, 5, 4, 6, 2, 9, 6,
       5, 6, 0, 2, 1, 7, 4, 6, 5, 8, 8, 4, 1, 1, 6, 9, 6, 4, 3, 8, 0, 6,
       9, 8, 6, 7, 7, 8, 1, 4, 9, 5, 5, 3, 0, 0, 6, 4, 4, 9, 5, 8, 1, 1,
       2, 0, 6, 4, 4, 9, 8, 1, 6, 7, 5, 3])

In [18]:
# Target vector of the balanced set: the up-sampled class 0 followed by all of class 1
np.concatenate((y[i_class0_upsample], y[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

## 处理离群点

In [19]:
# Load the library
import pandas as pd

# Build the example DataFrame in one step; the values in the last row are
# far larger than those in the other rows
houses = pd.DataFrame(
    {
        'Price': [534433, 392333, 293222, 4322032],
        'Bathrooms': [2, 3.5, 2, 116],
        'Square_Feet': [1500, 2500, 1500, 48000],
    }
)

houses

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500
3,4322032,116.0,48000


In [20]:
# Option 1: drop the rows whose bathroom count is 20 or more
houses.loc[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [21]:
# Option 2: flag the rows instead of dropping them
# (0 = fewer than 20 bathrooms, 1 = otherwise)
houses['outlier'] = np.where(houses['Bathrooms'].lt(20), 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [25]:
# Option 3: rescale the feature by taking its natural logarithm
houses['log_of_Square_Feet'] = houses['Square_Feet'].pipe(np.log)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,outlier,log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 使用均值填充缺失值

均值插补用该特征/变量的平均值替换缺失值。均值插补是最“朴素”的插补方法之一，因为它不像 k 近邻（KNN）插补这样更复杂的方法，不会利用该观测的其他信息来估计缺失值。

In [30]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Build the data set with two variables, x0 and x1;
# the first value of x1 is deliberately missing
df = pd.DataFrame(
    {
        'x0': [0.3051, 0.4949, 0.6974, 0.3769, 0.2231, 0.341, 0.4436, 0.5897, 0.6308, 0.5],
        'x1': [np.nan, 0.2654, 0.2615, 0.5846, 0.4615, 0.8308, 0.4962, 0.3269, 0.5346, 0.6731],
    }
)

# Inspect the data set
df

Unnamed: 0,x0,x1
0,0.3051,
1,0.4949,0.2654
2,0.6974,0.2615
3,0.3769,0.5846
4,0.2231,0.4615
5,0.341,0.8308
6,0.4436,0.4962
7,0.5897,0.3269
8,0.6308,0.5346
9,0.5,0.6731


In [32]:
# Replace every NaN with the mean of its column
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_mean.fit_transform(df)

array([[0.3051    , 0.49273333],
       [0.4949    , 0.2654    ],
       [0.6974    , 0.2615    ],
       [0.3769    , 0.5846    ],
       [0.2231    , 0.4615    ],
       [0.341     , 0.8308    ],
       [0.4436    , 0.4962    ],
       [0.5897    , 0.3269    ],
       [0.6308    , 0.5346    ],
       [0.5       , 0.6731    ]])

## 填充缺失的类标签

In [33]:
# Feature matrix whose first column is missing in the last two rows
X = np.array(
    [
        [0, 2.10, 1.45],
        [1, 1.18, 1.33],
        [0, 1.22, 1.27],
        [0, -0.21, -1.19],
        [np.nan, 0.87, 1.31],
        [np.nan, -0.67, -0.22],
    ]
)
# Fill the missing values with the most frequent value
most_imputer = SimpleImputer(strategy='most_frequent')
most_imputer.fit(X).transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

## 使用 KNN 填充缺失类别

In [38]:
# 加载库
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix whose first column holds the known class
X = np.array(
    [
        [0, 2.10, 1.45],
        [1, 1.18, 1.33],
        [0, 1.22, 1.27],
        [1, -0.21, -1.19],
    ]
)

# Feature matrix whose class column is missing
X_with_nan = np.array(
    [
        [np.nan, 0.87, 1.31],
        [np.nan, -0.67, -0.22],
    ]
)

# Train a distance-weighted 3-nearest-neighbours classifier that predicts
# the class (column 0) from the remaining columns
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the class of the observations that lack it
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted class in front of the other features of those observations
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Stack the imputed rows on top of the fully observed rows
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 使用 KNN（最近邻观测）填充缺失值

In [36]:
from sklearn.impute import KNNImputer
# Short alias for the missing-value marker
nan = np.nan

# Small matrix with one missing entry in the first row and one in the third
X = [
    [1, 2, nan],
    [3, 4, 3],
    [nan, 6, 5],
    [8, 8, 7],
]
X

[[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]

In [37]:
# Impute each missing entry from the 2 nearest neighbours, weighted uniformly
imputer = KNNImputer(weights='uniform', n_neighbors=2)
imputer.fit(X).transform(X)

array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

## 标准化

In [41]:
# Normalizer 重缩放各个观测，使其具有单位范数  x / ||x||
from sklearn.preprocessing import Normalizer
import numpy as np
# Five observations with two features each
X = np.array(
    [
        [0.5, 0.5],
        [1.1, 3.4],
        [1.5, 20.2],
        [1.63, 34.4],
        [10.9, 3.3],
    ]
)
# Rescale every observation (row) to unit L2 norm: x / ||x||
normalizer = Normalizer(norm='l2')
# Normalizer is stateless, but its parameters are only validated in fit(),
# so use fit_transform() instead of calling transform() on an unfitted
# estimator; this also matches the scaler cells below
normalizer.fit_transform(X)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [43]:
# MinMaxScaler  (x - min) / (max - min)
from sklearn.preprocessing import MinMaxScaler
# Single feature stored as a column vector
x = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])
# Map the feature linearly onto the range [0, 1]
minmax_scale = MinMaxScaler(feature_range=(0, 1))
x_scaled = minmax_scale.fit(x).transform(x)
x_scaled

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [44]:
# StandardScaler  0均值 1方差 (x - u)/ sigma
from sklearn.preprocessing import StandardScaler
# Single feature stored as a column vector
x = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])
# Standardize the feature to zero mean and unit variance
scalar = StandardScaler()
x_standardized = scalar.fit(x).transform(x)
x_standardized

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])