In [2]:
import pandas as pd
from scipy import stats
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

In [3]:
# Load the bundled iris dataset object.
iris_flower = load_iris()

# Class labels, kept separately from the measurements.
target = iris_flower.target

# Measurements as a DataFrame, with the dataset's own feature names as columns.
features = pd.DataFrame(
    data=iris_flower.data,
    columns=iris_flower.feature_names,
)

In [4]:
# Preview the first five rows of the feature table.
features.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Print the class-label array that was taken from `iris_flower.target`.
print(target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [6]:
# Count missing values per column before any are introduced.
features.isna().sum(axis=0)

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [7]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes.
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [8]:
# Deliberately blank out a few readings so the imputation steps below have
# something to fill. `.loc` label slices include both endpoints, so
# slice(50, 54) covers five rows and slice(100, 102) covers three.
for row_selector, column_name in (
    (10, "sepal length (cm)"),
    (slice(50, 54), "sepal length (cm)"),
    (slice(100, 102), "petal length (cm)"),
):
    features.loc[row_selector, column_name] = None

In [9]:
# Count missing values per column after blanking out some readings.
features.isna().sum()

sepal length (cm)    6
sepal width (cm)     0
petal length (cm)    3
petal width (cm)     0
dtype: int64

In [11]:
# One imputer per fill strategy: column mean and column median.
impute_mean, impute_median = (
    SimpleImputer(strategy=fill_strategy) for fill_strategy in ("mean", "median")
)

In [14]:
# Fill the missing sepal lengths with the column mean.
sepal_length_column = ["sepal length (cm)"]
sepal_length_filled = impute_mean.fit_transform(features[sepal_length_column])
features[sepal_length_column] = sepal_length_filled

# Fill the missing petal lengths with the column median.
petal_length_column = ["petal length (cm)"]
petal_length_filled = impute_median.fit_transform(features[petal_length_column])
features[petal_length_column] = petal_length_filled

In [15]:
# Re-count missing values per column to confirm the imputation filled them.
features.isna().sum(axis="index")

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [17]:
# Standardise each column (axis=0): how many standard deviations every value
# lies from its column mean.
z_scores = stats.zscore(features, axis=0)


In [18]:
# Print the computed z-scores (one row per sample, one column per feature).
print(z_scores)

[[-8.94176304e-01  1.01900435e+00 -1.34014407e+00 -1.31544430e+00]
 [-1.14084563e+00 -1.31979479e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.38751495e+00  3.28414053e-01 -1.39764453e+00 -1.31544430e+00]
 [-1.51084962e+00  9.82172869e-02 -1.28264361e+00 -1.31544430e+00]
 [-1.01751097e+00  1.24920112e+00 -1.34014407e+00 -1.31544430e+00]
 [-5.24172316e-01  1.93979142e+00 -1.16764268e+00 -1.05217993e+00]
 [-1.51084962e+00  7.88807586e-01 -1.34014407e+00 -1.18381211e+00]
 [-1.01751097e+00  7.88807586e-01 -1.28264361e+00 -1.31544430e+00]
 [-1.75751894e+00 -3.62176246e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.14084563e+00  9.82172869e-02 -1.28264361e+00 -1.44707648e+00]
 [ 0.00000000e+00  1.47939788e+00 -1.28264361e+00 -1.31544430e+00]
 [-1.26418029e+00  7.88807586e-01 -1.22514315e+00 -1.31544430e+00]
 [-1.26418029e+00 -1.31979479e-01 -1.34014407e+00 -1.44707648e+00]
 [-1.88085360e+00 -1.31979479e-01 -1.51264545e+00 -1.44707648e+00]
 [-3.08336657e-02  2.16998818e+00 -1.45514499e+00 -1.31544430e

In [19]:
# Work on a copy so the outlier filtering never touches `features` itself.
handle_outliers_1 = features.copy()

# Keep only the rows in which every feature has |z| < 3. The absolute value
# must be taken of the z-scores *before* comparing with the threshold: taking
# abs() of the boolean mask (as was done previously) is a no-op, which let
# rows with z <= -3 (extremely low values) pass the `< 3` test unfiltered.
within_threshold = (abs(z_scores) < 3).all(axis=1)
features_with_removed_outliers = handle_outliers_1[within_threshold]

In [20]:
# Preview the first five rows of the outlier-filtered frame.
features_with_removed_outliers.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [21]:
# Show the unfiltered `features` frame again for comparison.
features.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
