In [2]:
# Importing our dataset from scikit learn
from sklearn.datasets import load_iris

In [3]:
# import pandas to load the dataset as a dataframe
import pandas as pd
iris_flower = load_iris() # we are assigning iris_f;lower to 'load-iris()' instance

# lets load the feature dataset
features = pd.DataFrame(iris_flower.data, columns=iris_flower.feature_names)
# lets load the target dataset
target = iris_flower.target


In [4]:
# lets view our data from first five rows

features.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
#lets view the target dataset
print(target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


`Handling missing values`

In [6]:
#checking for missing values
features.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [7]:
# lets take a snapshot of our dataset 
\
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


* It is evident that this data does not have missing value but we are going to introduce missing values 

In [10]:
# Lets intrpoduce the missing values 
features.loc[10, "sepal length (cm)"] = None
features.loc[50:54, "sepal width (cm)"] = None
features.loc[100:102, "petal length (cm)"] = None


In [11]:
# lets check for missing values again
features.isnull().sum()

sepal length (cm)    1
sepal width (cm)     5
petal length (cm)    3
petal width (cm)     0
dtype: int64

In [12]:
# lets handle the missing values using simple imputer

from sklearn.impute import SimpleImputer


In [13]:
# Now lets create an instance of the imputer class using "mean" and meadian as strategy for imputation 
impute_mean = SimpleImputer(strategy="mean")

impute_median = SimpleImputer(strategy="median")

In [14]:
# lets apply the defined instances above
features[["sepal length (cm)"]] = impute_mean.fit_transform(features[["sepal length (cm)"]])
features[["sepal width (cm)"]] = impute_mean.fit_transform(features[["sepal width (cm)"]])
features[["petal length (cm)"]] = impute_median.fit_transform(features[["petal length (cm)"]])

In [15]:
features.isna().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

` 2. Handling outliers`

``Method of Handling Outliers``

1. Removal
2. Capping(Winzorizing or clipping)

In [None]:
from scipy import stats # This is a statistics library
z_scores = stats.zscore(features) # Calculate the z-score for the whole dataframe

# lets print the z-scores
print(z_scores)

[[-9.05163302e-01  1.02053680e+00 -1.34014407e+00 -1.31544430e+00]
 [-1.14773404e+00 -1.44643012e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.39030478e+00  3.21428915e-01 -1.39764453e+00 -1.31544430e+00]
 [-1.51159016e+00  8.83929516e-02 -1.28264361e+00 -1.31544430e+00]
 [-1.02644867e+00  1.25357277e+00 -1.34014407e+00 -1.31544430e+00]
 [-5.41307191e-01  1.95268066e+00 -1.16764268e+00 -1.05217993e+00]
 [-1.51159016e+00  7.87500841e-01 -1.34014407e+00 -1.18381211e+00]
 [-1.02644867e+00  7.87500841e-01 -1.28264361e+00 -1.31544430e+00]
 [-1.75416090e+00 -3.77678975e-01 -1.34014407e+00 -1.31544430e+00]
 [-1.14773404e+00  8.83929516e-02 -1.28264361e+00 -1.44707648e+00]
 [ 0.00000000e+00  1.48660873e+00 -1.28264361e+00 -1.31544430e+00]
 [-1.26901941e+00  7.87500841e-01 -1.22514315e+00 -1.31544430e+00]
 [-1.26901941e+00 -1.44643012e-01 -1.34014407e+00 -1.44707648e+00]
 [-1.87544627e+00 -1.44643012e-01 -1.51264545e+00 -1.44707648e+00]
 [-5.61657085e-02  2.18571662e+00 -1.45514499e+00 -1.31544430e

`Removal Method`

In [24]:
handle_outliers_1 = features.copy()
features_with_removed_outliers = handle_outliers_1[abs((z_scores < 3).all(axis=1))] # .all(axis = 1) is signifying that I am selecting all the columns

In [25]:
#lets view 
features_with_removed_outliers.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [26]:
features.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


`capping method`

In [None]:
import numpy as np 

#lets create a copy of our dataframe

handle_outliers_2 = features.copy()

# lets define the threshold first

threshold = 3

# lets apply capping to our calculated z-scores
# lets apply a little bit of complexity to it incase you are going to be selcting the numeric columns in your own dataset, it works here but doesnt reallly applies to this case though.
# creating a loop to handle both th limits and the capping of the selected solumns

for i, col in enumerate(handle_outliers_2.select_dtypes(include=[np.number]).columns):
    # lets select the outliers
    outliers = handle_outliers_2[col][abs(stats.zscore(handle_outliers_2[col])) > threshold]

    # lets cap anything outside the treshold 
    handle_outliers_2[col] = np.where(np.abs(z_scores.iloc[:, i]) > threshold, handle_outliers_2[col].clip(lower=outliers.min(), upper=outliers.max()), handle_outliers_2[col]) #Using clip to cap outliers within the IQR range.

AttributeError: 'numpy.ndarray' object has no attribute 'loc'