In [48]:
import pandas as pd
import os

import numpy as np

from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Build the CSV path from its directory and file name, then load it into a DataFrame.
directory, file_name = '../data', 'boston_housing.csv'
csv_path = os.path.join(directory, file_name)
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


# Handling missing values

In [4]:
# Tally missing entries per column; the bare last expression is shown as the cell output.
null_counts = df.isnull().sum()
print('The count of null values in each column of the dataset are as follows:')
null_counts

The count of null values in each column of the dataset are as follows:


crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

#### Since there are no missing values in the dataset, no missing-value handling is needed.

# Handling outliers

There are many methods to handle outliers
- **Trimming**: Trimming simply removes the outlier data from the dataset. It is appropriate when the outliers are caused by errors or are irrelevant to the analysis.
- **Capping**: If the dataset is approximately normally distributed, capping can be used to handle outliers. A minimum and a maximum value (both still considered non-outliers) are chosen; outlier values above the maximum are replaced by the maximum, and outlier values below the minimum are replaced by the minimum.

### Winsorization 
It is a capping technique that handles outliers using the percentile method. We choose a central range of the data, say the middle 90%, and then replace all values below the 5th percentile with the value at the 5th percentile and all values above the 95th percentile with the value at the 95th percentile. It is particularly useful when features contain negative values or zeros, which cannot be treated with log or square-root transforms.

In [5]:
def detect_outliers(df):
    """Count IQR-rule outliers in every numeric column of ``df``.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the column's 25th and 75th
    percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column, with the columns ``'Feature'`` and
        ``'Number of outliers'``. When ``df`` has no numeric columns the
        result is an empty frame that still carries both column labels.
    """
    # select numeric columns
    numeric_columns = df.select_dtypes(include=np.number).columns
    records = []
    for col in numeric_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        fence_low = q1 - (1.5 * iqr)
        fence_high = q3 + (1.5 * iqr)
        is_outlier = (df[col] < fence_low) | (df[col] > fence_high)
        records.append((col, int(is_outlier.sum())))
    # Build the summary once, after the loop, so it also exists (empty) when
    # there were no numeric columns to iterate over.
    return pd.DataFrame(records, columns=['Feature', 'Number of outliers'])
# Outlier counts for the current contents of df; the bare last expression displays the table.
outlier_df=detect_outliers(df)
print("Count of outliers from IQR rule in each features are:")
outlier_df

Count of outliers from IQR rule in each features are:


Unnamed: 0,Feature,Number of outliers
0,crim,66
1,zn,68
2,indus,0
3,chas,35
4,nox,0
5,rm,30
6,age,0
7,dis,5
8,rad,0
9,tax,0


In [6]:
## Outliers handling with Winsorization method.
## This method uses winsorization to handle outliers 
## where the lowest 5% and highest 5% of values are replaced by value at corresponding percentiles (5th and 95th).
def my_outlier_winsorization(df, limits=(0.05, 0.05)):
    """Return a copy of ``df`` with every numeric column winsorized.

    For each numeric column, the lowest ``limits[0]`` fraction of values is
    replaced by the smallest value that is kept, and the highest
    ``limits[1]`` fraction by the largest value that is kept. With the
    default ``(0.05, 0.05)`` this caps at roughly the 5th and 95th
    percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the capping is applied to a copy.
    limits : sequence of two floats, optional
        Fractions (each between 0 and 1) to cap at the lower and upper end.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose numeric columns are winsorized; non-numeric
        columns are carried over unchanged.

    Notes
    -----
    Every numeric column is capped, including binary indicator columns and
    any target column present in ``df``. Drop or re-attach such columns
    around this call if they should not be altered.
    """
    capped = df.copy()
    # select numeric columns
    numeric_columns = capped.select_dtypes(include=np.number).columns
    for col in numeric_columns:
        clipped = winsorize(
            capped[col].to_numpy(), limits=list(limits), inclusive=(True, True)
        )
        # winsorize returns a numpy MaskedArray; store a plain ndarray so that
        # later pandas/numpy calls (e.g. quantile) work on ordinary arrays.
        capped[col] = np.asarray(clipped)
    return capped
# Rebind df to the function's result, so every later cell works on the winsorized data.
df = my_outlier_winsorization(df)

In [7]:
# Re-run the IQR check after winsorization so the counts can be compared with the earlier table.
outlier_df=detect_outliers(df)
print("Count of outliers from IQR rule in each features are:")
outlier_df

Count of outliers from IQR rule in each features are:


  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(
  arr.partition(


Unnamed: 0,Feature,Number of outliers
0,crim,66
1,zn,68
2,indus,0
3,chas,35
4,nox,0
5,rm,0
6,age,0
7,dis,0
8,rad,0
9,tax,0


# Encode categorical variables
Categorical variables are determined by counting unique values of the variable in the DataFrame.
The threshold value for count of unique values is taken as 5.
If number of unique values < threshold (5), then that feature is considered as categorical.

In [21]:
### identify categorical features
## It uses a threshold to identify categorical features
## If count of uniques value in feature is less than threshold, then that feature is considered as categorical
def my_identify_categorical_features(df, threshold):
    """Flag low-cardinality columns of ``df`` as categorical.

    A column qualifies when its number of distinct values is strictly less
    than ``threshold``. The matching column names are printed together with
    their count and returned as a list, in the frame's column order.
    """
    categorical_columns = []
    for name in df.columns:
        distinct = df[name].nunique()
        if distinct < threshold:
            categorical_columns.append(name)
    count = len(categorical_columns)
    print("{} Categorical Feature(s):".format(count), categorical_columns)
    return categorical_columns

In [30]:
# Columns with fewer than 5 distinct values are treated as categorical.
# NOTE(review): the name is misspelled ("categoical"), but later cells reference it as-is.
categoical_columns = my_identify_categorical_features(df, 5)

1 Categorical Feature(s): ['chas']


In [31]:
# Display the list of detected categorical column names.
categoical_columns

['chas']

In [32]:
# Frequency of each distinct value combination in the detected categorical column(s).
df[categoical_columns].value_counts()

chas
0       471
1        35
Name: count, dtype: int64

Under this rule, only the 'chas' feature of the dataset is identified as categorical. Its values are already numeric (0/1), which amounts to a label encoding, so no separate encoding is applied to this variable.

# Split the data into training and testing sets

In [40]:
## Separate the target (y = 'medv') from the remaining feature columns (X)
y = df['medv']
X = df.drop(columns='medv')
## Hold out 20% of the rows as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling (Normalize/standardize numerical features)
Feature scaling should be applied after the train-test split: call fit() on the training data only, so that the scaling parameters are learned from it, and then call transform() on both the training and the test data. <br>
Two feature scaling techniques are available:
- **Standardization**: A feature scaling technique that transforms each feature so that the new feature has a mean of 0 and a standard deviation of 1. Example: sklearn.preprocessing.StandardScaler(). Standardization is typically used with K-Means, K-Nearest Neighbours, Principal Component Analysis (PCA), Artificial Neural Networks, and gradient-descent-based algorithms.
- **Normalization** : Normalization is a technique often applied as part of data preparation for machine learning. The goal of normalization is to change the values of numeric columns in the dataset to use a common scale, without distorting differences in the ranges of values or losing information. Examples: MinMax Scaling, Mean Normalization, Max Absolute Scaling, Robust Scaling etc. MinMax Scaling is most commonly used one. After applying Min-Max Scaling the range of transformed feature will be within [0,1]. <br>
Most problems can be handled with standardization.

#### Let's use StandardScaler() on our dataset


In [49]:
# Learn the scaling parameters from the training set only.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the fitted scaler to the train and test sets alike.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)