# Importing Modules

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Utils import *

In [2]:
# Setting Environment to ignore future warnings
import warnings
warnings.simplefilter(action="ignore")

In [3]:
# Setting pandas default parameters
pd.set_option("max_rows", 85)
pd.set_option("max_columns", 85)

# Loading Data

In [4]:
main_data,df,num_cols = load_data()

In [5]:
main_data.shape

(6410, 11)

In [None]:
num_cols.shape

# Outlier Detection and Removal

Because we only have to check and remove outliers. So, we will deal only with Numerical features. Let's extract only numerical features from data.

In [6]:
df.head()

0
1
2
3
4


In [7]:
statistics(df)

Unnamed: 0_level_0,Unique_values,Missing values,Percentage of Missing Values,Data Type
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [8]:
# Filling NAN with mean values
for i in df.columns:
    df[i].fillna(df[i].mean(), inplace=True)

In [9]:
statistics(df)

Unnamed: 0_level_0,Unique_values,Missing values,Percentage of Missing Values,Data Type
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [10]:
data = df.copy()

## Outlier removal wit STD Mean and model building

One of the ways we can remove outliers is remove any data points that are beyond 2 standard deviation from mean. Which means we can come up with following upper and lower bounds

In [11]:
# detecting outliers in feature
for i in data.columns:
    df = std_mean(i, df, 2)

# Displaying outliers
outliers_statistics(df)

Unnamed: 0_level_0,Outliers,Percentage of Outliers
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1


In [12]:
# Total Number of Outliers in all features
outliers_statistics(df).Outliers.sum()

0

In [13]:
# Preparing data for model
X_train, X_test, y_train, y_test = prepare_data(df,main_data,num_cols)

In [14]:
run_model("STD Mean", X_train, y_train)

Model Accuracy : 0.974154772443607


## Outlier removal wit STD Median and model building

In [15]:
df = data.copy()

In [16]:
# detecting outliers in feature
for i in data.columns:
    df = std_median(i, df,2)

# Displaying outliers
outliers_statistics(df)

Unnamed: 0_level_0,Outliers,Percentage of Outliers
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1


In [17]:
# Total Number of Outliers in all features
outliers_statistics(df).Outliers.sum()

0

In [18]:
# Preparing data for model
X_train, X_test, y_train, y_test = prepare_data(df,main_data,num_cols)

In [19]:
run_model("STD Median", X_train, y_train)

Model Accuracy : 0.9755059795059795


With **Mean** method we detected **2605** outliers in whole dataset with standard deviation 2. And model's accuracy is 98 % percent on such data.

With **Median** method we detected **3495** outliers in whole dataset with standard deviation 2. Model accuracy is 1 percent less than the mean method. Here the accuracy is around 97 percent.

## Outlier removal with EllipticEnvelope and model building

In [20]:
df = set_data(main_data,num_cols)

In [21]:
from sklearn.covariance import EllipticEnvelope
ellip = EllipticEnvelope()
ellip.fit(df.drop("SalePrice", axis=1))

EllipticEnvelope()

In [22]:
# predict returns 1 for an inlier and -1 for an outlier
y_pred = ellip.predict(df.drop("SalePrice", axis=1))
df["outlier"] = y_pred

# Dropping outliers
df = df[df.outlier == 1]

In [23]:
X_train, X_test, y_train, y_test = split_data(df)

In [24]:
run_model("EllipticEnvelopen", X_train, y_train)

Model Accuracy : 0.9791194899400852


## Outlier removal with LocalOutlierFactor and model building

In [25]:
df = set_data(main_data,num_cols)

In [26]:
from sklearn.neighbors import LocalOutlierFactor
fac = LocalOutlierFactor()
fac.fit(df.drop("SalePrice", axis=1))

LocalOutlierFactor()

In [27]:
# predict returns 1 for an inlier and -1 for an outlier
y_pred = fac.fit_predict(df.drop("SalePrice", axis=1))
df["outlier"] = y_pred

# Dropping outliers
df = df[df.outlier == 1]

In [28]:
X_train, X_test, y_train, y_test = split_data(df)

In [29]:
run_model("LocalOutlierFactor", X_train, y_train)

Model Accuracy : 0.9752630382665788


## Outlier removal with IsolationForest and model building

In [30]:
df = set_data(main_data,num_cols)

In [31]:
from sklearn.ensemble import IsolationForest
iso = IsolationForest()
iso.fit(df.drop("SalePrice", axis=1))

IsolationForest()

In [32]:
# predict returns 1 for an inlier and -1 for an outlier
y_pred = iso.fit_predict(df.drop("SalePrice", axis=1))
df["outlier"] = y_pred

# Dropping outliers
df = df[df.outlier == 1]

In [33]:
X_train, X_test, y_train, y_test = split_data(df)

In [34]:
run_model("IsolationFactor", X_train, y_train)

Model Accuracy : 0.9715551928248153


# None

In [35]:
df = set_data(main_data,num_cols)
X_train, X_test, y_train, y_test = split_data(df)

In [36]:
run_model("None", X_train, y_train)

Model Accuracy : 0.9757713565534857


# Comparison

In [37]:
comp = pd.DataFrame({"Method": method_name, "Model's Performances": model_score})
comp.sort_values("Model's Performances", ascending=False, inplace=True)
comp

Unnamed: 0,Method,Model's Performances
2,EllipticEnvelopen,0.979119
5,,0.975771
1,STD Median,0.975506
3,LocalOutlierFactor,0.975263
0,STD Mean,0.974155
4,IsolationFactor,0.971555


From all of this we can see that our method **STD Mean** is performing best and than **STD Median**. Our proposed methods are performing much better than built-in methods.