In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the measurements (this file contains some missing values) and
# preview the first three rows.
data = pd.read_csv('datasets/data_with_nans.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,2,4.9,3.1,1.5,0.1,Iris-setosa


In [3]:
# Show each column's dtype and non-null count to spot where values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     170 non-null    int64  
 1   Id             170 non-null    int64  
 2   SepalLengthCm  167 non-null    float64
 3   SepalWidthCm   169 non-null    float64
 4   PetalLengthCm  166 non-null    float64
 5   PetalWidthCm   168 non-null    float64
 6   Species        170 non-null    object 
dtypes: float64(4), int64(2), object(1)
memory usage: 9.4+ KB


In [4]:
# Remove the leftover index column that was saved into the CSV.
# Drop it by name rather than by position: a positional drop removes whatever
# happens to be first, so re-running the cell would delete 'Id' and then the
# measurement columns. With errors='ignore' a re-run is a harmless no-op.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Count the missing values in each column.
data.isna().sum()

Id               0
SepalLengthCm    3
SepalWidthCm     1
PetalLengthCm    4
PetalWidthCm     2
Species          0
dtype: int64

In [6]:
# Impute missing measurements with the overall mean of their column.
# data.columns[1:-1] skips the first column ('Id') and the last ('Species'),
# leaving only the numeric measurement columns.
#
# The filled Series is assigned back to the frame on purpose: calling
# fillna(inplace=True) on `data[column]` is chained assignment, which emits a
# FutureWarning on pandas 2.x and leaves `data` unchanged once Copy-on-Write
# is enabled.
#
# NOTE(review): a per-species mean
# (data.groupby('Species')[column].transform('mean')) may be a better fill
# value than the global mean - confirm before switching.
for column in data.columns[1:-1]:
    data[column] = data[column].fillna(data[column].mean())

In [7]:
# Re-count missing values per column to confirm none remain.
data.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [8]:
# for col in data.columns[1:-1]:
#     plt.figure(figsize=(8,6))
#     sns.scatterplot(x=data.Id, y=data[col], hue=data.Species)
#     plt.show()

## Outlier Detection 

#### 1 - Three Sigma (3σ)

In [8]:
# Every column except the first ('Id') and the last ('Species'),
# i.e. the measurement columns that are screened for outliers.
cols = data.columns[1:-1]

In [9]:
# Distinct species labels; outliers are evaluated within each species separately.
species = data.Species.unique()

In [10]:
# Three-sigma rule, applied one measurement column at a time and separately
# for every species: rows lying more than three standard deviations from the
# species mean are removed from `data` in place. Because rows are dropped as
# soon as they are found, later columns are evaluated on the already-filtered
# frame.
for feature in cols:
    for label in species:
        subset = data.loc[data.Species == label, feature]

        center = subset.mean()
        spread = subset.std()
        upper = center + (spread * 3)
        lower = center - (spread * 3)

        too_far = subset[(subset > upper) | (subset < lower)].index
        data.drop(index=too_far, inplace=True)
    

In [11]:
# Rows and columns remaining after the three-sigma filter.
data.shape

(163, 6)

In [13]:
#for col in data.columns[1:-1]:
#    plt.figure(figsize=(8,6))
#    sns.scatterplot(x=data.Id, y=data[col], hue=data.Species)
#    plt.show()

#### 2 - Quantile (IQR)

In [12]:
# Preview the first three rows of the current frame.
data.head(3)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,4.9,3.1,1.5,0.1,Iris-setosa


In [13]:
# Interquartile-range rule, applied per measurement column and per species:
# a row is removed from `data` in place when its value lies more than
# 1.5 * IQR above the third quartile or below the first quartile of its
# species. Rows are dropped immediately, so later columns see the
# already-filtered frame.
for feature in cols:
    for label in species:
        values = data.loc[data.Species == label, feature]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = (q3 - q1) * 1.5

        beyond = (values > (q3 + fence)) | (values < (q1 - fence))
        data.drop(index=values[beyond].index, inplace=True)
    

In [16]:
# Display the frame after both outlier filters (pandas truncates the middle rows).
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,5.1,3.5,1.4,0.2,Iris-setosa
1,1,4.7,3.2,1.6,0.2,Iris-setosa
2,2,4.9,3.1,1.5,0.1,Iris-setosa
3,3,4.4,2.9,1.4,0.2,Iris-setosa
4,4,5.0,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...,...,...
165,165,5.7,2.5,5.0,2.0,Iris-virginica
166,166,6.8,3.0,5.5,2.1,Iris-virginica
167,167,6.4,2.7,5.3,1.9,Iris-virginica
168,168,7.2,3.6,6.1,2.5,Iris-virginica


In [17]:
#for col in data.columns[1:-1]:
#    plt.figure(figsize=(8,6))
#    sns.scatterplot(x=data.Id, y=data[col], hue=data.Species)
#    plt.show()

In [23]:
# Remove the 'Id' column from `data` in place.
data.drop(columns=['Id'], inplace=True)

In [27]:
# Write the cleaned frame to disk.
# NOTE(review): to_csv writes the row index by default, so the file will gain
# an unnamed leading column (read back as 'Unnamed: 0'). Pass index=False if
# downstream readers do not need the original row labels - confirm first.
data.to_csv('datasets/final_data.csv')