In [8]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt

DSI Lab: Task 1

Import pandas and read file as data frame. Check readme for attribute information
Print the data

NOTE: The delimiters in the file are blanks

In [9]:
def readMissingValuesAsNan(path, n_rows=50):
    """Load the numeric iris features with '?' placeholders converted to NaN.

    The file is read as a blank-delimited table without a header row. Every
    '?' cell is replaced by NaN, the leading ``n_rows`` rows are kept, the
    'class' column is dropped and the remaining columns are cast to float.

    Parameters
    ----------
    path : str
        Location of the blank-delimited data file.
    n_rows : int or None, default 50
        Number of leading rows to keep. The default reproduces the row slice
        that used to be hard-coded here; ``None`` keeps every row.

    Returns
    -------
    pandas.DataFrame
        Float columns 'sepal_length', 'sepal_width', 'petal_length' and
        'petal_width'; missing entries are NaN.
    """
    feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    df = pd.read_csv(path, delimiter=' ', names=feature_columns + ['class'])
    # A column containing '?' is not numeric yet, so the placeholder has to be
    # replaced before the float cast below can succeed.
    df = df.replace({'?': np.nan})
    df = df.iloc[:n_rows][feature_columns]
    return df.astype(float)

## 1. Analyse

In [10]:
# Load the raw file without any cleaning so that the '?' placeholders stay visible.
path = './datasets/iris/data_someMissing.all'
colnames = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
dfa = pd.read_csv(path, sep=' ', names=colnames)

# info() writes its report itself and returns None, so print() adds a trailing "None".
print(dfa.info())
print(dfa.describe())

# Show only the rows in which at least one cell equals the '?' placeholder.
print(dfa[(dfa == '?').any(axis='columns')])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sepal_length  150 non-null    object
 1   sepal_width   150 non-null    object
 2   petal_length  150 non-null    object
 3   petal_width   150 non-null    object
 4   class         150 non-null    object
dtypes: object(5)
memory usage: 6.0+ KB
None
       sepal_length sepal_width petal_length petal_width            class
count           150         150          150         150              150
unique           36          24           44          23                4
top             5.0         3.0          1.5         0.2  Iris-versicolor
freq             10          26           13          28               50
   sepal_length sepal_width petal_length petal_width        class
6           4.6         3.4            ?         0.3  Iris-setosa
9           4.9         3.1          1.5         0.1   

Pandas treats None and NaN as essentially interchangeable for indicating missing or null values.
In this dataset, however, missing data is not encoded as empty fields; a '?' placeholder is used instead.
First step of the solution: replace '?' with NaN and cast the columns from object to float.
We can see that missing values occur only in rows of the class 'Iris-setosa', so we can focus on these rows
and drop the 'class' feature.

## 2. Remove nan data

This reduces the total number of data rows.

In [11]:
def removeNaNValues(df):
    """Return ``df`` without the rows that contain at least one NaN value."""
    return df.dropna()

# Variant 1: load the numeric features and discard every row that contains a NaN.
df1 = removeNaNValues(readMissingValuesAsNan(path=path))
# info() prints its report itself and returns None, hence the trailing "None" in the output.
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  46 non-null     float64
 1   sepal_width   46 non-null     float64
 2   petal_length  46 non-null     float64
 3   petal_width   46 non-null     float64
dtypes: float64(4)
memory usage: 1.8 KB
None


## 3. Replace missing values with median values

Replace the NaN values with the median of the respective feature. Of the options shown here, this is probably the best approximation of the missing values. The mean could also be used, since there are no outliers in the dataset.

In [12]:
def replaceNaNValuesWithMedian(df):
    """Return a new frame whose NaN feature values are replaced by the column median.

    Only 'sepal_length', 'sepal_width', 'petal_length' and 'petal_width' are
    filled; any other column is passed through unchanged. The median of each
    column is computed from ``df`` itself.

    The input frame is left untouched: the filled columns are attached to a
    new DataFrame, so re-running a cell that calls this function always starts
    from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four iris feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    filled_columns = {
        column: df[column].fillna(df[column].median())
        for column in feature_columns
    }
    # assign() builds a new DataFrame and overwrites existing columns in place
    # of their old position, so the caller's object is never modified.
    return df.assign(**filled_columns)

# Variant 2: load the numeric features and fill each NaN with the median of its column.
df2 = replaceNaNValuesWithMedian(readMissingValuesAsNan(path=path))
# info() prints its report itself and returns None, hence the trailing "None" in the output.
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  50 non-null     float64
 1   sepal_width   50 non-null     float64
 2   petal_length  50 non-null     float64
 3   petal_width   50 non-null     float64
dtypes: float64(4)
memory usage: 1.7 KB
None


## 4. Replace missing values with 0

Replace the NaN values with a freely chosen constant (not the best idea).

In [13]:
def replaceNaNValuesWithConstValue(df, value):
    """Return a new frame whose NaN feature values are replaced by ``value``.

    Only 'sepal_length', 'sepal_width', 'petal_length' and 'petal_width' are
    filled; any other column is passed through unchanged.

    The input frame is left untouched: the filled columns are attached to a
    new DataFrame, so re-running a cell that calls this function always starts
    from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four iris feature columns.
    value
        Replacement handed to ``Series.fillna`` for every feature column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    filled_columns = {
        column: df[column].fillna(value)
        for column in feature_columns
    }
    # assign() builds a new DataFrame and overwrites existing columns in place
    # of their old position, so the caller's object is never modified.
    return df.assign(**filled_columns)

# Variant 3: load the numeric features and fill each NaN with the constant 0.
df3 = replaceNaNValuesWithConstValue(readMissingValuesAsNan(path=path), 0)
# info() prints its report itself and returns None, hence the trailing "None" in the output.
print(df3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  50 non-null     float64
 1   sepal_width   50 non-null     float64
 2   petal_length  50 non-null     float64
 3   petal_width   50 non-null     float64
dtypes: float64(4)
memory usage: 1.7 KB
None


## 5. Conclusion:

How are the above options changing the statistical properties of the single features?
Print the values (per option) into a table!

In [14]:
# Summary statistics of the three variants, in order: rows dropped, median fill, constant fill.
print(df1.describe(), df2.describe(), df3.describe(), sep='\n')

       sepal_length  sepal_width  petal_length  petal_width
count     46.000000    46.000000     46.000000    46.000000
mean       4.997826     3.417391      1.471739     0.247826
std        0.309472     0.376713      0.164199     0.106956
min        4.400000     2.300000      1.000000     0.100000
25%        4.800000     3.200000      1.400000     0.200000
50%        5.000000     3.400000      1.500000     0.200000
75%        5.175000     3.600000      1.575000     0.300000
max        5.700000     4.400000      1.900000     0.600000
       sepal_length  sepal_width  petal_length  petal_width
count     50.000000    50.000000     50.000000     50.00000
mean       4.992000     3.420000      1.464000      0.24800
std        0.337965     0.375255      0.173511      0.10349
min        4.300000     2.300000      1.000000      0.10000
25%        4.800000     3.200000      1.400000      0.20000
50%        5.000000     3.400000      1.500000      0.20000
75%        5.175000     3.600000      1.

As we can see, the way the missing values are handled influences the statistical values of the DataFrame, so we need to choose the method carefully.