In [34]:
import pandas as pd
import numpy as np


# Read the red and white CSV files; both use ';' as the field separator.
data_red = pd.read_csv('data/winequality-red.csv', sep=';')
data_white = pd.read_csv('data/winequality-white.csv', sep=';')

def create_sampled_datasets(dataset, fractions=(0.1, 0.2, 0.3), seed=None):
    """Return copies of ``dataset`` with values randomly replaced by NaN.

    One copy is produced per entry in ``fractions``. In each copy every cell
    is blanked independently when its own uniform random draw is
    ``<= fraction``, so the share of missing values is only approximately
    ``fraction``. The input frame itself is never modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source data; left untouched.
    fractions : iterable of float, optional
        Per-cell missing probability for each returned copy. Every value must
        lie in ``[0, 1]``. Defaults to ``(0.1, 0.2, 0.3)``, the values that
        were previously hard-coded.
    seed : int or None, optional
        When given, a dedicated ``numpy.random.default_rng(seed)`` generator
        is used so the result is reproducible. When ``None`` (default) the
        draws come from NumPy's global legacy random state, exactly as before
        (and can therefore still be controlled with ``np.random.seed``).

    Returns
    -------
    list of pandas.DataFrame
        One frame per fraction, in the order given, each with the same shape,
        index and columns as ``dataset``.

    Raises
    ------
    ValueError
        If any fraction is outside ``[0, 1]``.
    """
    fractions = list(fractions)
    for fraction in fractions:
        if not 0 <= fraction <= 1:
            raise ValueError(
                f"fraction must be between 0 and 1, got {fraction!r}"
            )

    if seed is None:
        # Global legacy stream: same source of numbers as np.random.rand.
        draw = np.random.random_sample
    else:
        draw = np.random.default_rng(seed).random

    n_rows, n_cols = dataset.shape
    sampled_datasets = []

    for fraction in fractions:
        # One uniform draw per cell. Drawing as (n_cols, n_rows) and
        # transposing consumes the random stream column by column, i.e. in
        # the same order as a per-column loop over the frame.
        noise = draw((n_cols, n_rows)).T

        # DataFrame.mask builds a new frame with NaN wherever the condition
        # holds, so the input is not modified and no NaN is assigned into an
        # existing (possibly integer) column.
        sampled_dataset = dataset.mask(noise <= fraction)

        print(f"shape of dataset: {sampled_dataset.shape}")
        if sampled_dataset.isnull().sum().sum() > 0:
            print("Null values found in dataset")
        else:
            print("No null values found in dataset")

        sampled_datasets.append(sampled_dataset)

    return sampled_datasets

# Build the NaN-injected copies for the red frame first, then the white one,
# and keep both groups together in a nested list: [red copies, white copies].
sampled_datasets_red, sampled_datasets_white = (
    create_sampled_datasets(wine_frame) for wine_frame in (data_red, data_white)
)

sampled_datasets = [sampled_datasets_red, sampled_datasets_white]

shape of dataset: (1599, 12)
Null values found in dataset
shape of dataset: (1599, 12)
Null values found in dataset
shape of dataset: (1599, 12)
Null values found in dataset
shape of dataset: (4898, 12)
Null values found in dataset
shape of dataset: (4898, 12)
Null values found in dataset
shape of dataset: (4898, 12)
Null values found in dataset


For easier handling, the datasets are loaded as pandas DataFrames. For each wine colour, the code creates three copies of the data and puts NaN values at random places in every column (with a probability of roughly 10%, 20% and 30% per value) to simulate missing values. All copies are stored in a nested list called `sampled_datasets`: its first element is the list of red-wine samples and its second element is the list of white-wine samples, which makes it easy to iterate over them. Now we can try different methods to handle missing values.

# Handling Missing Values

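There are many ways to handle missing values in a dataset. In this notebook, I will use the simplest one: removing the rows that contain missing values. This is done with the `dropna()` method of pandas DataFrames, called with `inplace=True` so that each DataFrame is modified directly instead of a new one being returned. Because the missing values were injected independently into each of the 12 columns, most rows contain at least one NaN, so this approach discards a large share of the data: with 10% missing per column, only about $0.9^{12} \approx 28\%$ of the rows are expected to survive.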

In [37]:
# Remove every row that holds at least one NaN from each sampled frame,
# printing the frame's shape before and after.
# NOTE(review): the frames are modified in place, so running this cell a
# second time prints identical "before" and "after" shapes.
for datasets in sampled_datasets:
    for dataset in datasets:
        print(f"shape of dataset: {dataset.shape}")
        # axis=0 / how="any" are the dropna defaults, spelled out for clarity.
        dataset.dropna(axis=0, how="any", inplace=True)
        print(f"reduced shape of dataset: {dataset.shape} \n")

shape of dataset: (463, 12)
reduced shape of dataset: (463, 12) 

shape of dataset: (114, 12)
reduced shape of dataset: (114, 12) 

shape of dataset: (19, 12)
reduced shape of dataset: (19, 12) 

shape of dataset: (1422, 12)
reduced shape of dataset: (1422, 12) 

shape of dataset: (352, 12)
reduced shape of dataset: (352, 12) 

shape of dataset: (57, 12)
reduced shape of dataset: (57, 12) 

