In [2]:
# Imports
import os
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer

In [3]:
# Name of the dataset file to process; the loader cell joins it onto DATASET_DIR.
FILENAME = "dummy.csv"

In [4]:
# Constants Declaration
# Directory the input dataset is read from, and directory results are written to.
DATASET_DIR = "./data/"
RESULT_DIR = "./result/"

# For each supported file extension: the name of the pandas function used to
# read it ('read') and the name of the DataFrame method used to write it ('save').
EXTENSION_MAPPING = {
    "read": {"csv": "read_csv", "json": "read_json", "xlsx": "read_excel"},
    "save": {"csv": "to_csv", "json": "to_json", "xlsx": "to_excel"},
}

# Seed NumPy's global random number generator.
np.random.seed(42)

In [38]:
# Dataset Loader
# Resolves FILENAME inside DATASET_DIR, picks the pandas reader registered for
# the file's extension in EXTENSION_MAPPING['read'], and loads the data into `df`.
# Raises ValueError when the extension has no registered reader.
DATASET_FILE = os.path.join(DATASET_DIR, FILENAME)
file_path, file_extension = os.path.splitext(DATASET_FILE)
# basename() understands every separator the platform accepts; splitting on
# os.path.sep alone would not split the '/' used in DATASET_DIR on Windows.
file_name = os.path.basename(file_path)
# Remove the leading dot and normalise case so that e.g. 'data.CSV' is accepted.
file_extension = file_extension.lstrip('.').lower()
dataset_extracter = EXTENSION_MAPPING['read'].get(file_extension)
if dataset_extracter is None:
    raise ValueError('Dataset type not supported')
# Look the reader up on the pandas module by name and call it with the file path.
df = getattr(pd, dataset_extracter)(DATASET_FILE)

In [39]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,id,name,age,year
0,1,john,23,2004.0
1,2,tom,45,2006.0
2,3,sam,64,
3,4,harry,2012,
4,5,jim,23,2014.0


### Dealing with missing values
* Replace with Mean Values
* Replace with Median Values
* Replace with Most Common Values
* Replace with Specific Value
* Drop records with Missing Values

In [58]:
# Preprocessing with Sklearn, Fill with mean values for the required columns.
# Put the column names to impute in `required_columns`; an empty list skips the step.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# newer environments need `sklearn.impute.SimpleImputer` (which takes no `axis`).

required_columns = []
imputer = Imputer(missing_values=np.nan, strategy="mean", axis=0)
if len(required_columns) > 0:
    # fit_transform returns a bare ndarray. Wrap it with df's own index and the
    # target column labels so the assignment lines up row-for-row even when
    # df's index is not 0..n-1 (e.g. after rows were dropped).
    df[required_columns] = pd.DataFrame(
        imputer.fit_transform(df[required_columns]),
        index=df.index,
        columns=required_columns,
    )

In [55]:
# Preprocessing with Sklearn, Fill with median values for the required columns.
# Put the column names to impute in `required_columns`; an empty list skips the step.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# newer environments need `sklearn.impute.SimpleImputer` (which takes no `axis`).

required_columns = []
imputer = Imputer(missing_values=np.nan, strategy="median", axis=0)
if len(required_columns) > 0:
    # fit_transform returns a bare ndarray. Wrap it with df's own index and the
    # target column labels so the assignment lines up row-for-row even when
    # df's index is not 0..n-1 (e.g. after rows were dropped).
    df[required_columns] = pd.DataFrame(
        imputer.fit_transform(df[required_columns]),
        index=df.index,
        columns=required_columns,
    )

In [57]:
# Preprocessing with Sklearn, Fill with most frequent values for the required columns.
# Put the column names to impute in `required_columns`; an empty list skips the step.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# newer environments need `sklearn.impute.SimpleImputer` (which takes no `axis`).

required_columns = []
# "most_frequent" selects the per-column mode; this cell previously passed
# "median", which made it a duplicate of the median cell.
imputer = Imputer(missing_values=np.nan, strategy="most_frequent", axis=0)
if len(required_columns) > 0:
    # fit_transform returns a bare ndarray. Wrap it with df's own index and the
    # target column labels so the assignment lines up row-for-row even when
    # df's index is not 0..n-1 (e.g. after rows were dropped).
    df[required_columns] = pd.DataFrame(
        imputer.fit_transform(df[required_columns]),
        index=df.index,
        columns=required_columns,
    )

In [59]:
# Preprocessing with Pandas, Fill with a specific value.
# Missing entries in the listed columns are replaced with 0; an empty list
# leaves the DataFrame untouched.

required_columns = []
if required_columns:
    df[required_columns] = df[required_columns].fillna(value=0)

In [62]:
# Preprocessing with Pandas, Drop missing values
# Removes, in place, every row that has a missing value in any of the listed
# columns; an empty list leaves the DataFrame untouched.

required_columns = []
if required_columns:
    df.dropna(how='any', subset=required_columns, inplace=True)

In [59]:
# Storage of results.
# Writes `df` to RESULT_DIR as '<file_name>.result.<epoch seconds>.<extension>',
# using the DataFrame writer registered for the extension in EXTENSION_MAPPING['save'].

# Whole seconds since the Unix epoch, computed from the UTC clock. The '%s'
# strftime directive used previously is a platform-specific extension (not
# available on every OS) and interprets the naive UTC value as local time.
result_time = str(int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds()))
save_dataset_fn = EXTENSION_MAPPING['save'].get(file_extension.strip('.'))
if save_dataset_fn is None:
    # Fail with a clear message instead of an opaque TypeError from getattr().
    raise ValueError('Result type not supported: {!r}'.format(file_extension))
# The writer does not create missing directories, so make sure the target exists.
if not os.path.isdir(RESULT_DIR):
    os.makedirs(RESULT_DIR)
result_path = os.path.join(
    RESULT_DIR,
    '{}.result.{}.{}'.format(file_name, result_time, file_extension),
)
getattr(df, save_dataset_fn)(result_path)