In [63]:
# Imports
import os
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

In [64]:
# Update the filename
# Name of the dataset file to process. The loader cell joins it onto
# DATASET_DIR and chooses the pandas reader from its extension.
FILENAME = 'dummy.csv'

In [71]:
# Constants Declaration
DATASET_DIR = './data/'    # directory the input dataset is read from
RESULT_DIR = './result/'   # directory the train/test results are written to
RANDOM_SEED = 42           # shared seed for NumPy and the train/test split

# Maps a file extension to the name of the pandas function that reads it
# ('read') and to the name of the DataFrame method that writes it ('save').
EXTENSION_MAPPING = {
    'read': dict(csv='read_csv', json='read_json', xlsx='read_excel'),
    'save': dict(csv='to_csv', json='to_json', xlsx='to_excel'),
}

# Seed NumPy's global random generator.
np.random.seed(RANDOM_SEED)

In [38]:
# Dataset Loader
# Resolves the dataset path, selects the pandas reader registered for the
# file's extension and loads the file into `df`.
# Raises ValueError when the extension has no entry in EXTENSION_MAPPING['read'].
DATASET_FILE = os.path.join(DATASET_DIR, FILENAME)
file_path, file_extension = os.path.splitext(DATASET_FILE)
# basename() understands '/' as well as the platform separator; splitting on
# os.path.sep returned the whole path on Windows because DATASET_DIR uses '/'.
file_name = os.path.basename(file_path)
# Remove the leading dot and normalise case so that e.g. 'DATA.CSV' is accepted.
file_extension = file_extension.lstrip('.').lower()
dataset_extracter = EXTENSION_MAPPING['read'].get(file_extension)
if dataset_extracter is None:
    raise ValueError('Dataset type not supported')
df = getattr(pd, dataset_extracter)(DATASET_FILE)

In [39]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,id,name,age,year
0,1,john,23,2004.0
1,2,tom,45,2006.0
2,3,sam,64,
3,4,harry,2012,
4,5,jim,23,2014.0


In [112]:
# Columns to predict. dict.fromkeys() drops duplicates like set() did, but
# keeps the order in which the names are listed.
target_columns = list(dict.fromkeys(['age']))
# Every other column is treated as a feature. The list follows df's own column
# order: a set difference yields an arbitrary order for string labels, so the
# column order of the splits (and of the saved results) could change between
# kernel sessions.
dependent_columns = [column for column in df.columns if column not in target_columns]

In [113]:
# Split features and targets into train and test parts: 20% of the rows are
# held out for testing and the fixed seed makes the split repeatable.
features = df[dependent_columns]
targets = df[target_columns]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

### Dealing with missing values
* Replace with Mean Values
* Replace with Median Values
* Replace with Most Common Values
* Replace with Specific Value
* Drop records with Missing Values

In [103]:
# Preprocessing with Sklearn, Fill with mean values for the required columns.
# The imputer is fitted on the training split only and then applied to the
# test split, so both are filled with the training-set means.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22 in favour of `sklearn.impute.SimpleImputer` (which takes no `axis`
# argument) - confirm the installed version before running this cell.

required_columns = []
imputer = Imputer(missing_values=np.nan, strategy="mean", axis=0)
if len(required_columns) > 0:
    # Rebuild each frame on the split's own index. train_test_split shuffles
    # the rows, and a DataFrame with a fresh 0..n-1 index is aligned by label
    # on assignment, which moved values to the wrong rows and left NaN behind.
    X_train[required_columns] = pd.DataFrame(
        imputer.fit_transform(X_train[required_columns]),
        index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(
        imputer.transform(X_test[required_columns]),
        index=X_test.index, columns=required_columns)

In [55]:
# Preprocessing with Sklearn, Fill with median values for the required columns.
# The imputer is fitted on the training split only and then applied to the
# test split, so both are filled with the training-set medians.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22 in favour of `sklearn.impute.SimpleImputer` (which takes no `axis`
# argument) - confirm the installed version before running this cell.

required_columns = []
imputer = Imputer(missing_values=np.nan, strategy="median", axis=0)
if len(required_columns) > 0:
    # Rebuild each frame on the split's own index. train_test_split shuffles
    # the rows, and a DataFrame with a fresh 0..n-1 index is aligned by label
    # on assignment, which moved values to the wrong rows and left NaN behind.
    X_train[required_columns] = pd.DataFrame(
        imputer.fit_transform(X_train[required_columns]),
        index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(
        imputer.transform(X_test[required_columns]),
        index=X_test.index, columns=required_columns)

In [104]:
# Preprocessing with Sklearn, Fill with most frequent values for the required columns.
# The imputer is fitted on the training split only and then applied to the
# test split, so both are filled with the training-set modes.
# NOTE(review): `sklearn.preprocessing.Imputer` was removed in scikit-learn
# 0.22 in favour of `sklearn.impute.SimpleImputer` (which takes no `axis`
# argument) - confirm the installed version before running this cell.

required_columns = []
# The strategy was "median", copied from the previous cell; this cell is meant
# to fill with the most common value.
imputer = Imputer(missing_values=np.nan, strategy="most_frequent", axis=0)
if len(required_columns) > 0:
    # Rebuild each frame on the split's own index. train_test_split shuffles
    # the rows, and a DataFrame with a fresh 0..n-1 index is aligned by label
    # on assignment, which moved values to the wrong rows and left NaN behind.
    X_train[required_columns] = pd.DataFrame(
        imputer.fit_transform(X_train[required_columns]),
        index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(
        imputer.transform(X_test[required_columns]),
        index=X_test.index, columns=required_columns)

In [105]:
# Preprocessing with Pandas, Fill with a specific value.
# Missing entries in the selected columns of both splits are replaced by
# `value`; nothing happens while `required_columns` is empty.

value = 0
required_columns = []
if required_columns:
    X_train[required_columns] = X_train[required_columns].fillna(value=value)
    X_test[required_columns] = X_test[required_columns].fillna(value=value)

In [106]:
# Preprocessing with Pandas, Drop missing values
# Removes every row that has a missing value in any of the required columns,
# from the features and from the matching targets.

required_columns = []
if len(required_columns) > 0:
    # Reassign rather than use inplace=True, so the frames are replaced by
    # new objects instead of being mutated.
    X_train = X_train.dropna(subset=required_columns, how='any')
    X_test = X_test.dropna(subset=required_columns, how='any')
    # Keep the targets in step with the surviving rows. Otherwise the
    # column-wise concat in the storage cell outer-joins on the index and
    # brings the dropped rows back as NaN feature rows.
    y_train = y_train.loc[X_train.index]
    y_test = y_test.loc[X_test.index]

In [116]:
# Storage of results.
# Writes the train and test splits (features joined column-wise with their
# targets) to RESULT_DIR, using the writer that matches the input format.
# Files are named '<dataset>.<split>.result.<epoch seconds>.<extension>'.
# Raises ValueError when the extension has no entry in EXTENSION_MAPPING['save'].

# Seconds since the Unix epoch, computed arithmetically. strftime('%s') is an
# undocumented platform extension (missing on Windows) and treats the naive
# UTC datetime as local time, which skews the stamp by the UTC offset.
result_time = str(int((datetime.utcnow() - datetime(1970, 1, 1)).total_seconds()))

save_dataset_fn = EXTENSION_MAPPING['save'].get(file_extension.strip('.'))
if save_dataset_fn is None:
    # Fail with the same error as the loader instead of an opaque
    # TypeError from getattr(..., None).
    raise ValueError('Dataset type not supported')

# Create the output directory first so the writers do not fail on a missing path.
if not os.path.isdir(RESULT_DIR):
    os.makedirs(RESULT_DIR)

for split_name, split_features, split_targets in (
        ('train', X_train, y_train),
        ('test', X_test, y_test)):
    result_file = os.path.join(
        RESULT_DIR,
        '{}.{}.result.{}.{}'.format(file_name, split_name, result_time, file_extension))
    getattr(pd.concat([split_features, split_targets], axis=1), save_dataset_fn)(result_file)