In [1]:
# --- Imports (kept together so a fresh kernel has every dependency up front) ---
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# --- Plot defaults for every figure created later in the notebook ---
plt.style.use('default')
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100  # 200 e.g. is really fine, but slower

# NOTE: no plt.tight_layout() here. With no figure open it creates (and the
# notebook then displays) an empty figure; call it on a real figure instead.

# 'normal' is not a font family name: matplotlib cannot find it, warns, and
# falls back to its default font. 'sans-serif' selects that default explicitly.
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 20}
matplotlib.rc('font', **font)

<Figure size 1200x800 with 0 Axes>

In [2]:
def changeTextFeatureToNumeric(df, cols):
    """Replace the given columns with integer codes from ``pd.factorize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cols : iterable of str
        Column names to encode. Names that are not columns of ``df`` are
        skipped, so callers may pass a superset of the columns present.

    Returns
    -------
    pandas.DataFrame
        A frame in which every encoded column has been moved to the end
        (in the order given by ``cols``) and holds integer codes. Codes follow
        the order of first appearance (``sort=False``); per the pandas
        documentation, missing values receive the sentinel code ``-1``.
    """
    for col in cols:
        # Skip unknown columns explicitly instead of swallowing every
        # exception, so genuine errors are no longer hidden.
        if col not in df.columns:
            continue
        codes = pd.Series(pd.factorize(df[col], sort=False)[0], index=df.index, name=col)
        remaining = df.drop([col], axis=1)
        df = pd.concat([remaining, codes], axis=1, join='inner')
    return df

def splitTrainValidTest(df, percentage, data_name, random_state=None, data_dir='../data'):
    """Stratified train/valid/test split on ``Species``, written to CSV.

    Rows are grouped by the ``Species`` codes 0, 1 and 2 (rows with any other
    code are not included in any split). Each group is shuffled and cut at the
    same two fractions, then the per-species pieces are concatenated and
    shuffled again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-coded ``Species`` column.
    percentage : sequence of two floats
        ``[end of train, begin of test]`` as fractions of each species group,
        e.g. ``[0.5, 0.7]`` gives 50% train, 20% valid, 30% test.
    data_name : str
        Sub-folder of ``data_dir`` that receives ``train_data.csv``,
        ``valid_data.csv`` and ``test_data.csv``.
    random_state : int or None, optional
        Seed for the shuffles. ``None`` (default) keeps the previous behaviour
        of drawing from numpy's global random state.
    data_dir : str, optional
        Root folder for the output; defaults to ``'../data'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_valid, df_test)``.
    """
    train_end, test_begin = percentage[0], percentage[1]
    # One shared generator so a single seed reproduces every shuffle below.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def shuffle(frame):
        return frame.sample(frac=1, random_state=rng)

    train_parts, valid_parts, test_parts = [], [], []
    for code in (0, 1, 2):
        species_rows = shuffle(df[df['Species'] == code])
        n_rows = len(species_rows)
        cut_train = int(n_rows * train_end)
        cut_test = int(n_rows * test_begin)
        train_parts.append(species_rows.head(cut_train))
        valid_parts.append(species_rows.iloc[cut_train:cut_test])
        test_parts.append(species_rows.iloc[cut_test:n_rows])

    # Re-shuffle so the species are interleaved in each output file.
    df_train = shuffle(pd.concat(train_parts))
    df_valid = shuffle(pd.concat(valid_parts))
    df_test = shuffle(pd.concat(test_parts))

    out_dir = os.path.join(data_dir, data_name)
    # exist_ok tolerates a pre-existing folder without hiding other OS errors.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data.csv'), index=False)
    df_valid.to_csv(os.path.join(out_dir, 'valid_data.csv'), index=False)
    df_test.to_csv(os.path.join(out_dir, 'test_data.csv'), index=False)

    return df_train, df_valid, df_test

def splitTrainTest(df, percentage, data_name, random_state=None, data_dir='../data'):
    """Stratified train/test split on ``Species``, written to CSV.

    Rows are grouped by the ``Species`` codes 0, 1 and 2 (rows with any other
    code are not included). Each group is shuffled; its first
    ``int(len(group) * percentage)`` rows go to train and the rest to test.
    The per-species pieces are then concatenated and shuffled again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-coded ``Species`` column.
    percentage : float
        Fraction of each species group assigned to the training set.
    data_name : str
        Sub-folder of ``data_dir`` that receives ``train_data.csv`` and
        ``test_data.csv``.
    random_state : int or None, optional
        Seed for the shuffles. ``None`` (default) keeps the previous behaviour
        of drawing from numpy's global random state.
    data_dir : str, optional
        Root folder for the output; defaults to ``'../data'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``. Earlier versions returned ``None``; the
        frames are returned for consistency with ``splitTrainValidTest``.
    """
    # One shared generator so a single seed reproduces every shuffle below.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def shuffle(frame):
        return frame.sample(frac=1, random_state=rng)

    train_parts, test_parts = [], []
    for code in (0, 1, 2):
        species_rows = shuffle(df[df['Species'] == code])
        n_rows = len(species_rows)
        n_train = int(n_rows * percentage)
        train_parts.append(species_rows.head(n_train))
        test_parts.append(species_rows.tail(n_rows - n_train))

    # Re-shuffle so the species are interleaved in each output file.
    df_train = shuffle(pd.concat(train_parts))
    df_test = shuffle(pd.concat(test_parts))

    out_dir = os.path.join(data_dir, data_name)
    # exist_ok tolerates a pre-existing folder without hiding other OS errors.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data.csv'), index=False)
    df_test.to_csv(os.path.join(out_dir, 'test_data.csv'), index=False)

    return df_train, df_test
    
def getTrainSamples(df, size, source, data_name, random_state=None, data_dir='../data'):
    """Draw a stratified subset of about ``size`` rows and write it to CSV.

    Rows are grouped by the ``Species`` codes 0, 1 and 2. Classes 0 and 2
    contribute ``int(len(class) * size / len(df))`` rows each and class 1
    receives the remainder, so the three counts add up to ``int(size)``.
    If that remainder exceeds the rows available in class 1, the surplus is
    taken from classes 0 and 2 (while they still have unused rows) so the
    result is not silently shorter than requested.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-coded ``Species`` column and be non-empty.
    size : int
        Requested number of rows; also used in the output file name.
    source : str
        Sub-folder of ``data_dir`` that receives ``train_data_<size>.csv``.
    data_name : str
        Unused; kept so existing callers keep working.
    random_state : int or None, optional
        Seed for the shuffles. ``None`` (default) keeps the previous behaviour
        of drawing from numpy's global random state.
    data_dir : str, optional
        Root folder for the output; defaults to ``'../data'``.

    Returns
    -------
    pandas.DataFrame
        The shuffled subset that was written to disk.
    """
    percentage = size / len(df)
    # One shared generator so a single seed reproduces every shuffle below.
    rng = None if random_state is None else np.random.RandomState(random_state)

    def shuffle(frame):
        return frame.sample(frac=1, random_state=rng)

    class_rows = [shuffle(df[df['Species'] == code]) for code in (0, 1, 2)]

    counts = [int(len(rows) * percentage) for rows in class_rows]
    # Class 1 absorbs the rounding remainder so the total equals int(size).
    counts[1] = int(size) - counts[0] - counts[2]

    # The remainder can exceed what class 1 holds (e.g. size = len(df) - 1);
    # hand the surplus to the other classes instead of dropping it.
    overflow = counts[1] - len(class_rows[1])
    if overflow > 0:
        counts[1] = len(class_rows[1])
        for idx in (0, 2):
            spare = max(0, len(class_rows[idx]) - counts[idx])
            extra = min(overflow, spare)
            counts[idx] += extra
            overflow -= extra

    parts = [rows.head(n_rows) for rows, n_rows in zip(class_rows, counts)]
    df_train = shuffle(pd.concat(parts))

    out_dir = os.path.join(data_dir, source)
    # exist_ok tolerates a pre-existing folder without hiding other OS errors.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data_' + str(size) + '.csv'), index=False)

    return df_train

In [3]:
# Reload the training split saved under train_0.5_valid_0.7_test and show
# how many rows it holds.
a = pd.read_csv("../data/train_0.5_valid_0.7_test/train_data.csv")
a.shape[0]

170

In [11]:
save_folder = 'train_0.5_valid_0.7_test'

# Load the outlier-free data and turn the text columns into integer codes.
df_remove_outliers = changeTextFeatureToNumeric(
    pd.read_csv("../data/data_remove_outliers.csv"),
    ['Clutch Completion', 'Sex', 'Species', 'a'],
)

# Per species: first 50% -> train, next 20% -> valid, last 30% -> test.
# a / b / c are the train / valid / test frames, in that order.
a, b, c = splitTrainValidTest(df_remove_outliers, [0.5, 0.7], save_folder)
df_remove_outliers

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Sex,Species
0,39.1,18.7,181.0,3750.0,,,0,0
1,39.5,17.4,186.0,3800.0,8.94956,-24.69454,1,0
2,40.3,18.0,195.0,3250.0,8.36821,-25.33302,1,0
3,36.7,19.3,193.0,3450.0,8.76651,-25.32426,1,0
4,39.3,20.6,190.0,3650.0,8.66496,-25.29805,0,0
...,...,...,...,...,...,...,...,...
337,47.2,13.7,214.0,4925.0,7.99184,-26.20538,1,2
338,46.8,14.3,215.0,4850.0,8.41151,-26.13832,1,2
339,50.4,15.7,222.0,5750.0,8.30166,-26.04117,0,2
340,45.2,14.8,212.0,5200.0,8.24246,-26.11969,1,2


In [12]:
# Write stratified training subsets of 50, 80, 110, 140 and 170 rows drawn from `a`.
for sample_size in range(50, 180, 30):
    getTrainSamples(a, sample_size, 'train_0.5_valid_0.7_test', 'a')

# Tackle NaN

In [35]:
def fillNaN(save_folder, file_name, data_dir='../data'):
    """Fill missing values with column means and save a ``_NaNmean`` copy.

    Reads ``<data_dir>/<save_folder>/<file_name>.csv``, replaces every NaN in
    a numeric column with that column's mean (computed from the same file),
    and writes the result next to the input as ``<file_name>_NaNmean.csv``.
    Non-numeric columns are left as they are.

    Parameters
    ----------
    save_folder : str
        Sub-folder of ``data_dir`` holding the input file.
    file_name : str
        Input file name without the ``.csv`` extension.
    data_dir : str, optional
        Root data folder; defaults to ``'../data'``.
    """
    folder = os.path.join(data_dir, save_folder)
    df = pd.read_csv(os.path.join(folder, file_name + '.csv'))
    # numeric_only=True: since pandas 2.0 a plain df.mean() raises a TypeError
    # as soon as the frame contains a non-numeric column.
    column_means = df.mean(numeric_only=True)
    df = df.fillna(column_means)
    df.to_csv(os.path.join(folder, file_name + '_NaNmean.csv'), index=False)

In [16]:
# Mean-impute each training subset file (train_data_50 ... train_data_170).
train_file_names = ['train_data_' + str(n_rows) for n_rows in range(50, 180, 30)]
for train_file in train_file_names:
    fillNaN('train_0.5_valid_0.7_test', train_file)

In [17]:
# Mean-impute the held-out test and validation files as well.
for split_file in ('test_data', 'valid_data'):
    fillNaN('train_0.5_valid_0.7_test', split_file)

# Normalization

In [5]:
def normalizationTrainTest(save_folder, file_name, data_dir='../data', label_col='Species'):
    """Standardize a training file and apply its statistics to test/valid.

    Reads ``<file_name>_NaNmean.csv`` from ``<data_dir>/<save_folder>``,
    computes the per-column mean and population standard deviation
    (``ddof=0``) of every column except ``label_col``, and writes:

    * ``<file_name>_Normalization.csv`` - the standardized training data,
    * ``<file_name>_test.csv``  - ``test_data_NaNmean.csv`` scaled with the
      training statistics,
    * ``<file_name>_valid.csv`` - ``valid_data_NaNmean.csv`` scaled likewise.

    The label column is copied through unchanged. The previous version ran it
    through the same formula with a mean override of 1, which wrote the labels
    shifted by one and divided by ``1 + eps``.

    Parameters
    ----------
    save_folder : str
        Sub-folder of ``data_dir`` holding the input files.
    file_name : str
        Training file name without the ``_NaNmean.csv`` suffix.
    data_dir : str, optional
        Root data folder; defaults to ``'../data'``.
    label_col : str, optional
        Column excluded from scaling; defaults to ``'Species'``.
    """
    eps = 1e-9  # keeps the division finite for constant columns (std == 0)
    folder = os.path.join(data_dir, save_folder)

    df_train = pd.read_csv(os.path.join(folder, file_name + '_NaNmean.csv'))
    feature_cols = [col for col in df_train.columns if col != label_col]
    train_data_mean = df_train[feature_cols].mean(axis=0)
    # ddof=0 matches what np.std(df, axis=0) computed in the original code.
    train_data_std = df_train[feature_cols].std(axis=0, ddof=0)

    def normalize(frame):
        # Scale the feature columns with the TRAIN statistics; everything
        # else (the label) is carried over untouched.
        scaled = (frame[feature_cols] - train_data_mean) / (train_data_std + eps)
        result = frame.copy()
        for col in feature_cols:
            result[col] = scaled[col]
        return result

    normalize(df_train).to_csv(
        os.path.join(folder, file_name + '_Normalization.csv'), index=False)

    for split_name, suffix in (('test_data_NaNmean', '_test'),
                               ('valid_data_NaNmean', '_valid')):
        df_split = pd.read_csv(os.path.join(folder, split_name + '.csv'))
        normalize(df_split).to_csv(
            os.path.join(folder, file_name + suffix + '.csv'), index=False)

In [6]:
# Standardize every training subset, together with the test/valid files scaled to it.
train_file_names = ['train_data_' + str(n_rows) for n_rows in range(50, 180, 30)]
for train_file in train_file_names:
    normalizationTrainTest('train_0.5_valid_0.7_test', train_file)

In [18]:
# Load the train_data_170 subset for a manual look at its column statistics.
# Note this is the file without the `_NaNmean` suffix, i.e. not the output of fillNaN.
df = pd.read_csv('../data/' + 'train_0.5_valid_0.7_test' + '/' + 'train_data_170' + '.csv')

In [33]:
# Column-wise mean and population standard deviation (ddof=0) of `df`.
train_data_mean = df.mean(axis=0)
train_data_std = df.std(axis=0, ddof=0)

# Override the Species entries with mean 0 / std 1.
train_data_mean['Species'] = 0
train_data_std['Species'] = 1

(train_data_mean, train_data_std)

(Culmen Length (mm)       44.008824
 Culmen Depth (mm)        17.228824
 Flipper Length (mm)     201.076471
 Body Mass (g)          4254.558824
 Delta 15 N (o/oo)         8.711458
 Delta 13 C (o/oo)       -25.697805
 Sex                       0.435294
 Species                   0.000000
 dtype: float64,
 Culmen Length (mm)       5.504948
 Culmen Depth (mm)        1.874298
 Flipper Length (mm)     14.526245
 Body Mass (g)          846.470499
 Delta 15 N (o/oo)        0.548068
 Delta 13 C (o/oo)        0.772809
 Sex                      0.541176
 Species                  1.000000
 dtype: float64)