In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Reset to matplotlib's stock style first; the overrides below must come after
# it, because applying a style rewrites rcParams.
plt.style.use('default')
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100 # 200 e.g. is really fine, but slower

# Note: plt.tight_layout() is intentionally not called here. With no figure
# open it only creates (and displays) an empty figure; call it on a real
# figure after plotting instead.

# 'normal' is a font *weight*, not a font *family*; using it as the family makes
# matplotlib warn and fall back to its default. 'sans-serif' names that default
# generic family explicitly.
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 20}
matplotlib.rc('font', **font)

<Figure size 1200x800 with 0 Axes>

In [32]:
def changeTextFeatureToNumeric(df, cols):
    """Replace the given columns with integer codes from ``pd.factorize``.

    Each column is encoded on its own with ``sort=False``, so codes are handed
    out in order of first appearance (0, 1, 2, ...). With pandas' default
    settings, missing values are encoded as -1. Every encoded column is moved
    to the end of the frame, in the order it is listed in ``cols``. The frame
    passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    cols : iterable
        Column labels to encode. Labels that are not columns of ``df`` are
        skipped.

    Returns
    -------
    pandas.DataFrame
        New frame with the requested columns replaced by their integer codes.
    """
    for col in cols:
        if col not in df.columns:
            # Best-effort by design: a label that is not in the frame is
            # skipped. Any other failure is a real error and is allowed to
            # propagate instead of being silently swallowed.
            continue
        codes = pd.Series(pd.factorize(df[col], sort=False)[0], index=df.index, name=col)
        # Drop the text column and append the encoded one; this is what moves
        # encoded columns to the end of the frame.
        df = pd.concat([df.drop([col], axis=1), codes], axis=1, join='inner')
    return df

def splitTrainValidTest(df, percentage, data_name, random_state=None, data_root='../data'):
    """Split ``df`` into train/validation/test sets, stratified by ``Species``.

    Rows whose ``Species`` code is 0, 1 or 2 are shuffled per species and cut at
    the same two fractions, so each split keeps (up to rounding) the species
    proportions of the input. Rows with any other ``Species`` value end up in
    none of the splits. Each split is shuffled once more after the species are
    recombined, then written to ``<data_root>/<data_name>/train_data.csv``,
    ``valid_data.csv`` and ``test_data.csv`` (without the index).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-coded ``Species`` column.
    percentage : sequence of two floats
        ``[end of train, begin of test]`` as fractions of each species' row
        count; e.g. ``[0.5, 0.7]`` gives 50% train, 20% validation, 30% test.
    data_name : str
        Name of the output folder created under ``data_root``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for all shuffles. ``None`` keeps the previous
        behaviour of drawing from numpy's global random state.
    data_root : str, optional
        Directory under which the ``data_name`` folder is created.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_valid, df_test)``.

    Raises
    ------
    ValueError
        If ``percentage`` does not satisfy ``0 <= train end <= test begin <= 1``.
    """
    train_end, test_start = percentage[0], percentage[1]
    if not 0 <= train_end <= test_start <= 1:
        # A reversed pair would make the test slice overlap the train slice.
        raise ValueError(
            'percentage must satisfy 0 <= percentage[0] <= percentage[1] <= 1, got %r'
            % (percentage,)
        )

    # Build one generator for the whole call so that a seeded run is
    # reproducible while the individual shuffles stay independent of each other.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    def shuffle(frame):
        return frame.sample(frac=1, random_state=rng)

    train_parts, valid_parts, test_parts = [], [], []
    for species_code in (0, 1, 2):
        rows = shuffle(df[df['Species'] == species_code])
        lo = int(len(rows) * train_end)
        hi = int(len(rows) * test_start)
        train_parts.append(rows.iloc[:lo])
        valid_parts.append(rows.iloc[lo:hi])
        test_parts.append(rows.iloc[hi:])

    # Shuffle again so the species are interleaved instead of stacked in blocks.
    df_train = shuffle(pd.concat(train_parts))
    df_valid = shuffle(pd.concat(valid_parts))
    df_test = shuffle(pd.concat(test_parts))

    out_dir = os.path.join(data_root, data_name)
    # exist_ok tolerates a pre-existing folder but still surfaces real failures
    # (e.g. permission errors) instead of hiding them.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data.csv'), index=False)
    df_valid.to_csv(os.path.join(out_dir, 'valid_data.csv'), index=False)
    df_test.to_csv(os.path.join(out_dir, 'test_data.csv'), index=False)

    return df_train, df_valid, df_test

def splitTrainTest(df, percentage, data_name, random_state=None, data_root='../data'):
    """Split ``df`` into train/test sets, stratified by ``Species``.

    Rows whose ``Species`` code is 0, 1 or 2 are shuffled per species and cut at
    the same fraction, so both splits keep (up to rounding) the species
    proportions of the input. Rows with any other ``Species`` value end up in
    neither split. Each split is shuffled once more after the species are
    recombined, then written to ``<data_root>/<data_name>/train_data.csv`` and
    ``test_data.csv`` (without the index).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an integer-coded ``Species`` column.
    percentage : float
        Fraction of each species' rows that goes to the training set.
    data_name : str
        Name of the output folder created under ``data_root``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed (or generator) for all shuffles. ``None`` keeps the previous
        behaviour of drawing from numpy's global random state.
    data_root : str, optional
        Directory under which the ``data_name`` folder is created.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)``.

    Raises
    ------
    ValueError
        If ``percentage`` is not within ``[0, 1]``.
    """
    if not 0 <= percentage <= 1:
        raise ValueError('percentage must be within [0, 1], got %r' % (percentage,))

    # Build one generator for the whole call so that a seeded run is
    # reproducible while the individual shuffles stay independent of each other.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    def shuffle(frame):
        return frame.sample(frac=1, random_state=rng)

    train_parts, test_parts = [], []
    for species_code in (0, 1, 2):
        rows = shuffle(df[df['Species'] == species_code])
        cut = int(len(rows) * percentage)
        train_parts.append(rows.iloc[:cut])
        test_parts.append(rows.iloc[cut:])

    # Shuffle again so the species are interleaved instead of stacked in blocks.
    df_train = shuffle(pd.concat(train_parts))
    df_test = shuffle(pd.concat(test_parts))

    out_dir = os.path.join(data_root, data_name)
    # exist_ok tolerates a pre-existing folder but still surfaces real failures
    # (e.g. permission errors) instead of hiding them.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data.csv'), index=False)
    df_test.to_csv(os.path.join(out_dir, 'test_data.csv'), index=False)

    return df_train, df_test

In [33]:
# Output folder name; it spells out the split used below
# (train up to 0.5, validation up to 0.7, test for the rest).
save_folder = 'train_0.5_valid_0.7_test'
df_remove_outliers = pd.read_csv("../data/data_remove_Island.csv")
# Integer-encode the text columns. Only columns that exist in the CSV are
# listed, so the call does not depend on unknown names being ignored.
df_remove_outliers = changeTextFeatureToNumeric(df_remove_outliers, ['Clutch Completion', 'Sex', 'Species'])
# df_remove_outliers = df_remove_outliers.fillna(df_remove_outliers.mean())
# splitTrainTest(df_remove_outliers, percentage = 0.7, data_name = save_folder)
# a, b, c are the train, validation and test frames (in that order).
a, b, c = splitTrainValidTest(df_remove_outliers, percentage = [0.5, 0.7], data_name = save_folder)
df_remove_outliers

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
0,39.1,18.7,181.0,3750.0,,,0,0,0
1,39.5,17.4,186.0,3800.0,8.94956,-24.69454,0,1,0
2,40.3,18.0,195.0,3250.0,8.36821,-25.33302,0,1,0
3,36.7,19.3,193.0,3450.0,8.76651,-25.32426,0,1,0
4,39.3,20.6,190.0,3650.0,8.66496,-25.29805,0,0,0
...,...,...,...,...,...,...,...,...,...
337,47.2,13.7,214.0,4925.0,7.99184,-26.20538,1,1,2
338,46.8,14.3,215.0,4850.0,8.41151,-26.13832,0,1,2
339,50.4,15.7,222.0,5750.0,8.30166,-26.04117,0,0,2
340,45.2,14.8,212.0,5200.0,8.24246,-26.11969,0,1,2


In [34]:
# Training split: first frame returned by splitTrainValidTest.
a

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
220,50.0,16.3,230.0,5700.0,8.14756,-25.39369,0,0,2
264,43.6,13.9,217.0,4900.0,8.27141,-26.77650,0,1,2
89,35.7,18.0,202.0,3550.0,8.46531,-26.05621,0,1,0
212,46.8,16.5,189.0,3650.0,9.65061,-24.48153,0,1,1
266,50.5,15.9,225.0,5400.0,8.65803,-26.57585,1,0,2
...,...,...,...,...,...,...,...,...,...
35,38.8,20.0,190.0,3950.0,9.18985,-25.12255,0,0,0
277,43.2,14.5,208.0,4450.0,8.48367,-26.86485,0,1,2
190,53.5,19.9,205.0,4500.0,10.02544,-24.90816,1,0,1
295,47.5,14.2,209.0,4600.0,8.39299,-26.78733,0,1,2


In [35]:
# Validation split: second frame returned by splitTrainValidTest.
b

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
146,36.6,18.4,184.0,3475.0,8.68744,-25.83060,0,1,0
165,45.9,17.1,190.0,3575.0,9.12277,-24.90024,0,1,1
175,46.7,17.9,195.0,3300.0,9.74144,-24.59467,1,1,1
171,42.4,17.3,181.0,3600.0,9.35138,-24.68790,0,1,1
163,47.0,17.3,185.0,3700.0,8.72037,-24.80526,0,1,1
...,...,...,...,...,...,...,...,...,...
127,39.0,17.1,191.0,3050.0,9.19031,-25.73722,0,1,0
173,43.2,16.6,187.0,2900.0,9.35416,-25.01185,1,1,1
331,43.5,15.2,213.0,4650.0,8.21634,-26.11046,0,1,2
161,46.6,17.8,193.0,3800.0,8.95063,-24.59897,0,1,1


In [36]:
# Test split: third frame returned by splitTrainValidTest.
c

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
296,51.1,16.3,220.0,6000.0,8.40327,-26.76821,0,0,2
310,52.2,17.1,228.0,5400.0,8.36701,-25.89834,0,0,2
316,46.9,14.6,222.0,4875.0,7.88810,-26.04726,0,1,2
122,41.4,18.5,202.0,3875.0,9.59462,-25.42621,0,0,0
234,49.3,15.7,217.0,5850.0,8.07137,-25.52473,0,0,2
...,...,...,...,...,...,...,...,...,...
38,39.8,19.1,184.0,4650.0,,,1,0,0
324,46.8,16.1,215.0,5500.0,8.32359,-26.05756,0,0,2
209,50.2,18.8,202.0,3800.0,9.74492,-24.40400,0,0,1
181,40.9,16.6,187.0,3200.0,9.08458,-24.54903,0,1,1


In [13]:
# Plain positional split of the frame in its stored row order (no shuffling,
# no stratification): first 50% -> train, next 20% -> valid, last 30% -> test.
df_len = len(df_remove_outliers)
train_stop = int(df_len * 0.5)
test_start = int(df_len * 0.7)
train = df_remove_outliers.iloc[:train_stop]
valid = df_remove_outliers.iloc[train_stop:test_start]
test = df_remove_outliers.iloc[test_start:]

In [14]:
# First 50% of df_remove_outliers, taken in stored row order.
train

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
0,39.1,18.7,181.0,3750.0,,,0,0,0
1,39.5,17.4,186.0,3800.0,8.94956,-24.69454,0,1,0
2,40.3,18.0,195.0,3250.0,8.36821,-25.33302,0,1,0
3,36.7,19.3,193.0,3450.0,8.76651,-25.32426,0,1,0
4,39.3,20.6,190.0,3650.0,8.66496,-25.29805,0,0,0
...,...,...,...,...,...,...,...,...,...
166,50.5,19.6,201.0,4050.0,9.80590,-24.72940,0,0,1
167,50.3,20.0,197.0,3300.0,10.02019,-24.54704,1,0,1
168,58.0,17.8,181.0,3700.0,9.14382,-24.57994,1,1,1
169,46.4,18.6,190.0,3450.0,9.32105,-24.64162,0,1,1


In [15]:
# Rows between the 50% and 70% marks of df_remove_outliers, in stored row order.
valid

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
171,42.4,17.3,181.0,3600.0,9.35138,-24.68790,0,1,1
172,48.5,17.5,191.0,3400.0,9.42666,-24.26375,0,0,1
173,43.2,16.6,187.0,2900.0,9.35416,-25.01185,1,1,1
174,50.6,19.4,193.0,3800.0,9.28153,-24.97134,1,0,1
175,46.7,17.9,195.0,3300.0,9.74144,-24.59467,1,1,1
...,...,...,...,...,...,...,...,...,...
234,49.3,15.7,217.0,5850.0,8.07137,-25.52473,0,0,2
235,42.0,13.5,210.0,4150.0,7.63884,-25.52627,0,1,2
236,49.2,15.2,221.0,6300.0,8.27376,-25.00169,0,0,2
237,46.2,14.5,209.0,4800.0,7.84057,-25.37899,0,1,2


In [16]:
# Last 30% of df_remove_outliers, taken in stored row order.
test

Unnamed: 0,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo),Clutch Completion,Sex,Species
239,50.2,14.3,218.0,5700.0,7.89620,-25.37746,0,0,2
240,45.1,14.5,215.0,5000.0,7.63220,-25.46569,0,1,2
241,46.5,14.5,213.0,4400.0,7.90436,-25.39470,0,1,2
242,46.3,15.8,215.0,5050.0,7.90971,-25.38157,0,0,2
243,42.9,13.1,215.0,5000.0,7.68528,-25.39181,0,1,2
...,...,...,...,...,...,...,...,...,...
337,47.2,13.7,214.0,4925.0,7.99184,-26.20538,1,1,2
338,46.8,14.3,215.0,4850.0,8.41151,-26.13832,0,1,2
339,50.4,15.7,222.0,5750.0,8.30166,-26.04117,0,0,2
340,45.2,14.8,212.0,5200.0,8.24246,-26.11969,0,1,2
