In [1]:
# Standard library
import os

# Third-party
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Plot defaults shared by every figure created in this notebook.
plt.style.use('default')
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 100  # 200 e.g. is really fine, but slower
# plt.tight_layout() is intentionally not called here: with no figure open it
# only creates (and displays) an empty figure and does not affect later plots.
# Call fig.tight_layout() on each figure after it has been drawn instead.
# Global font settings applied to every figure.
# 'sans-serif' is a generic family that Matplotlib can always resolve; the
# name 'normal' is not a font family, so it triggers "findfont: Font family
# ['normal'] not found" warnings and a silent fallback to the default font.
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 20}
matplotlib.rc('font', **font)

<Figure size 1200x800 with 0 Axes>

In [2]:
def changeTextFeatureToNumeric(df, cols):
    """Replace text/categorical columns with integer codes.

    Every column named in ``cols`` is encoded with ``pd.factorize`` (codes are
    assigned in order of first appearance, starting at 0) and moved to the end
    of the frame. The input frame itself is not modified.

    Names in ``cols`` that are not columns of ``df`` are skipped, so the same
    column list can be reused for data sets that lack some of the columns.
    Any other failure (e.g. un-hashable values) is raised instead of being
    silently ignored.

    Note: ``pd.factorize`` encodes missing values as ``-1``. An encoded column
    therefore contains no NaN, and a later ``fillna`` will not touch those
    entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    cols : iterable
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested columns replaced by integer codes.
    """
    for col in cols:
        if col not in df.columns:
            # Best-effort: a requested column that does not exist is skipped.
            continue
        codes, _ = pd.factorize(df[col], sort=False)
        # drop() returns a new frame, so the caller's frame stays untouched;
        # re-adding the column afterwards places it last.
        df = df.drop(columns=[col])
        df[col] = codes
    return df

def splitTrainTest(df, percentage, data_name, random_state=None, data_dir='../data'):
    """Write a stratified train/test split of ``df`` to CSV files.

    Rows are grouped by the ``'Species'`` column. Each group is shuffled and
    its first ``int(len(group) * percentage)`` rows go to the training set,
    the rest to the test set, so every class keeps the same train/test ratio.
    Both sets are shuffled once more and saved as
    ``<data_dir>/<data_name>/train_data.csv`` and
    ``<data_dir>/<data_name>/test_data.csv`` (without the index).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; must contain a ``'Species'`` column.
    percentage : float
        Fraction (0..1) of each class assigned to the training set.
    data_name : str
        Name of the sub-folder that receives the two CSV files.
    random_state : int, numpy.random.RandomState or None, optional
        Seed for the shuffles. ``None`` (default) gives a different split on
        every call; pass an integer for a reproducible split.
    data_dir : str, optional
        Root folder for the output, ``'../data'`` by default.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``percentage`` is not within [0, 1].
    KeyError
        If ``df`` has no ``'Species'`` column.
    """
    if not 0 <= percentage <= 1:
        raise ValueError('percentage must be between 0 and 1, got %r' % (percentage,))

    # One generator shared by all shuffles: a single seed fixes the whole
    # split while each shuffle still draws a different permutation.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    train_parts, test_parts = [], []
    # Iterate over whatever classes are present instead of hard-coding 0/1/2.
    for _, species_rows in df.groupby('Species'):
        shuffled = species_rows.sample(frac=1, random_state=rng)
        n_train = int(len(shuffled) * percentage)
        train_parts.append(shuffled.iloc[:n_train])
        test_parts.append(shuffled.iloc[n_train:])

    # An input without rows yields no groups; fall back to an empty frame with
    # the same columns so that header-only CSV files are still written.
    empty = df.iloc[0:0]
    df_train = pd.concat(train_parts or [empty]).sample(frac=1, random_state=rng)
    df_test = pd.concat(test_parts or [empty]).sample(frac=1, random_state=rng)

    out_dir = os.path.join(data_dir, data_name)
    # exist_ok tolerates a re-run while real failures (e.g. permissions) surface.
    os.makedirs(out_dir, exist_ok=True)

    df_train.to_csv(os.path.join(out_dir, 'train_data.csv'), index=False)
    df_test.to_csv(os.path.join(out_dir, 'test_data.csv'), index=False)

# Create training and testing data

In [22]:
# 'data1' variant: text columns are integer-encoded, no fillna step is applied,
# and the result is split 80/20 into ../data/data1/.
df_remove_outliers = changeTextFeatureToNumeric(
    pd.read_csv("../data/data_remove_outliers.csv"),
    ['Island', 'Clutch Completion', 'Sex', 'Species', 'a'],
)
splitTrainTest(df_remove_outliers, percentage=0.8, data_name='data1')

In [23]:
# Reload the 'data1' split from disk.
train_data = pd.read_csv("../data/data1/train_data.csv")
test_data = pd.read_csv("../data/data1/test_data.csv")

# Training set: every column except 'Species' is a feature, 'Species' is the label.
X_train = train_data.drop(columns=['Species'])
y_train = train_data.loc[:, ['Species']].copy()
X_train.to_csv('../data/' + 'data1' + '/X_train.csv', index=False)
y_train.to_csv('../data/' + 'data1' + '/y_train.csv', index=False)

# Test set: same feature/label separation.
X_test = test_data.drop(columns=['Species'])
y_test = test_data.loc[:, ['Species']].copy()
X_test.to_csv('../data/' + 'data1' + '/X_test.csv', index=False)
y_test.to_csv('../data/' + 'data1' + '/y_test.csv', index=False)

# Tackle NaN

In [25]:
# 'data_nan_mean' variant: integer-encode the text columns, replace remaining
# NaN values with the column mean, then split 80/20 into ../data/data_nan_mean/.
df_remove_outliers = pd.read_csv("../data/data_remove_outliers.csv")
df_remove_outliers = changeTextFeatureToNumeric(df_remove_outliers, ['Island', 'Clutch Completion', 'Sex', 'Species', 'a'])
# numeric_only=True restricts the mean to numeric columns; without it pandas
# >= 2.0 raises a TypeError as soon as any text column is left in the frame.
# NOTE(review): the means are computed on the full data set before it is split,
# so test rows contribute to the values imputed into the training set.
df_remove_outliers = df_remove_outliers.fillna(df_remove_outliers.mean(numeric_only=True))
splitTrainTest(df_remove_outliers, percentage = 0.8, data_name = 'data_nan_mean')

In [26]:
# Folder (below ../data/) holding the mean-imputed split.
save_folder = 'data_nan_mean'

# Reload the split from disk.
train_data = pd.read_csv('../data/' + save_folder + '/train_data.csv')
test_data = pd.read_csv('../data/' + save_folder + '/test_data.csv')

# Training set: every column except 'Species' is a feature, 'Species' is the label.
X_train = train_data.drop(columns=['Species'])
y_train = train_data.loc[:, ['Species']].copy()
X_train.to_csv('../data/' + save_folder + '/X_train.csv', index=False)
y_train.to_csv('../data/' + save_folder + '/y_train.csv', index=False)

# Test set: same feature/label separation.
X_test = test_data.drop(columns=['Species'])
y_test = test_data.loc[:, ['Species']].copy()
X_test.to_csv('../data/' + save_folder + '/X_test.csv', index=False)
y_test.to_csv('../data/' + save_folder + '/y_test.csv', index=False)

In [27]:
# 'data_nan_zero' variant: integer-encode the text columns, replace remaining
# NaN values with 0, then split 80/20 into ../data/data_nan_zero/.
save_folder = 'data_nan_zero'
df_remove_outliers = changeTextFeatureToNumeric(
    pd.read_csv("../data/data_remove_outliers.csv"),
    ['Island', 'Clutch Completion', 'Sex', 'Species', 'a'],
).fillna(0)
splitTrainTest(df_remove_outliers, percentage=0.8, data_name=save_folder)

# Reload the split stored under the current save_folder.
train_data = pd.read_csv('../data/' + save_folder + '/train_data.csv')
test_data = pd.read_csv('../data/' + save_folder + '/test_data.csv')

# Training set: every column except 'Species' is a feature, 'Species' is the label.
X_train = train_data.drop(columns=['Species'])
y_train = train_data.loc[:, ['Species']].copy()
X_train.to_csv('../data/' + save_folder + '/X_train.csv', index=False)
y_train.to_csv('../data/' + save_folder + '/y_train.csv', index=False)

# Test set: same feature/label separation.
X_test = test_data.drop(columns=['Species'])
y_test = test_data.loc[:, ['Species']].copy()
X_test.to_csv('../data/' + save_folder + '/X_test.csv', index=False)
y_test.to_csv('../data/' + save_folder + '/y_test.csv', index=False)

# Create training and testing data for no-Island data

In [3]:
# No-Island variant: integer-encode the text columns, replace remaining NaN
# values with the column mean, then split 70/30 into ../data/<save_folder>/.
save_folder = 'data_nan_mean_no_Island_70_percent'
df_remove_outliers = pd.read_csv("../data/data_remove_Island.csv")
df_remove_outliers = changeTextFeatureToNumeric(df_remove_outliers, ['Clutch Completion', 'Sex', 'Species', 'a'])
# numeric_only=True restricts the mean to numeric columns; without it pandas
# >= 2.0 raises a TypeError as soon as any text column is left in the frame.
# NOTE(review): the means are computed on the full data set before it is split,
# so test rows contribute to the values imputed into the training set.
df_remove_outliers = df_remove_outliers.fillna(df_remove_outliers.mean(numeric_only=True))
splitTrainTest(df_remove_outliers, percentage = 0.7, data_name = save_folder)

In [4]:
# Reload the split stored under the current save_folder.
train_data = pd.read_csv('../data/' + save_folder + '/train_data.csv')
test_data = pd.read_csv('../data/' + save_folder + '/test_data.csv')

# Training set: every column except 'Species' is a feature, 'Species' is the label.
X_train = train_data.drop(columns=['Species'])
y_train = train_data.loc[:, ['Species']].copy()
X_train.to_csv('../data/' + save_folder + '/X_train.csv', index=False)
y_train.to_csv('../data/' + save_folder + '/y_train.csv', index=False)

# Test set: same feature/label separation.
X_test = test_data.drop(columns=['Species'])
y_test = test_data.loc[:, ['Species']].copy()
X_test.to_csv('../data/' + save_folder + '/X_test.csv', index=False)
y_test.to_csv('../data/' + save_folder + '/y_test.csv', index=False)