# Split the dataset into development and final test sets

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.utils

In [None]:
# Set to True to write the balanced, development and final-test datasets to
# disk; while False, the np.savetxt calls guarded by this flag are skipped.
do_save = False

In [None]:
# Location of the complete dataset: <data_dir>/<data_name>/<data_name>_ALL.txt
data_dir = '../../data'
data_name = 'SonyAIBORobotSurface1_IoC'
data_filename = '/'.join([data_dir, data_name, data_name + '_ALL.txt'])

# Read the whole file into a NumPy array and report its dimensions.
robot_data = np.loadtxt(Path(data_filename))
print('The shape of robot_data is', robot_data.shape)

In [None]:
# Column 0 of every row is treated as the class label (1 or 2).
y_data = robot_data[:, 0]
print('Number of samples of class 1', np.count_nonzero(y_data == 1.0))
print('Number of samples of class 2', np.count_nonzero(y_data == 2.0))

# Bar chart of the number of samples per class.
y_df = pd.DataFrame(y_data)
y_df[0].value_counts().plot(kind='bar')

In [None]:
# Shuffle the rows with a fixed seed so the row order is identical on every
# run; without a seed each execution yields a different permutation.  The seed
# value matches the random_state used for the train/test split in this file.
robot_data = sklearn.utils.shuffle(robot_data, random_state=21)

# Create a balanced dataset

In [None]:
# Separate the samples by class label (stored in column 0).
robot_df = pd.DataFrame(robot_data)
class1_df = robot_df[robot_df[0] == 1]
class2_df = robot_df[robot_df[0] == 2]

# Row counts per class; the smaller one sets the size of both selections.
n1 = len(class1_df)
n2 = len(class2_df)
n = min(n1, n2)
print('Selection set sizes:', n1, n2)
print('Min selection set size:', n)

# Keep the first n rows of each class so both classes are equally represented.
class1_df = class1_df.iloc[:n]
class2_df = class2_df.iloc[:n]

# Concatenation leaves the rows grouped by class, so mix them again.  The
# fixed seed makes this permutation repeatable for a given input.
balanced_df = pd.concat([class1_df, class2_df])
balanced_df = sklearn.utils.shuffle(balanced_df, random_state=21)
balanced_df[0].value_counts().plot(kind='bar')

# Save the balanced dataset

In [None]:
# The file name is built unconditionally: the reload step reads it even when
# nothing is written here.
data_filename = f'{data_name}_BALANCED.txt'
if do_save:
    # Tab-separated values, each formatted with '%8e'.
    np.savetxt(data_filename, balanced_df.to_numpy(), delimiter='\t', fmt='%8e')

# Reload and check the balanced dataset

In [None]:
# Read the saved file back when it is present.  The file is only written when
# do_save is True, so on a run without a saved copy fall back to the in-memory
# balanced data instead of failing with a missing-file error.
# NOTE: the in-memory values are not rounded by the '%8e' text format, so they
# can differ slightly from what a saved file would contain.
balanced_path = Path(data_filename)
if balanced_path.is_file():
    balanced_data = np.loadtxt(balanced_path)
else:
    print('Balanced dataset file not found, using in-memory data instead of', balanced_path)
    balanced_data = balanced_df.to_numpy()
print('The shape of balanced_data is', balanced_data.shape)

# Check the class balance: count and plot the labels in column 0.
y_data = balanced_data[:,0]
print('Number of samples of class 1', (y_data == 1.0).sum())
print('Number of samples of class 2', (y_data == 2.0).sum())
y_df = pd.DataFrame(y_data)
y_df[0].value_counts().plot(kind='bar')

# Split the data

In [None]:
# Hold out 100 samples as the final test set; the remainder is the development
# set.  The split is stratified on the label column and uses a fixed
# random_state so it is repeatable.
train_data, test_data = train_test_split(
    balanced_data,
    test_size=100,
    stratify=balanced_data[:, 0],
    random_state=21,
)
print('The shape of train_data is', train_data.shape)
print('The shape of test_data is', test_data.shape)

# Per-class sample counts of each part.
for heading, part in (('Training data:', train_data), ('Test data:', test_data)):
    part_labels = part[:, 0]
    print(heading)
    print('Number of samples of class 1', (part_labels == 1.0).sum())
    print('Number of samples of class 2', (part_labels == 2.0).sum())

# Save the development and final test datasets

In [None]:
if do_save:
    # Write each part to its own tab-separated file; data_filename ends up
    # holding the name of the last file written.
    for suffix, part in (('_DEV.txt', train_data), ('_FINAL_TEST.txt', test_data)):
        data_filename = data_name + suffix
        np.savetxt(data_filename, part, delimiter='\t', fmt='%8e')