In [None]:
import sys
# Add the local ./libraries directory to the module search path. The path is relative,
# so it resolves against the working directory the notebook is run from.
sys.path.append('./libraries')

In [None]:
import pandas as pd
import pickle
import math
from libraries.functions import get_test_calves, find_optimal_calf_combinations_for_split, check_for_duplicates

# File Paths

In [None]:
# Input: CSV file loaded into `data_amounts_df` (per-calf data amounts, going by its name).
data_amounts_dataset_path = (
    './dataset/information_datasets/data_amounts_per_calf_6_labels.csv'
)
# Output: pickle file that receives the train / validation / test split information.
train_validation_test_set_info_6_labels = (
    './dataset/information_datasets/train_validation_test_set_info_6_labels.pkl'
)

# Constants / Variables

In [None]:
# Behaviour labels handed to `get_test_calves` when picking candidate test calves.
CONSIDERED_LABELS = [
    'drinking_milk',
    'grooming',
    'lying',
    'running',
    'walking',
    'other',
]

# Importing dataset information

In [None]:
# Load the data-amount table from disk and preview its first five rows.
data_amounts_df = pd.read_csv(filepath_or_buffer=data_amounts_dataset_path)
data_amounts_df.head(n=5)

# Identifying the best test calves

### Calves with data for all the labels

In [None]:
# Determine which calves are eligible for the test set, based on CONSIDERED_LABELS.
# NOTE(review): going by the section heading these should be the calves that have data
# for every considered label; the actual criterion lives in
# libraries.functions.get_test_calves — confirm there.
candidate_test_calves = get_test_calves(data_amounts_df, CONSIDERED_LABELS)
print(candidate_test_calves)

# Train_Validation / Test split

In [None]:
# Fraction of all calves reserved for the test set (used to size the test set below
# and also passed to the split helper).
train_test_split = 0.3

In [None]:
# Work out how many calves go to the test set and how many stay for train/validation.
# int() truncates, so the test set size is rounded down.
total_calves = len(data_amounts_df['calf_id'].unique())
no_test_calves = int(total_calves * train_test_split)
no_train_validation_calves = total_calves - no_test_calves
print(
    'Total Calves = ', total_calves,
    ' / No of Train_Validation Calves = ', no_train_validation_calves,
    ' / No of Test Calves = ', no_test_calves,
    sep='',
)

In [None]:
# Pick `no_test_calves` calves out of the candidates to form the test set.
# NOTE(review): the selection criterion is implemented in
# libraries.functions.find_optimal_calf_combinations_for_split — presumably it looks for
# the combination whose data share is closest to `train_test_split`; confirm in the helper.
test_calves = find_optimal_calf_combinations_for_split(candidate_test_calves, no_test_calves, data_amounts_df,
                                                       train_test_split)
print(test_calves)

In [None]:
# Every calf that was not selected for the test set stays available for training/validation.
train_validation_calves = [x for x in data_amounts_df['calf_id'].unique() if x not in test_calves]
# This count covers train + validation calves together (the train/validation split has
# not happened yet), so it is labelled 'Train_Validation' rather than 'Train'.
print('No of Train_Validation Calves = ' + str(len(train_validation_calves)) +
      ' / No of Test Calves = ' + str(len(test_calves)))

# Train / Validation sets

In [None]:
# Fraction of the train/validation calves that is held out as the validation set.
train_validation_split = 0.3

In [None]:
# Size the validation set (rounded up) and leave the remaining calves for training.
no_of_validation_calves = math.ceil(train_validation_split * len(train_validation_calves))
no_of_train_calves = len(train_validation_calves) - no_of_validation_calves
print(
    'No of Train Calves = ', no_of_train_calves,
    ' / No of Validation Calves = ', no_of_validation_calves,
    sep='',
)

In [None]:
# Number of cross-validation iterations (validation calf combinations) to generate.
cv = 10

# Validation candidates: the test candidates that were not used for the test set.
candidate_validation_calves = [x for x in candidate_test_calves if x not in test_calves]

# Pass the train/validation ratio (not the train/test one) and the `cv` value defined
# above, so that changing either setting actually reaches the helper and stays in sync
# with the length check below.
# NOTE(review): assumes the 4th positional argument is the split ratio for the set being
# built — confirm against libraries.functions.find_optimal_calf_combinations_for_split.
validation_iterations = find_optimal_calf_combinations_for_split(
    candidate_validation_calves,
    no_of_validation_calves,
    data_amounts_df,
    train_validation_split,
    is_test_set=False,
    cv=cv,
)

# The helper is expected to return exactly `cv` combinations; anything else is reported.
if len(validation_iterations) != cv:
    print('Number of possible combinations < cv')
else:
    print('CV iterations were successfully generated')

In [None]:
# Display the validation iterations returned by the split helper.
validation_iterations

In [None]:
# Run the duplicate check on the generated validation iterations.
# NOTE(review): presumably this flags calf combinations that occur in more than one
# iteration; what it returns or prints is defined in
# libraries.functions.check_for_duplicates — confirm there.
check_for_duplicates(validation_iterations)

In [None]:
# Collect everything that describes the split: all calf ids, the calves chosen for the
# test set, and the validation sets produced for the CV iterations.
validation_test_set_info = {}
validation_test_set_info['all_calves'] = list(data_amounts_df['calf_id'].unique())
validation_test_set_info['test_set'] = test_calves
validation_test_set_info['validation_sets'] = validation_iterations

In [None]:
# Persist the split information to disk; pickle needs the file opened in binary mode.
with open(train_validation_test_set_info_6_labels, mode='wb') as pkl_file:
    pickle.dump(validation_test_set_info, pkl_file)