In [1]:
import sys
# Add the local ./libraries directory to the module search path.
# NOTE(review): the next cell imports via the package-qualified name
# `libraries.functions`; this entry may only be needed for bare-name imports
# made from inside ./libraries — confirm before removing.
sys.path.append('./libraries')

In [2]:
import pandas as pd
import pickle
import math
from libraries.functions import get_test_calves, find_optimal_calf_combinations_for_split, check_for_duplicates

# File Paths

In [3]:
# Output: pickle file that will hold the generated test / validation split definition.
train_validation_test_set_info_6_labels = './dataset/information_datasets/train_validation_test_set_info_6_labels.pkl'

# Input: CSV holding the per-calf data amounts for the 6-label setup.
data_amounts_dataset_path = './dataset/information_datasets/data_amounts_per_calf_6_labels.csv'

# Consts / Vars

In [4]:
# Behaviour labels taken into account when choosing candidate calves.
CONSIDERED_LABELS = [
    'drinking_milk',
    'grooming',
    'lying',
    'running',
    'walking',
    'other',
]

# Importing dataset information

In [5]:
# Load the per-calf data amounts table (one row per calf, one column per label).
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; `index_col=0` would drop it, but confirm first that the helper
# functions do not depend on column positions.
data_amounts_df = pd.read_csv(filepath_or_buffer=data_amounts_dataset_path)

# Preview the first five rows.
data_amounts_df.head(n=5)

Unnamed: 0,calf_id,drinking_milk,grooming,lying,other,running,walking
0,1302,51,154,1930,1229,106,38
1,1303,333,32,1093,1933,35,38
2,1306,45,95,794,1582,73,29
3,1308,281,135,1859,1455,161,60
4,1312,36,120,195,2025,50,6


# Identifying the best test calves

### Calves with data for all the labels

In [6]:
# Collect the calves eligible for the test set. Per the section heading these
# are the calves with data for all considered labels; the actual selection
# logic lives in the project helper `get_test_calves`.
candidate_test_calves = get_test_calves(data_amounts_df, CONSIDERED_LABELS)
print(candidate_test_calves)

[1302 1303 1306 1308 1312 1314 1319 1320 1328 1329 1333 1336 1343 1353
 1357 1372 1398 1405 1443 1452 1455]


# Train_Validation / Test split

In [7]:
# Fraction of all calves reserved for the test set (the next cell multiplies
# the calf count by this value and truncates with int()).
train_test_split = 0.3

In [8]:
# Size of each partition, derived from the number of distinct calf ids.
total_calves = len(data_amounts_df['calf_id'].unique())

# int() truncates, so the test share is rounded down; the remainder goes to
# the combined train/validation pool.
no_test_calves = int(total_calves * train_test_split)
no_train_validation_calves = total_calves - no_test_calves

print(
    f'Total Calves = {total_calves}'
    f' / No of Train_Validation Calves = {no_train_validation_calves}'
    f' / No of Test Calves = {no_test_calves}'
)

Total Calves = 30 / No of Train_Validation Calves = 21 / No of Test Calves = 9


In [9]:
# Pick the test calves from the candidates using the project helper.
# The last argument is the truncated train_validation-to-test calf count ratio
# (presumably the target split ratio — confirm against the helper).
test_calves = find_optimal_calf_combinations_for_split(
    candidate_test_calves,
    no_test_calves,
    data_amounts_df,
    int(no_train_validation_calves / no_test_calves),
)
print(test_calves)

(1302, 1303, 1314, 1319, 1328, 1329, 1336, 1353, 1405)


In [10]:
# Every calf that was not selected for the test set forms the combined
# train/validation pool (it is split further into train and validation later).
train_validation_calves = [
    calf_id
    for calf_id in data_amounts_df['calf_id'].unique()
    if calf_id not in test_calves
]

# Label the pool as Train_Validation: it is not yet the final training set,
# whose size is reported separately once the validation calves are carved out.
print(
    f'No of Train_Validation Calves = {len(train_validation_calves)}'
    f' / No of Test Calves = {len(test_calves)}'
)

No of Train Calves = 21 / No of Test Calves = 9


# Train / Validation sets

In [11]:
# Fraction of the train/validation calves used for validation (the next cell
# multiplies the pool size by this value and rounds up with math.ceil).
train_validation_split = 0.3

In [12]:
# The validation share is rounded up; whatever remains is used for training.
no_of_validation_calves = math.ceil(len(train_validation_calves) * train_validation_split)
no_of_train_calves = len(train_validation_calves) - no_of_validation_calves

print(
    f'No of Train Calves = {no_of_train_calves}'
    f' / No of Validation Calves = {no_of_validation_calves}'
)

No of Train Calves = 14 / No of Validation Calves = 7


In [13]:
# Number of cross-validation iterations, i.e. how many validation sets are requested.
cv = 10

# Validation calves are drawn only from candidates that are not already test calves.
candidate_validation_calves = [
    calf_id for calf_id in candidate_test_calves if calf_id not in test_calves
]

validation_iterations = find_optimal_calf_combinations_for_split(
    candidate_validation_calves,
    no_of_validation_calves,
    data_amounts_df,
    # Train-to-validation calf count ratio, rounded up.
    math.ceil(no_of_train_calves / no_of_validation_calves),
    is_test_set=False,
    # Pass the setting defined above rather than a second hard-coded 10, so the
    # request and the length check below can never drift apart.
    cv=cv,
)

# Report whether the helper returned exactly the requested number of sets.
# The generated sets are displayed in the next cell; a bare expression inside
# this if/else would not be rendered by Jupyter.
if len(validation_iterations) != cv:
    print('Number of possible combinations < cv')
else:
    print('CV iterations were successfully generated')

CV iterations were successfully generated


In [14]:
# Display the generated validation sets (one tuple of calf ids per CV iteration).
validation_iterations

[(1306, 1308, 1312, 1398, 1443, 1452, 1455),
 (1306, 1308, 1320, 1398, 1443, 1452, 1455),
 (1306, 1308, 1320, 1357, 1443, 1452, 1455),
 (1306, 1308, 1312, 1357, 1443, 1452, 1455),
 (1306, 1308, 1320, 1372, 1443, 1452, 1455),
 (1308, 1312, 1320, 1357, 1443, 1452, 1455),
 (1308, 1312, 1320, 1398, 1443, 1452, 1455),
 (1306, 1308, 1357, 1372, 1443, 1452, 1455),
 (1306, 1308, 1372, 1398, 1443, 1452, 1455),
 (1308, 1312, 1357, 1398, 1443, 1452, 1455)]

In [15]:
# Sanity check with the project helper; its return value is shown as the cell
# output (a success message in the recorded run).
# NOTE(review): what it returns or raises when duplicates exist is not visible here.
check_for_duplicates(validation_iterations)

'Success: No Duplicates found'

In [16]:
# Bundle the split definition so it can be persisted and reused:
#   all_calves      - every distinct calf id in the data amounts table
#   test_set        - the calves selected for the test set
#   validation_sets - the generated validation sets, one per CV iteration
validation_test_set_info = {
    'all_calves': list(data_amounts_df['calf_id'].unique()),
    'test_set': test_calves,
    'validation_sets': validation_iterations,
}

In [17]:
# Persist the split definition as a pickle; the context manager closes the
# file even if serialisation fails.
with open(train_validation_test_set_info_6_labels, mode='wb') as split_info_file:
    pickle.dump(validation_test_set_info, split_info_file)