In [1]:
import random
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split


In [2]:
def participant_train_test_split(dataframe, train_val_ratio=0.8, seed=13):
    """Generate train and test split based on participants. Assigns thermal 
    comfort vote as target variable
    """

    random.seed(seed)
    df = dataframe.copy()
    list_participants = dataframe['user_id'].unique()
    random.shuffle(list_participants)

    test_participants = random.sample(set(list_participants), 
                                      int(round((1 - train_val_ratio) * len(list_participants))))
    df_test = df[df['user_id'].isin(test_participants)]

    print("Testing on participants: {}".format(df_test['user_id'].unique()))
    
    # use the rest for training (the negate of above)
    df_train = df[~df['user_id'].isin(test_participants)]
  
    # move thermal comfort response to the end of the dataframe
    df_train_y = df_train.pop('thermal_cozie')
    df_test_y = df_test.pop('thermal_cozie')
    df_train['thermal_cozie'] = df_train_y
    df_test['thermal_cozie'] = df_test_y
         
    # shuffle 
    df_train.sample(frac=1, random_state=100).reset_index(drop=True, inplace=True)
    df_test.sample(frac=1, random_state=100).reset_index(drop=True, inplace=True)
    
    return df_train, df_test


In [3]:
def personal_train_test_split(dataframe, train_val_ratio=0.8):
    """Generate train and test split for each participant's data. It takes the inital
    `train_val_ratio` instances of each participants as train data and the rest as
    validation data.
    """

    df = dataframe.copy()
    train_max_idx = round(len(df) * train_val_ratio)
    df_train = df.iloc[:train_max_idx]
    df_test = df.iloc[train_max_idx:]
    
    # move thermal comfort response to the end of the dataframe
    df_train_y = df_train.pop('thermal_cozie')
    df_test_y = df_test.pop('thermal_cozie')
    df_train['thermal_cozie'] = df_train_y
    df_test['thermal_cozie'] = df_test_y
    
    print("Personal training instances: {}".format(len(df_train)))
    print("Personal validation instances: {}".format(len(df_test)))
        
    return df_train, df_test
    

In [4]:
def split_save(dataframe, name, participant_split=True, train_val_ratio=0.8, seed=13):
    if participant_split:
        dataframe_train, dataframe_test = participant_train_test_split(dataframe, train_val_ratio, 
                                                                       seed)
    else:
        dataframe_train, dataframe_test = personal_train_test_split(dataframe, train_val_ratio)
    name_train = name + "_train"
    name_test = name + "_val"
    save_df(dataframe_train, name_train, participant_split)
    save_df(dataframe_test, name_test, participant_split)

    return dataframe_train, dataframe_test


In [5]:
def save_df(dataframe, file_name, participant_split):
    if participant_split:
        new_name = file_date + "_" + file_name + ".csv"
    else:
        user_id = dataframe['user_id'].iloc[0]
        new_name = file_date + "_" + file_name + "_" + user_id + ".csv"
    dataframe.to_csv("../data-processed-preferences/" + new_name, index=False)
    

# Variables

In [6]:
seed = 13
dataframes_names = ['fs1', 'fs2', 'fs3', 'fs4', 'fs5', 'fs6']
folder_path = 'data-processed-preferences/'
file_date = '2019-11-15'
df_files = ['fs1.csv', 'fs2.csv', 'fs3.csv', 'fs4.csv', 'fs5.csv', 'fs6.csv']


# Generate Train-Test Split by participant

In [7]:
dataframes = []

for df_file in df_files:
    dataframes.append(pd.read_csv('../' + folder_path + file_date + '_' + df_file))

print(dataframes[0].shape)
print(dataframes[1].shape)
print(dataframes[2].shape)
print(dataframes[3].shape)
print(dataframes[4].shape)
print(dataframes[5].shape)


(1474, 12)
(1474, 14)
(1474, 27)
(1474, 23)
(1474, 22)
(1474, 21)


## Save train-test splits by participants

In [8]:
i = 0
for df in dataframes: 
    split_save(df, dataframes_names[i], participant_split=True, 
               train_val_ratio=0.8, seed=seed)
    i += 1


Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']
Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']
Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']
Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']
Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Testing on participants: ['cresh12' 'cresh06' 'cresh03' 'cresh16' 'cresh29' 'cresh24']


# Generate Train-Test Split for each participant

In [9]:
participant_list = dataframes[0]['user_id'].unique()


In [10]:
i = 0
for df in dataframes:
    print("Dataframe: {}".format(dataframes_names[i]))
    for participant in participant_list:
        df_participant = df[df['user_id'] == participant]
        print("Participant: {}".format(participant))
        split_save(df_participant, dataframes_names[i], participant_split=False, 
                   train_val_ratio=0.6, seed=seed)
    i += 1
    

Dataframe: fs1
Participant: cresh07
Personal training instances: 28
Personal validation instances: 19
Participant: cresh10
Personal training instances: 28
Personal validation instances: 18
Participant: cresh08
Personal training instances: 24
Personal validation instances: 16
Participant: cresh12
Personal training instances: 30
Personal validation instances: 20
Participant: cresh09
Personal training instances: 31
Personal validation instances: 20
Participant: cresh06
Personal training instances: 73
Personal validation instances: 49
Participant: cresh02
Personal training instances: 35
Personal validation instances: 24
Participant: cresh13
Personal training instances: 31
Personal validation instances: 20
Participant: cresh15
Personal training instances: 22
Personal validation instances: 15
Participant: cresh03
Personal training instances: 27
Personal validation instances: 18
Participant: cresh14
Personal training instances: 41
Personal validation instances: 27
Participant: cresh11
Persona

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Participant: cresh17
Personal training instances: 25
Personal validation instances: 17
Participant: cresh24
Personal training instances: 28
Personal validation instances: 19
Participant: cresh28
Personal training instances: 18
Personal validation instances: 12
Participant: cresh25
Personal training instances: 23
Personal validation instances: 15
Dataframe: fs2
Participant: cresh07
Personal training instances: 28
Personal validation instances: 19
Participant: cresh10
Personal training instances: 28
Personal validation instances: 18
Participant: cresh08
Personal training instances: 24
Personal validation instances: 16
Participant: cresh12
Personal training instances: 30
Personal validation instances: 20
Participant: cresh09
Personal training instances: 31
Personal validation instances: 20
Participant: cresh06
Personal training instances: 73
Personal validation instances: 49
Participant: cresh02
Personal training instances: 35
Personal validation instances: 24
Participant: cresh13
Persona