# Goal 
Save the test and train trial data for ML in formats that will be easy to load into an XGBoost model. 
This is specifically geared to create a training file that will contain the data from all trials, except trial X. It is correspondingly named after trial X, and the corresponding test file (also named for trial X) contains data only from trial X.
Hence, for the extrapolating scripts, the training set for trial X is used along with the test set for trial X. 
Meanwhile, for the interpolating script, only the test set for trial X is used: when training the model, 75% of that data is used and 25% is withheld. 

In [1]:
import numpy as np 
import pandas 
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score # This seems to only work for classification
from sklearn.metrics import explained_variance_score
import math


In [2]:
# Trials for which good data is available.
acceptable_trials = [
    'm07_t01_15', 'm07_t03_15', 'm07_t06_15',
    'm10_t02_16',
    'm11_t02_16', 'm11_t04_16',
    'm12_t02_16',
    'm14_t05_16', 'm14_t03_16',
    'm15_t01_16', 'm15_t03_16',
]

# Leave-one-out split: entry k of test_trials is the held-out trial and
# entry k of train_trials is the list of every other trial, in original order.
test_trials = list(acceptable_trials)
train_trials = [
    acceptable_trials[:held_out] + acceptable_trials[held_out + 1:]
    for held_out in range(len(acceptable_trials))
]
    
 

In [5]:
# Train on N-1 of N trials and test on the removed trial.
#
# For every held-out trial this cell writes four CSV files:
#   ./MLFormattedData/Train/<trial>_TrainX.csv, <trial>_TrainY.csv
#       predictors / target pooled from all the *other* trials
#   ./MLFormattedData/Test/<trial>_TestX.csv,  <trial>_TestY.csv
#       predictors / target from the held-out trial only

test_column = 'D10'  # target column written to the *Y.csv files
# NOTE(review): predictor_columns is not used by this cell (the predictors
# actually exported are `columns` below); kept in case other cells read it.
predictor_columns = ['Wingbeat_freq', 'M6_c', 'I20_I10', 'M3_i', 'A59_c']

# Base predictor columns, in the order they appear in the exported X files.
columns = ['M3_i', 'M6_c', 'I20_I10', 'A59_c']

# Number of time-shifted copies made of every base column (suffixes _0.._10).
N_SHIFTS = 11

# Shifted column names in export order: all base columns for shift 0, then
# all base columns for shift 1, and so on.
shift_column_names = [col + '_' + str(j)
                      for j in range(N_SHIFTS)
                      for col in columns]


def _load_trial_xy(trial):
    """Load one trial and return its (X, y) frames with time-shifted predictors.

    Parameters
    ----------
    trial : str
        Trial identifier; the data is read from
        '../DataProcessing/ProcessedData/<trial>_det.csv'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        X holds `columns` followed by `shift_column_names`; y holds the single
        `test_column`. Both keep only rows with no missing values.
    """
    d = pandas.read_csv('../DataProcessing/ProcessedData/' + trial + '_det.csv')

    # Only these columns reach the output. interpolate() works column by
    # column, so selecting them before filling NaNs gives the same values as
    # interpolating a wider selection first.
    d = d[[test_column] + columns].interpolate()

    # Drop any rows interpolate() could not fill, then renumber the rows.
    d = d.dropna(how='any').reset_index(drop=True)

    # Column '<col>_<j>' holds the value of '<col>' from j + 1 rows later.
    for j in range(N_SHIFTS):
        for col in columns:
            d[col + '_' + str(j)] = d[col].shift(-(j + 1))

    # The shifts leave NaNs in the final rows of the trial; remove those rows.
    d = d.dropna()

    return d[columns + shift_column_names], d[[test_column]]


for held_out_trial, training_trials in zip(test_trials, train_trials):
    # --- Training set: every trial except the held-out one ------------------
    # Collect the per-trial frames and concatenate once; DataFrame.append was
    # removed in pandas 2.0 and re-copied the accumulated data on every call.
    train_parts = [_load_trial_xy(trial) for trial in training_trials]
    X = pandas.concat([part_X for part_X, _ in train_parts]).reset_index(drop=True)
    y = pandas.concat([part_y for _, part_y in train_parts]).reset_index(drop=True)

    X.to_csv('./MLFormattedData/Train/' + held_out_trial + '_TrainX.csv', index=False)
    y.to_csv('./MLFormattedData/Train/' + held_out_trial + '_TrainY.csv', index=False)

    # Release the pooled training data before loading the test trial.
    del X, y, train_parts

    # --- Test set: the held-out trial only ----------------------------------
    X, y = _load_trial_xy(held_out_trial)

    X.to_csv('./MLFormattedData/Test/' + held_out_trial + '_TestX.csv', index=False)
    y.to_csv('./MLFormattedData/Test/' + held_out_trial + '_TestY.csv', index=False)




In [6]:
# Scratch calculation: 11 / 200 evaluates to 0.055.
# NOTE(review): the notebook does not say what these two numbers represent — confirm or remove this cell.
11/200

0.055