# Description

Generates the training and testing CSVs used for CNN_6_3 and after. Details:
 - Splits by file
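 - Normalizes the expression from 0 (no expression) to 1 (highest expression). This is the inverse of 'Observed log(TX/Txref)': the absolute value of the observed value is min-max scaled using a single scaler fitted across all files.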

It does not include augmented data; it only uses the data from La Fleur's supplemental materials, including:
 - La Fleur et al (and De Novo Designs)
 - Urtecho et al
 - Hossain et al
 - Yu et al
 - Lagator (36N, Pl, and Pr)
 - Anderson Series

In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import glob

# Column names shared by every cross-validation CSV.
SEQ_COL = 'Promoter Sequence'
OBS_COL = 'Observed log(TX/Txref)'
NORM_COL = 'Normalized Expression'

# Load all the data (each file is read exactly once and reused below)
files = glob.glob('../Data/Cross Validation/*.csv')
frames = {file: pd.read_csv(file) for file in files}

# Concatenate everything so one scaler is shared by all data sets
data = pd.concat(list(frames.values()))

# Fit the scaler using the absolute values of the observed expression.
# Only rows that will actually be kept (sequence AND observation present)
# take part in the fit, so a row that is discarded later cannot define the
# min/max of the scale.
# NOTE(review): abs() maps +x and -x to the same value; this presumably
# relies on all observed values sharing one sign - confirm against the data.
scaler = MinMaxScaler()
scaler.fit(data.dropna(subset=[SEQ_COL, OBS_COL])[[OBS_COL]].abs())

# Normalize the expression values and save the files (overwrites the inputs
# in place, keeping only the three columns below)
for file, raw in frames.items():
    # Drop incomplete rows BEFORE touching the sequence column:
    # astype(str) would turn a missing sequence into the literal string
    # 'nan' ('NAN' after upper()), which dropna() can no longer detect.
    df = raw[[SEQ_COL, OBS_COL]].dropna().copy()
    print(f'{file} - {len(raw) - len(df)} NaN values dropped')

    df[SEQ_COL] = df[SEQ_COL].astype(str).str.upper()
    df[NORM_COL] = scaler.transform(df[[OBS_COL]].abs())
    df = df[[SEQ_COL, OBS_COL, NORM_COL]]

    df.to_csv(file, index=False)


../Data/Cross Validation\Anderson_Series.csv - 4 NaN values dropped
../Data/Cross Validation\De_Novo_Designs.csv - 3 NaN values dropped
../Data/Cross Validation\Hossain_et_al.csv - 0 NaN values dropped
../Data/Cross Validation\Lagator_36N.csv - 0 NaN values dropped
../Data/Cross Validation\Lagator_Pl.csv - 0 NaN values dropped
../Data/Cross Validation\Lagator_Pr.csv - 0 NaN values dropped
../Data/Cross Validation\La_Fleur_et_al.csv - 0 NaN values dropped
../Data/Cross Validation\Urtecho_et_al.csv - 0 NaN values dropped
../Data/Cross Validation\Yu_et_al.csv - 0 NaN values dropped
