# Description

Generates the training and testing CSVs used for CNN_6_3 and after. Details:
 - Splits by file
 - Normalizes the expression from 0 (no expression) to 1 (highest expression) by min-max scaling the absolute value of 'Observed log(TX/Txref)' over all files combined. This is the inverse of 'Observed log(TX/Txref)'

It does not include augmented data; it only uses the data from La Fleur's supplemental materials, including:
 - La Fleur et al (and De Novo Designs)
 - Urtecho et al
 - Hossain et al
 - Yu et al
 - Lagator (36N, Pl, and Pr)
 - Anderson Series

In [1]:
import glob
import os

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Names of the datasets that feed model training and testing.
# A sheet is exported only when its cleaned-up name appears in this set.
file_names = {
    'La_Fleur_et_al', 'De_Novo_Designs',
    'Urtecho_et_al', 'Hossain_et_al', 'Yu_et_al',
    'Lagator_36N', 'Lagator_Pl', 'Lagator_Pr',
    'Anderson_Series',
}

In [3]:
# Export each selected sheet of the La Fleur supplementary workbook to its own CSV.
excel_file = '../Data/LaFleur/41467_2022_32829_MOESM5_ESM.xlsx'
output_dir = '../Data/LaFleur_supp'

# Create the destination up front so the cell also works on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Maps every exported CSV name to the sheet that produced it, so that two
# sheets collapsing to the same name are reported instead of passing unnoticed.
exported_from = {}

# The context manager closes the workbook handle once all sheets are written.
with pd.ExcelFile(excel_file) as xls:
    for sheet_name in xls.sheet_names:
        # Drop everything from the first " (" onwards and replace spaces with
        # underscores to obtain the name that is matched against `file_names`.
        new_sheet_name = sheet_name.split(" (")[0].replace(' ', '_')
        if new_sheet_name not in file_names:
            continue

        if new_sheet_name in exported_from:
            # The later sheet still overwrites the earlier one (unchanged
            # behaviour); the warning makes the overwrite visible.
            print(f"WARNING: sheet '{sheet_name}' overwrites {new_sheet_name}.csv "
                  f"already written from sheet '{exported_from[new_sheet_name]}'")
        exported_from[new_sheet_name] = sheet_name

        df = xls.parse(sheet_name)
        output_csv = f'{output_dir}/{new_sheet_name}.csv'

        df.to_csv(output_csv, index=False)
        print(f"Saved {output_csv}")


Saved ../Data/LaFleur_supp/La_Fleur_et_al.csv
Saved ../Data/LaFleur_supp/La_Fleur_et_al.csv
Saved ../Data/LaFleur_supp/Urtecho_et_al.csv
Saved ../Data/LaFleur_supp/Hossain_et_al.csv
Saved ../Data/LaFleur_supp/Yu_et_al.csv
Saved ../Data/LaFleur_supp/Lagator_36N.csv
Saved ../Data/LaFleur_supp/Lagator_Pl.csv
Saved ../Data/LaFleur_supp/Lagator_Pr.csv
Saved ../Data/LaFleur_supp/Anderson_Series.csv
Saved ../Data/LaFleur_supp/De_Novo_Designs.csv


In [4]:
# Load all unprocessed select data, pool it, and fit one shared scaler on it.
# sorted() makes the row order of all_data.csv independent of the order in
# which the filesystem happens to list the files.
files = sorted(glob.glob('../Data/LaFleur_supp/*.csv'))
if not files:
    # Fail with an actionable message instead of pd.concat's generic error.
    raise FileNotFoundError("No CSV files found in '../Data/LaFleur_supp/'")
data = []

# Read and collect the data for fitting the scaler
for file in files:
    df = pd.read_csv(file)
    # Some files name the sequence column 'Sequence'; unify the column name.
    if 'Sequence' in df.columns:
        df = df.rename(columns={'Sequence': 'Promoter Sequence'})
    sequences = df['Promoter Sequence']
    # astype(str) alone would turn missing sequences into the literal text
    # 'NAN'; masking with notna() keeps them as real missing values.
    df['Promoter Sequence'] = sequences.astype(str).str.upper().where(sequences.notna())
    # basename() strips the directory with either path separator, so the
    # column holds just the file name on Windows and POSIX alike.
    df['File Name'] = os.path.basename(file)
    data.append(df[['File Name', 'Promoter Sequence', 'Observed log(TX/Txref)']])

# Concatenate all data into a single DataFrame
concatenated_data = pd.concat(data, ignore_index=True)

# Min-max scale the absolute value of the observed log ratio over the pooled
# data, so every file is normalized with the same minimum and maximum.
scaler = MinMaxScaler()
concatenated_data['Normalized Expression'] = scaler.fit_transform(concatenated_data[['Observed log(TX/Txref)']].abs())

# Make sure the output folder exists before writing into it.
os.makedirs('../Data/Cross Validation', exist_ok=True)
concatenated_data.to_csv('../Data/Cross Validation/all_data.csv', index=False)

In [5]:
# Write one normalized CSV per source file, reusing the scaler that was fitted
# on the pooled data so all files share the same scale.
for file in files:
    df = pd.read_csv(file)
    # Some files name the sequence column 'Sequence'; unify the column name.
    if 'Sequence' in df.columns:
        df = df.rename(columns={'Sequence': 'Promoter Sequence'})
    sequences = df['Promoter Sequence']
    # astype(str) alone would turn missing sequences into the literal text
    # 'NAN', which dropna() below could never remove; masking with notna()
    # keeps them as real missing values so they are dropped and counted.
    df['Promoter Sequence'] = sequences.astype(str).str.upper().where(sequences.notna())
    df['Normalized Expression'] = scaler.transform(df[['Observed log(TX/Txref)']].abs())

    df = df[['Promoter Sequence', 'Normalized Expression']]

    # Drop incomplete rows once and report how many were removed.
    complete_rows = df.dropna()
    print(f'{file} - {len(df) - len(complete_rows)} NaN values dropped')
    df = complete_rows

    # Same file name, written to the 'Cross Validation' folder instead.
    df.to_csv(file.replace('LaFleur_supp', 'Cross Validation'), index=False)

../Data/LaFleur_supp\Anderson_Series.csv - 0 NaN values dropped
../Data/LaFleur_supp\De_Novo_Designs.csv - 0 NaN values dropped
../Data/LaFleur_supp\Hossain_et_al.csv - 0 NaN values dropped
../Data/LaFleur_supp\Lagator_36N.csv - 0 NaN values dropped
../Data/LaFleur_supp\Lagator_Pl.csv - 0 NaN values dropped
../Data/LaFleur_supp\Lagator_Pr.csv - 0 NaN values dropped
../Data/LaFleur_supp\La_Fleur_et_al.csv - 0 NaN values dropped
../Data/LaFleur_supp\Urtecho_et_al.csv - 0 NaN values dropped
../Data/LaFleur_supp\Yu_et_al.csv - 0 NaN values dropped
