## Module for Agilent Fragment Analyzer data analysis

### Important Instructions:
The Fragment Analyzer file **MUST** be exported with only three selections in the export section.<br> 
`Peak ID` `Size (bp)` `% (Conc.) (ng/uL)`

After export, the header of the exported CSV has to be changed to <br>
`Well`, `Size (bp)`, `Concentration` <br>
by adding a new row at the top. Otherwise, this script won't work. <br>

This script will generate an output file that contains the `Well`, `highest intensity peak` and `Concentration`. 

- Wells that are empty, i.e. only containing lower and upper marker bands, won't get their data analyzed. <br>
- Some wells, even though empty, may contain peaks other than the markers; those will be analyzed. <br>
- The wells with ladder won't be analyzed. (These wells must have `ladder` in their name instead of `SampA2`)<br>
- The script finds the highest peak and compares it against the expected size. If this fails, it finds the second-highest peak for comparison.  
- Needs the input PCR file, which is also the Echo worklist used for setting up the PCR. 
- All files must be in the same folder.
- All PCRs when run on Fragment analyzer, must be run on continuous wells. <br>
- No ladders between samples. <br>
- Add ladders before the first sample or after the last sample. <br>

- A starting well for the FA must be entered. 
- The script assumes that the order of PCRs in the Echo_PCR file is the same as the order of FA loading, beginning at the FA starting well provided above. 
- If the expected and observed band sizes are within 150 bp of each other, the result is correct. 
- If they differ by more than 250 bp, the result is incorrect.

In [8]:
import os
import pandas as pd
import numpy as np
import re
import sys
import glob
from natsort import natsorted



def find_max_peak(well_df):
    """Return the row(s) holding the highest concentration of one well.

    Args:
        well_df: DataFrame with a ``Concentration`` column whose values are
            convertible to float (numeric strings or numbers).

    Returns:
        A ``(peak_rows, max_val)`` tuple. ``peak_rows`` is the sub-frame of
        ``well_df`` whose concentration equals the maximum (original index
        labels are kept). ``max_val`` is that maximum rendered as a string;
        whole numbers are rendered without a decimal part (e.g. ``"40"``).

    Raises:
        ValueError: If the column is empty or holds a value that cannot be
            converted to float.
    """
    concentrations = well_df["Concentration"].to_numpy().astype(float)
    peak = float(np.amax(concentrations))

    # Select the row numerically.  Re-formatting the maximum as text and
    # comparing it with the raw column misses values such as "40.10" or
    # "100.0", whose text differs from str(40.1) / str(100).
    peak_rows = well_df.loc[concentrations == peak]

    max_val = str(int(peak)) if peak % 1 == 0 else str(peak)

    return peak_rows, max_val


def find_2nd_peak(df_temp, max_val_I, Exp_size):
    """Return the row(s) holding the second-highest concentration of a well.

    Args:
        df_temp: DataFrame with a ``Concentration`` column whose values are
            convertible to float. It must contain at least two rows.
        max_val_I: Highest concentration of the well (string or number), as
            returned by ``find_max_peak``.
        Exp_size: Expected PCR product size used in the decision rule.

    Returns:
        A ``(peak_rows, max_val, decision)`` tuple. ``peak_rows`` is the
        sub-frame whose concentration equals the second-highest value,
        ``max_val`` is that value as a string (whole numbers without a
        decimal part) and ``decision`` is ``True`` when the second peak
        should be reported.

    Raises:
        IndexError: If ``df_temp`` has fewer than two rows.
        ValueError: If a concentration cannot be converted to float.
    """
    concentrations = df_temp["Concentration"].to_numpy().astype(float)
    second = float(np.sort(concentrations)[-2])  # 2nd highest value

    second_num = int(second) if second % 1 == 0 else second

    # NOTE(review): both distances below are computed from *concentrations*
    # against the expected *size*; this mirrors the long-standing rule but
    # looks unintended -- confirm whether peak sizes were meant here.
    diff_I = abs(second_num - Exp_size)
    diff_II = abs(float(max_val_I) - Exp_size)

    # Reject the second peak when the rule above fires or when it carries
    # less than 12.5 of concentration.
    decision = not (diff_I < diff_II or second_num < 12.5)

    # Select the row numerically; a text comparison misses values such as
    # "25.50" whose text differs from str(25.5).
    peak_rows = df_temp.loc[concentrations == second]

    return peak_rows, str(second_num), decision

        
def slice_FA_file(FA_file, PCR_file):
    """Compare the Fragment Analyzer peaks of every well with the PCR plan.

    The FA export is scanned row by row. For each well, the rows lying
    between the lower-marker and the upper-marker row are taken as that
    well's peaks; the highest-concentration peak is compared with the
    expected PCR size. When that comparison is 'INCORRECT!!' and the peak
    concentration is below 90, the second-highest peak is evaluated as well
    and, if accepted, reported on its own row (tagged '2nd peak', followed
    by the '1st peak' row).

    Wells whose sample name is exactly 'ladder' are skipped, and wells with
    fewer than two peaks between the markers are not analysed.

    The result is written to 'Compare_PCR_FA_<tag>.csv' in the current
    directory.

    Args:
        FA_file: Path of the FA export (CSV with the columns 'Well',
            'Size (bp)' and 'Concentration').
        PCR_file: Path of the PCR worklist CSV.

    Raises:
        ValueError: If marker rows are met before any 'Peak ID' header, so
            the well they belong to is unknown.
    """
    FA_df, PCR_df = csv_to_dataframe(FA_file, PCR_file)

    column_names = ['PCR_well', 'FA_well', 'Primer', 'Expected_size',
                    'FA_size', 'FA_Conc', 'Result', 'Comments']
    # Output rows are gathered in a plain list and turned into a DataFrame
    # once at the end (DataFrame.append no longer exists in pandas >= 2.0).
    result_rows = []

    # Change the marker labels to the appropriate values depending upon the
    # kit used
    Ladder_LM = '35 (LM)'
    Ladder_UM = '5000 (UM)'

    start_ind = -1
    end_ind = -1
    wellname = None
    in_ladder = False

    for index, row in FA_df.iterrows():
        name = row['Well']
        marker_size = row['Size (bp)']

        if name == 'Peak ID':
            # The row just above each per-well 'Peak ID' header carries the
            # well id ('Well' column) and the sample name ('Size (bp)').
            wellname = FA_df['Well'].loc[index - 1]
            in_ladder = FA_df['Size (bp)'].loc[index - 1] == 'ladder'
            # A new well starts: forget any half-open marker range.
            start_ind = -1
            end_ind = -1
            continue

        if in_ladder:
            # Ladder wells are never analysed.
            continue

        if marker_size == Ladder_LM and start_ind < 0:
            start_ind = index + 1

        if marker_size == Ladder_UM and end_ind < 0:
            end_ind = index

        if not (start_ind > 1 and end_ind > 1):
            continue

        # Both markers were seen: always reset the indices so that a well
        # without enough peaks cannot block every well that follows it.
        first_row, last_row = start_ind, end_ind
        start_ind = -1
        end_ind = -1

        if first_row + 1 >= last_row:
            # Fewer than two peaks between the markers.
            continue

        if wellname is None:
            raise ValueError(
                "Marker rows found before any 'Peak ID' header in " + str(FA_file)
            )

        # Row labels equal row positions here (default index of read_csv).
        df_temp = FA_df.iloc[first_row:last_row]
        df_temp = df_temp.assign(Well=wellname).reset_index(drop=True)

        max_index, max_val = find_max_peak(df_temp)
        max_index = max_index.reset_index(drop=True)

        return_list = compare_with_expected_PCR_size(max_index, max_val, PCR_df)
        if not return_list:
            # Treat an empty answer like "well not present in the PCR file".
            return_list = [None] * 7

        Comments = ''
        matched = return_list[0] is not None

        if (matched and return_list[-1] == 'INCORRECT!!'
                and float(return_list[-2]) < 90):
            Exp_size = return_list[-4]
            second_rows, second_val, decision = find_2nd_peak(
                df_temp, max_val, Exp_size)

            if decision:
                second_rows = second_rows.reset_index(drop=True)
                second_result = compare_with_expected_PCR_size(
                    second_rows, second_val, PCR_df)
                # The 2nd-peak row is written before the 1st-peak row.
                result_rows.append(list(second_result) + ['2nd peak'])
                Comments = '1st peak'

        result_rows.append(list(return_list) + [Comments])

    # dtype=object keeps integers/floats/None exactly as collected, so the
    # CSV text does not change through dtype inference.
    Comparison_df = pd.DataFrame(result_rows, columns=column_names, dtype=object)

    # Assumes FA_file starts with the 18-character prefix
    # 'Fragment_Analyzer_' and ends with a 4-character extension.
    filename = 'Compare_PCR_FA_'+FA_file[18:-4]+'.csv'
    print('Writing to file:- ', filename)
    Comparison_df.to_csv(filename, index = False)
    
    
def compare_with_expected_PCR_size(max_index, max_val, PCR_df):
    """Compare an observed FA peak with the expected PCR product size.

    Args:
        max_index: DataFrame whose row labelled ``0`` describes the peak
            ('Well', 'Size (bp)' and 'Concentration' columns).
        max_val: Concentration of the peak; accepted for interface
            compatibility, not used by the comparison.
        PCR_df: PCR worklist with the columns 'Destination Well',
            'Primer Name' and 'PCR size'.

    Returns:
        ``[PCR_well, FA_well, Primer_Name, Expected_size, Obs_size, Conc,
        Result]`` for the last worklist row of that well that has a PCR
        size. ``Result`` is 'Correct' for a difference below 150 bp,
        '150-250bp difference' for 150 to 250 bp inclusive, and
        'INCORRECT!!' above 250 bp. A list of seven ``None`` is returned
        when the well is absent from the worklist or has no PCR size.
    """
    FA_well = max_index['Well'].loc[0]
    Obs_size = float(max_index['Size (bp)'].loc[0])
    Conc = max_index['Concentration'].loc[0]

    # Default answer: nothing to compare against.  Always seven entries so
    # callers can rely on the shape of the list.
    return_list = [None] * 7

    matching_rows = PCR_df.loc[PCR_df['Destination Well'] == FA_well]

    for _, pcr_row in matching_rows.iterrows():
        Expected_size = float(pcr_row['PCR size'])

        if np.isnan(Expected_size):
            # No expected size recorded for this worklist row.
            continue

        Expected_size = int(Expected_size)
        difference = abs(Obs_size - Expected_size)

        if difference < 150:
            Result = 'Correct'
        elif difference <= 250:
            # Inclusive bounds: exactly 150 bp or 250 bp belongs here rather
            # than falling through to 'INCORRECT!!'.
            Result = '150-250bp difference'
        else:
            Result = 'INCORRECT!!'

        return_list = [pcr_row['Destination Well'], FA_well,
                       pcr_row['Primer Name'], Expected_size, Obs_size,
                       Conc, Result]

    return return_list

    
def csv_to_dataframe (FA_file, PCR_file):
    """Load the FA export and the PCR worklist as DataFrames.

    Args:
        FA_file: Path of the Fragment Analyzer CSV export.
        PCR_file: Path of the PCR worklist CSV.

    Returns:
        ``(FA_df, PCR_df)``. ``FA_df`` only retains the columns 'Well',
        'Size (bp)' and 'Concentration' (in their original order);
        ``PCR_df`` is returned as read.
    """
    wanted = ('Well', 'Size (bp)', 'Concentration')

    raw_fa = pd.read_csv(FA_file)
    # Everything that is not one of the three expected columns is dropped.
    unwanted = [label for label in raw_fa.columns if label not in wanted]
    FA_df = raw_fa.drop(columns=unwanted)

    PCR_df = pd.read_csv(PCR_file)

    return FA_df, PCR_df


    
def iterate_FA_files():
    """Process every FA export / PCR worklist pair in the current folder.

    Both file lists are collected with glob patterns, naturally sorted and
    paired position by position; each pair is handed to ``slice_FA_file``.
    When the two lists differ in length, the surplus files are not
    processed.
    """
    fa_paths = natsorted(glob.glob('*Fragment_Analyzer_EColi_PCR*.csv'))
    pcr_paths = natsorted(glob.glob('*EColi_PCR_plate*.csv'))

    for fa_path, pcr_path in zip(fa_paths, pcr_paths):
        # Announce the pair before analysing it.
        print('\n FA File:-  ', fa_path, '\nPCR file:-    ', pcr_path)
        slice_FA_file(fa_path, pcr_path)
    

if __name__ == '__main__':

    # Show which folder is searched for the input files.
    current_folder = os.getcwd()
    print(current_folder)

    # Analyse every FA/PCR file pair found in that folder.
    iterate_FA_files()

    


/Users/nilmani/Desktop/Python/Ecoli_Picklists/FA_Data_Aug18

 FA File:-   Fragment_Analyzer_EColi_PCR_1.csv 
PCR file:-     EColi_PCR_plate_1.csv
Writing to file:-  Compare_PCR_FA_EColi_PCR_1.csv

 FA File:-   Fragment_Analyzer_EColi_PCR_2.csv 
PCR file:-     EColi_PCR_plate_2.csv
Writing to file:-  Compare_PCR_FA_EColi_PCR_2.csv
