In [1]:
## This module runs two quality-control checks on every annotated plasmid:
##   1. The fragment numbering is correct (the fragment numbers are consecutive).
##   2. The annotated fragments are continuous: each fragment starts right after the previous one ends.



from natsort import natsorted, natsort_keygen
import pandas as pd
import glob
import numpy as np
import re
from snapgene_reader import snapgene_file_to_dict
import os



def fragment_validation(snap_file, frag_df):
    """Check that the fragment numbers annotated on one plasmid are consecutive.

    Parameters
    ----------
    snap_file : str
        File name of the plasmid. The last four characters (the ".dna"
        extension of the files this script globs) are stripped to form the
        plasmid name.
    frag_df : pandas.DataFrame
        Must contain a 'Fragment_name' column whose values look like
        "Fragment <n>", i.e. the second whitespace-separated token is an
        integer.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'Plasmid_name', 'Numbering' (the sorted
        list of fragment numbers) and 'Result' ('Correct ' or 'ERROR!!!').

    Raises
    ------
    IndexError, ValueError
        If a fragment name has no second token or that token is not an integer.
    """
    # split() with no argument tolerates repeated or leading whitespace in
    # the annotation, which split(' ') did not.
    numbers = [int(name.split()[1]) for name in frag_df['Fragment_name']]

    # 0 means "consecutive"; anything else is reported as an error so that
    # Result is always bound.
    if Sum_consecutive_numbers(numbers) == 0:
        Result = 'Correct '  # trailing space kept: same width as 'ERROR!!!'
    else:
        Result = 'ERROR!!!'

    # Build the one-row frame directly; DataFrame.append no longer exists in
    # pandas >= 2.0.
    record = {
        'Plasmid_name': snap_file[:-4],
        'Numbering': sorted(numbers),
        'Result': Result,
    }
    return pd.DataFrame([record], columns=['Plasmid_name', 'Numbering', 'Result'])
    
    
def Sum_consecutive_numbers(list1):
    """Report whether the integers in ``list1`` form a gap-free, duplicate-free run.

    The numbers may be given in any order and the run may start at any value.

    Parameters
    ----------
    list1 : iterable of int
        Fragment numbers to check.

    Returns
    -------
    int
        0 when the sorted numbers are exactly min, min+1, ..., max;
        1 otherwise. An empty input has nothing out of order and returns 0.
    """
    numbers = sorted(list1)
    if not numbers:
        return 0

    # Compare element by element instead of comparing sums: a sum check is
    # fooled by compensating mistakes such as [1, 1, 4, 4] (sums to 10, the
    # same as 1 + 2 + 3 + 4).
    expected = list(range(numbers[0], numbers[0] + len(numbers)))
    return 0 if numbers == expected else 1

        
def fragment_seq_validation(frag_df, snap_file, plasmid_len):
    """Check that consecutive fragments of one plasmid are contiguous.

    Fragments are ordered by natural sort of 'Fragment_name'. Each fragment
    must start one position after the previous fragment ends. Two cases
    deviate from that rule:

    * The first fragment in sort order is accepted without any check when it
      starts at position 1; otherwise it is compared against the end of the
      last fragment in sort order.
    * A later fragment that starts at position 1 is also accepted when the
      previous fragment ends exactly at ``plasmid_len``.

    Parameters
    ----------
    frag_df : pandas.DataFrame
        Columns 'Fragment_name', 'Start_ind' and 'End_ind'; the index values
        must be convertible with ``int``.
    snap_file : str
        File name of the plasmid; its last four characters are stripped to
        form the plasmid name.
    plasmid_len : int
        Length of the plasmid sequence.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'Plasmid_name' and 'Result'
        ('Correct' or 'ERROR!!!').
    """
    ordered = frag_df.sort_values(by="Fragment_name", key=natsort_keygen())
    starts = ordered['Start_ind'].tolist()
    ends = ordered['End_ind'].tolist()

    Result = 'Correct'
    for position, raw_start in enumerate(starts):
        start_ind = int(raw_start)

        if position == 0:
            if start_ind == 1:
                # First fragment begins at the start of the sequence:
                # there is no predecessor to compare against.
                continue
            # Otherwise the predecessor of the first fragment is the last one.
            previous_end = int(ends[-1])
        else:
            previous_end = int(ends[position - 1])

        contiguous = previous_end + 1 == start_ind
        restarts_after_full_length = (
            position != 0 and start_ind == 1 and previous_end == plasmid_len
        )
        if not (contiguous or restarts_after_full_length):
            # Keep scanning; a single failure marks the whole plasmid.
            Result = "ERROR!!!"

    # Build the one-row frame directly; DataFrame.append no longer exists in
    # pandas >= 2.0.
    return pd.DataFrame(
        [[snap_file[:-4], Result]], columns=['Plasmid_name', 'Result']
    )
    
    
def Print_output(Input_df, test_type):
    """Print a one-line summary of a validation run.

    Parameters
    ----------
    Input_df : pandas.DataFrame
        Must contain a 'Result' column.
    test_type : str
        Label of the check, used as the prefix of the printed message.

    The error message is printed when any 'Result' value equals "ERROR!!!";
    otherwise the success message is printed.
    """
    outcomes = Input_df['Result'].tolist()

    if "ERROR!!!" in outcomes:
        print(f'\n{test_type} ERROR in some plasmid(s)!!')
        return

    print(f'\n{test_type} correct in all plasmids.')

    
def read_DNA_files():
    """Validate every ``*.dna`` file in the current working directory.

    For each file (processed in natural-sort order of file name):

    1. The file is parsed with ``snapgene_file_to_dict``.
    2. Every feature whose name contains 'fragment' or 'Fragment' is
       collected together with the start/end read from the first value of
       its first segment (a "start-end" string).
    3. ``fragment_seq_validation`` and ``fragment_validation`` are run on
       the collected fragments.

    A summary of both checks is printed, and all result rows (numbering
    results first, then sequence-position results) are written to
    'Plasmid_Fragment_Order_Analysis.csv' in the working directory.

    Returns
    -------
    None
    """
    # Collect per-file results in lists and concatenate once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0 and copied the whole
    # frame on every call.
    numbering_frames = []
    sequence_frames = []

    for snap_file in natsorted(glob.glob("*.dna")):
        print(' Filename:  ', snap_file)
        dictionary = snapgene_file_to_dict(snap_file)
        plasmid_len = len(dictionary['seq'])

        fragment_rows = []
        for feature in dictionary['features']:
            feature_name = feature['name']
            if 'fragment' in feature_name or 'Fragment' in feature_name:
                # Only the first segment is inspected; its first value is
                # split on '-' into start and end.
                span = next(iter(feature['segments'][0].values()))
                frag_start, frag_end = span.split('-')
                fragment_rows.append([feature_name, frag_start, frag_end])

        frag_df = pd.DataFrame(
            fragment_rows, columns=['Fragment_name', 'Start_ind', 'End_ind']
        )
        frag_df = frag_df.sort_values(by=['Fragment_name'])

        sequence_frames.append(
            fragment_seq_validation(frag_df, snap_file, plasmid_len)
        )
        numbering_frames.append(fragment_validation(snap_file, frag_df))

    # pd.concat rejects an empty list, so fall back to empty frames with the
    # expected columns when no *.dna file was found.
    if sequence_frames:
        Frag_Sequences = pd.concat(sequence_frames)
    else:
        Frag_Sequences = pd.DataFrame(columns=['Plasmid_name', 'Result'])

    if numbering_frames:
        Result_df = pd.concat(numbering_frames)
    else:
        Result_df = pd.DataFrame(columns=['Plasmid_name', 'Numbering', 'Result'])

    # NOTE(review): 'Sequeunce' is misspelled; left unchanged so the printed
    # output stays identical.
    Print_output(Frag_Sequences, 'Sequeunce position')
    Print_output(Result_df, 'Fragment numbering')

    # Sequence rows have no 'Numbering' value, so that column is empty for them.
    combined = pd.concat([Result_df, Frag_Sequences])
    combined.to_csv('Plasmid_Fragment_Order_Analysis.csv', index = False)

    
if __name__ == "__main__":
    # Show the working directory first: read_DNA_files() globs "*.dna"
    # relative to it.
    working_dir = os.getcwd()
    print(working_dir, '\n')

    read_DNA_files()

C:\Users\Nilmani\Desktop\Validations\Mammalian_validations 

 Filename:   1MZ.dna
 Filename:   2MZ.dna
 Filename:   3MZ.dna
 Filename:   4MZ.dna
 Filename:   5MZ.dna
 Filename:   6MZ.dna
 Filename:   7MZ.dna
 Filename:   8MZ.dna
 Filename:   9MZ.dna
 Filename:   10MZ.dna

Sequeunce position correct in all plasmids.

Fragment numbering correct in all plasmids.
