In [24]:
'''
This software prepares the input files for the Genomic database.
The expected input file is the tab-separated sequencing report from BGI,
whose header consists of #Chr, Loci, REF, a series of sample names, and Sequence_Detail.
This script partitions the original file into three CSV files
that follow the entity format of the Genomic database.

Developed by Sausan Nafisah | September 2021
'''
import os
import re

import pandas as pd

#_ordering sample    
def atoi(text):
    """Return *text* as an int when it consists only of digits, else unchanged."""
    if text.isdigit():
        return int(text)
    return text

def natural_keys(text):
    """Return a sort key that orders the digit runs in *text* numerically.

    The string is split into alternating non-digit and digit pieces; digit
    pieces are converted to ``int`` so that, for example, ``"S2"`` sorts
    before ``"S10"``. Intended for use as ``key=`` in ``sort``/``sorted``.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    pieces = re.split(r'(\d+)', text)
    return [int(piece) if piece.isdigit() else piece for piece in pieces]

#_main program
def genomic_db_data_preparation_software(file, crop_code=None):
    """Split one sequencing report into the three Genomic-database tables.

    Parameters:
        file: path of a tab-separated report containing the columns
            ``#Chr``, ``Loci``, ``REF`` and ``Sequence_Detail``; every other
            column is treated as a sample (genotype) column.
        crop_code: prefix used when building each rsID. When omitted, the
            module-level ``crop`` variable is used.

    Returns:
        A tuple of three DataFrames, each starting with an ``rsID`` column:
        SNP information (``rsID, Chr, Loci, Ref``), sequence data
        (``rsID, Sequencing_detail``) and sample genotypes (``rsID`` followed
        by the sample columns in natural order).

    Raises:
        KeyError: if a required column is missing from the report.
        NameError: if ``crop_code`` is omitted and no module-level ``crop``
            has been defined.
    """
    if crop_code is None:
        # Fall back to the module-level crop code.
        crop_code = crop

    required = ('#Chr', 'Loci', 'REF', 'Sequence_Detail')

    df = pd.read_csv(file, sep='\t')

    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(
            'missing required column(s): {}'.format(', '.join(missing)))

    chromosomes = df['#Chr'].tolist()
    loci = df['Loci'].tolist()
    references = df['REF'].tolist()
    sequence_details = df['Sequence_Detail'].tolist()

    # Build each rsID from the row's own chromosome. Taking only the first
    # row's value would crash on a header-only report and would mislabel
    # rows if a report ever covers more than one chromosome.
    rs_ids = ['{0}_{1}_{2}'.format(crop_code, chromosome, locus)
              for chromosome, locus in zip(chromosomes, loci)]

    # Everything that is not a required column is a sample column; order
    # them naturally so that e.g. "S2" precedes "S10".
    sample_names = [name for name in df.columns if name not in required]
    sample_names.sort(key=natural_keys)

    sample_genotype = {'rsID': rs_ids}
    sample_genotype.update(
        {name: df[name].tolist() for name in sample_names})

    input_file1 = pd.DataFrame(
        {'rsID': rs_ids, 'Chr': chromosomes, 'Loci': loci, 'Ref': references})
    input_file2 = pd.DataFrame(
        {'rsID': rs_ids, 'Sequencing_detail': sequence_details})
    input_file3 = pd.DataFrame(sample_genotype)

    return input_file1, input_file2, input_file3

#_Working with single input file
def single_file():
    """Prompt for one report path, convert it, and write the three CSV files.

    Output names are built from the module-level ``crop`` code and the fixed
    ``snp-001`` identifier; paths are relative, so the files land in the
    current working directory.
    """
    path = input('Enter file path: ')
    print("Working with {} file".format(path))

    snp_info, seq_data, genotypes = genomic_db_data_preparation_software(path)

    # (frame, output file name template, confirmation message template)
    outputs = (
        (snp_info,
         '{0}_snp-001_SNP_Information.csv',
         "Now you have {0}_snp-001_SNP_Information.csv file"),
        (seq_data,
         '{0}_snp-001_Sequence_data.csv',
         "Now you have {0}_snp-001_Sequence_data.csv file"),
        (genotypes,
         '{0}_snp-001_Sample_genotype.csv',
         "Now you have {0}_snp-001_Sample_genotype.csv file"),
    )
    for frame, name_template, message_template in outputs:
        frame.to_csv(name_template.format(crop), index=False)
        print(message_template.format(crop))
    
#_Working with multiple input files
def multi_files(base_dir='/mnt/d/Genotype Result 2/Solanum_lycopersicum'):
    """Convert every report named in a list file and write its three CSVs.

    The user is prompted for the path of a text file holding one report
    file name per line. Each name is resolved against ``base_dir``,
    converted, and written to the current working directory as
    ``<crop>_<id>_SNP_Information.csv``, ``<crop>_<id>_Sequence_data.csv``
    and ``<crop>_<id>_Sample_genotype.csv``, where ``<crop>`` is the
    module-level crop code and ``<id>`` is the part of the file name before
    its first dot.

    Parameters:
        base_dir: directory that contains the report files named in the
            list. Defaults to the location that was previously hard-coded.
    """
    file_list = input('Enter the path to the file containing the file list: ')

    # Read the names inside a context manager so the list file is closed
    # even if a later conversion fails. strip() also removes a trailing
    # '\r' from lists saved with Windows line endings.
    with open(file_list) as handle:
        names = [line.strip() for line in handle]

    for name in names:
        if not name:
            # A blank line would otherwise resolve to the directory itself.
            continue

        print("Working with {} file".format(name))
        file = os.path.join(base_dir, name)
        input_file1, input_file2, input_file3 = genomic_db_data_preparation_software(file)

        #_Creating input files
        file_id = name.split('.')[0]
        input_file1.to_csv('{0}_{1}_SNP_Information.csv'.format(crop, file_id), index=False)
        print("Now you have {0}_{1}_SNP_Information.csv file".format(crop, file_id))

        input_file2.to_csv('{0}_{1}_Sequence_data.csv'.format(crop, file_id), index=False)
        print('Now you have {0}_{1}_Sequence_data.csv file'.format(crop, file_id))

        input_file3.to_csv('{0}_{1}_Sample_genotype.csv'.format(crop, file_id), index=False)
        print('Now you have {0}_{1}_Sample_genotype.csv file'.format(crop, file_id))
        print('\n')
        
if __name__ == "__main__":
    # Crop codes accepted by the tool. The prompt is generated from this
    # list so the two cannot drift apart.
    crop_list = ['Ca','To','Ke','Te','Me','Ja','Bm','Ka','Se','Kp','Cs']
    crop_prompt = 'What crop are you going to input?\n'
    crop_prompt += 'Choose one of the following:\n'
    crop_prompt += '/'.join(crop_list) + '\n'

    # `crop` is a module-level variable; the processing functions read it
    # when building rsIDs and output file names. Keep asking until the
    # answer is one of the known codes.
    crop = input(crop_prompt).strip()
    while crop not in crop_list:
        crop = input(crop_prompt).strip()

    #choose working type (surrounding whitespace and letter case are ignored)
    working_type = input('Are you working with single input file or multiple input files? Choose [S/M] \n ')
    working_type = working_type.strip().upper()

    if working_type == 'S':
        single_file()
    elif working_type == 'M':
        multi_files()
    else:
        print('you typed a wrong answer.')

What crop are you going to input?
         Choose one of the following:
         Ca/To/Ke/Te/Me/Ja/Bm/Ka/Se/Kp/Cs
/mnt/d/Genotype Result 2/Capsicum_annuum/Pltd.snp.xls
What crop are you going to input?
             Choose one of the following:
             Ca/To/Ke/Te/Me/Ja/Bm/Ka/Se/Kp/Cs
Ca
Are you working with single input file or multiple input files? Choose [S/M] 
 S
Enter file path: /mnt/d/Genotype Result 2/Capsicum_annuum/Pltd.snp.xls
Working with /mnt/d/Genotype Result 2/Capsicum_annuum/Pltd.snp.xls file
Now you have Ca_snp-001_SNP_Information.csv file
Now you have Ca_snp-001_Sequence_data.csv file
Now you have Ca_snp-001_Sample_genotype.csv file
