In [49]:
from Bio import SeqIO
import csv

In [50]:
# Convert an ASV/OTU table into an Arlequin project file (.arp).
# Input:  1. An ASV/OTU table in CSV format covering all your samples: one row per sample (named in a "Site" column), one column per ASV/OTU.
#         2. A FASTA file that maps each ASV/OTU name to its sequence.
# Output: 1. An Arlequin project file (.arp) to use as input for the software.

In [51]:
# Paths to the ASV/OTU table and to the FASTA file that maps ASV/OTU names to sequences.
# Both are relative paths, so they resolve against the notebook's working directory.

# CSV of read counts; the parsing cell reads its "Site" column as the sample name.
PATH_ASV_table = "Cleaned_ASV_table.csv"
# FASTA whose record names are matched against the column names of the table.
PATH_ASV_fasta = "COX3_ASV_hard.fasta"

In [52]:
# Change the variables here to change the info in the header of the Arlequin file

title = "COX3"
GenotypicData = 0
LocusSeparation = "NONE"
DataType = "DNA"
MissingData = "?"

indent = "\t"

# NbSamples is the number of distinct sites in the table. It is counted
# straight from the CSV so that this cell does not rely on ASV_copy_dict,
# which is only built in a later cell (reading it here raises NameError on a
# fresh kernel with Restart & Run All). The result is the same number, because
# ASV_copy_dict has exactly one key per distinct "Site" value.
with open(PATH_ASV_table, 'r', newline='') as handle:
    sample_num = len({row["Site"] for row in csv.DictReader(handle)})

In [53]:
# Extract and clean the information needed from the FASTA file and the ASV table

# ASV/OTU name -> sequence, as read from the FASTA file.
ASV_dict = {}
with open(PATH_ASV_fasta, 'r') as handle:
    records = SeqIO.parse(handle, 'fasta')
    for record in records:
        ASV_dict[record.name] = record.seq

# Site name -> {ASV/OTU name: read count}. Only non-zero counts of ASVs that
# have a sequence in the FASTA file are kept. If a site name occurs on several
# rows, the last row wins.
ASV_copy_dict = {}
# newline='' is the mode the csv module documents for files passed to a reader.
with open(PATH_ASV_table, 'r', newline='') as handle:
    rows = csv.DictReader(handle)
    for row in rows:
        location = row["Site"]
        site_ASV_copy_dict = {}
        for column, value in row.items():
            # Filter on the column name BEFORE converting the value: columns
            # that are not ASVs (metadata, notes, ...) may hold non-numeric
            # text, and int() on them would abort the whole run even though
            # they are discarded anyway.
            if column == "Site" or column not in ASV_dict:
                continue
            count = int(value)
            if count != 0:
                site_ASV_copy_dict[column] = count
        ASV_copy_dict[location] = site_ASV_copy_dict

In [54]:
# Build the [Profile] section of the Arlequin project file, one list item per line

header_lines = [
    "[Profile]",
    f"{indent}Title=\"{title}\"",
    f"{indent}NbSamples={sample_num}",
    f"{indent}GenotypicData={GenotypicData}",
    f"{indent}DataType={DataType}",
    f"{indent}LocusSeparator={LocusSeparation}",
    # Arlequin's notation puts the missing-data character between quotes
    # (MissingData='?'); it was previously written bare.
    f"{indent}MissingData='{MissingData}'",
]

In [55]:
# Build the [Data] section: one sample block per site

sample_lines = [
    "[Data]",
    # Arlequin names this subsection [[Samples]] (plural).
    f"{indent}[[Samples]]",
]

# Total number of reads per site, in the iteration order of ASV_copy_dict.
reads_list = []

for site, counts in ASV_copy_dict.items():
    # The sample size is the sum of the read counts listed for the site.
    # Computing it inside the loop avoids a manual index into a parallel list.
    total_reads = sum(counts.values())
    reads_list.append(total_reads)

    sample_lines.append(f"{indent*2}SampleName=\"{site}\"")
    # SampleSize is an integer field, so it is written without quotes.
    sample_lines.append(f"{indent*2}SampleSize={total_reads}")
    sample_lines.append(f"{indent*2}SampleData={{")
    for ASV, read in counts.items():
        # One haplotype per line: identifier, frequency, sequence.
        sample_lines.append(f"{indent*3}{ASV}\t{read}\t{ASV_dict[ASV]}")
    sample_lines.append(f"{indent*2}}}")

In [56]:
# Write the header section followed by the sample section to the .arp file,
# terminating every entry with a newline

with open("COX3_arlequin.arp", 'w') as out_file:
    out_file.writelines(f"{text}\n" for text in header_lines)
    out_file.writelines(f"{text}\n" for text in sample_lines)