In [None]:
# Biopython Exercise 2: FASTA fun with protein!!! Yay...

## 1) Download fasta files for NCBI protein accessions AGI40145.1, AGJ87295.1, WVV45440.1, and WVS05366.1

## 2) Write a Python script called fasta_parse.py that is executable on the command line

## 3) fasta_parse.py should output a CSV table called protein_info.csv that has the FASTA IDs as row names and, as columns, the first 10 amino acids in the protein, the length of the protein, and the number of cysteines in the protein. You should use Biopython's parsing of Seq and SeqRecord objects and Seq methods to accomplish this.

### This assumes all files are in your current directory and are named after the accession with the dot replaced by an underscore (e.g. AGI40145_1.fasta)

In [None]:
from Bio import SeqIO 
import csv

def parse_fasta_files(fasta_files):
    """Summarise every protein record found in the given FASTA files.

    Each file is parsed with Biopython's ``SeqIO.parse``, so a file may hold
    a single record or several (e.g. a multi-FASTA download). One summary row
    is produced per record, in the order the records are encountered.

    Args:
        fasta_files: Iterable of paths to FASTA files.

    Returns:
        A list of dicts, one per record, with the keys ``"ID"`` (the record
        id), ``"First10AminoAcids"`` (the first ten residues as a string, or
        the whole sequence if it is shorter), ``"Length"`` (the sequence
        length) and ``"NumCysteines"`` (how many ``"C"`` characters the
        sequence contains).

    Raises:
        ValueError: If a file contains no FASTA records.
    """
    protein_info = []
    for fasta_file in fasta_files:
        rows_before = len(protein_info)
        # The context manager closes the handle even if parsing fails.
        with open(fasta_file, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                protein_info.append(
                    {
                        "ID": record.id,
                        "First10AminoAcids": str(record.seq[:10]),
                        "Length": len(record.seq),
                        "NumCysteines": record.seq.count("C"),
                    }
                )
        # SeqIO.parse yields nothing for a file without records; fail loudly
        # instead of silently dropping that file from the table.
        if len(protein_info) == rows_before:
            raise ValueError(
                "No FASTA records found in {!r}".format(fasta_file)
            )
    return protein_info

def write_csv(protein_info, output_filename):
    """Save the protein summary rows as a CSV table with a header line.

    Args:
        protein_info: Iterable of dicts keyed by ``"ID"``,
            ``"First10AminoAcids"``, ``"Length"`` and ``"NumCysteines"``.
        output_filename: Path of the CSV file to create (or overwrite).
    """
    columns = ["ID", "First10AminoAcids", "Length", "NumCysteines"]
    # newline="" leaves line-ending handling to the csv module.
    with open(output_filename, "w", newline="") as out_handle:
        table = csv.DictWriter(out_handle, fieldnames=columns)
        table.writeheader()
        for row in protein_info:
            table.writerow(row)

if __name__ == "__main__":
    # Runs only when this code is executed directly (script or notebook
    # cell), not when it is imported as a module.
    # The input files are looked up relative to the current working directory.
    fasta_files = [
        "AGI40145_1.fasta",
        "AGJ87295_1.fasta",
        "WVV45440_1.fasta",
        "WVS05366_1.fasta",
    ]
    # Collect one summary row per protein, then save the table as CSV.
    protein_info = parse_fasta_files(fasta_files)
    write_csv(protein_info, output_filename="protein_info.csv")