# Opening the .xml's and Saving to a .csv

In [5]:
import math
import os
import requests

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import numpy as np
import pandas as pd

In [18]:
# File names in list that I've downloaded into .xml's
abc_xml = ['abc001_10.xml', 'abc001_11.xml','abc001_12.xml','abc001_13.xml','abc001_14.xml', 'abc002_1.xml']
dna_xml = ['dna001_1.xml', 'dna002_4.xml', 'dna004_2.xml']

# Files names to write the .csv's
abc_csv = ['abc001_10.csv', 'abc001_11.csv','abc001_12.csv','abc001_13.csv','abc001_14.csv', 'abc002_1.csv']
dna_csv = ['dna001_1.csv', 'dna002_4.csv', 'dna004_2.csv']

In [18]:
def save_to_csv(xml_file, csv_filename):
    """Save a BLAST .xml hits file to a .csv.  Returns the hits dataframe used to make the .csv"""
    # Open and read the .xml from the BLAST
    result_handle = open(xml_file)
    blast_record = NCBIXML.read(result_handle)

    # Initiate arrays
    hit_names = []
    hit_values = []
    hit_norm_scores = []
    #db_blast_name = []

    # Add alginment data to hits_array
    for alignment in blast_record.alignments:
        #name = blast_record.description.title
        #db_blast_name.append(name)
        for hsp in alignment.hsps:
            hit_names.append(alignment.title)
                
            alignment_values = ['string0', 'string1'] #initiate new list for values to convert to floats
            alignment_values[0] = hsp.expect
            alignment_values[1] = hsp.score
            #make expect and score floats from strings
            alignment_values[0] = float(alignment_values[0])
            alignment_values[1] = float(alignment_values[1])
            #append the alignment data to the database hits array
            hit_values.append(alignment_values)

    for m in range(len(hit_values)):
        norm_score = hit_values[m][1] / hit_values[0][1] * 100
        hit_norm_scores.append(norm_score)
    
    # Convert Python arrays/lists to Numpy arrays/lists
    #names_arr = np.array(hit_names)
    #values_arr = np.array(hit_values)
    #norm_arr = np.array(hit_norm_scores)
         
    # Convert Numpy arrays to Pandas DataFrame
    column_names = ['e_value', 'score']
    hits_df = pd.DataFrame(hit_values, columns=column_names)
    hits_df['hit_name'] = pd.DataFrame(hit_names)
    hits_df['norm_score'] = pd.DataFrame(hit_norm_scores)
    #hits_df['db_blast_name'] = pd.DataFrame(db_blast_name)

    # Convert Pandas DataFrame to .csv
    hits_df.to_csv(csv_filename)
    
    return hits_df

In [20]:
# Test
xml_file = 'abc002_1.xml'
csv_filename = 'abc002_1.csv'
save_to_csv(xml_file, csv_filename)

ValueError: Your XML file did not start with '<?xml'... but instead ',e_value,score,hit_n'

# Run the function for all .xml's

In [62]:
for i in range(len(abc_xml)):
    xml_file = abc_xml[i]
    csv_filename = abc_csv[i]
    save_to_csv(xml_file, csv_filename)
    
for j in range(len(dna_xml)):
    xml_file = dna_xml[j]
    csv_filename = dna_csv[j]
    save_to_csv(xml_file, csv_filename)

ValueError: Wrong number of items passed 0, placement implies 1

# Don't look past here

## Open the .xml's in the same directory as this notebook

In [3]:
# Open and read the .xml from the BLAST
result_handle = open('dna002_4.xml')
blast_record = NCBIXML.read(result_handle)

In [28]:
# Use Numpy array to store data
# Initiate array
pdb_hits_array = []

# Leave out the normalized score!!  Hard to iterate through when not iterating integers

# Add alginment data to hits_array
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        alignment_data = ['string0', 'string1', 'string2']
        alignment_data[0] = alignment.title
        alignment_data[1] = hsp.expect
        alignment_data[2] = hsp.score
        #make expect and score floats from strings
        alignment_data[1] = float(alignment_data[1])
        alignment_data[2] = float(alignment_data[2])
        #append the alignment data to the database hits array
        pdb_hits_array.append(alignment_data)

# Convert Python array to Pandas DataFrame
column_names = ['hit_title', 'e_value', 'score']
pdb_hits_df = pd.DataFrame(pdb_hits_array, columns=column_names)

# Calculate the normalized score
norm_score = [100]
for i in range(1,len(pdb_hits_array)):
    norm_score_i = pdb_hits_df['score'][i] / pdb_hits_df['score'][0] *100
    norm_score.append(norm_score_i)

# Add Pandas Series from this list of normalized scores to the hits DataFrame
pdb_hits_df['norm_score'] = pd.Series(norm_score)

# Convert Pandas DataFrame to .csv
pdb_hits_df.to_csv('dna002_4.csv')

In [21]:
pdb_hits_array

[]

## Printing hits_results using tutorial code

In [27]:
result_handle = open('abc001_10.xml')
blast_record = NCBIXML.read(result_handle)

# Getting info out of code
E_VALUE_THRESH = 0.0000000000000000000000000000000000000000000000001
count = 0
for alignment in blast_record.alignments:
    count += 1
    for hsp in alignment.hsps:
        #if hsp.expect < E_VALUE_THRESH:
        print('****Alignment****')
        print('sequence:', alignment.title)
        print('e value:', hsp.expect)
        print('score:', hsp.score)
            #print('identities:', hsp.identities)
print('There are', count, 'sequences in the BLAST output')

****Alignment****
sequence: pdb|4LVR|A Crystal structure of IFT81 N-terminal domain [Chlamydomonas reinhardtii]
e value: 1.93761
score: 59.0
****Alignment****
sequence: pdb|4LVP|A Crystal structure of IFT81 N-terminal domain [Chlamydomonas reinhardtii]
e value: 2.02984
score: 59.0
****Alignment****
sequence: pdb|6BVD|A Structure of Botulinum Neurotoxin Serotype HA Light Chain [Clostridium botulinum] >pdb|6BVD|B Structure of Botulinum Neurotoxin Serotype HA Light Chain [Clostridium botulinum]
e value: 2.08369
score: 60.0
****Alignment****
sequence: pdb|6CWX|B Crystal structure of human ribonuclease P/MRP proteins Rpp20/Rpp25 [Homo sapiens]
e value: 6.30001
score: 56.0
****Alignment****
sequence: pdb|6W2D|y Structures of Capsid and Capsid-Associated Tegument Complex inside the Epstein-Barr Virus [Human herpesvirus 4 strain B95-8] >pdb|6W2D|z Structures of Capsid and Capsid-Associated Tegument Complex inside the Epstein-Barr Virus [Human herpesvirus 4 strain B95-8] >pdb|6W2E|y Structures 