# Use BLASTn to get nucleotide-to-nucleotide alignments and store as .xml

### Import packages

In [1]:
import math
import os
import requests

from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import numpy as np
import pandas as pd

### Load ENTREZIDs and log2FCs for each of 3 organisms

In [2]:
#Load the three datasets from the .txt files that sit next to this notebook
#The first line of each file is skipped (skiprows=1)
#Column 0 is ENTREZID, column 1 is log2FC (Earth v Space), column 2 is adj. p value (Earth v Space)
abc001 = np.loadtxt(fname='abc001.txt', skiprows=1)
abc002 = np.loadtxt(fname='abc002.txt', skiprows=1)
abc004 = np.loadtxt(fname='abc004.txt', skiprows=1)

### Function to BLAST an EID

In [3]:
def blast_eid_in_nt(ENTREZID, save_file_xml):
    """
    BLAST one identifier against the NCBI 'nt' database and save the result.

    Sends ENTREZID as the query of an online blastn search (NCBIWWW.qblast)
    and writes the text returned by the server to save_file_xml.

    ENTREZID: identifier passed to qblast as the query; the callers in this
        notebook pass an int
    save_file_xml: path of the .xml file to create or overwrite, string

    Returns None. Exceptions raised by qblast, by reading the response, or by
    writing the file propagate to the caller.
    """
    result_handle = NCBIWWW.qblast('blastn', 'nt', ENTREZID)
    try:
        # Read the whole response before touching the output file, so that a
        # failed read does not leave an empty .xml file behind.
        blast_xml = result_handle.read()
    finally:
        # Always release the handle, even when reading fails.
        result_handle.close()
    with open(save_file_xml, 'w') as save_to:
        save_to.write(blast_xml)

### Make file names to save results into (.xml)

In [4]:
#Output file names, one per BLAST query, numbered from 1: nabc<dataset>_<k>.xml
#Generated instead of typed out by hand; the counts (39, 20, 31) match the original lists
xml_names001 = ['nabc001_{}.xml'.format(k) for k in range(1, 40)]  # nabc001_1 .. nabc001_39
xml_names002 = ['nabc002_{}.xml'.format(k) for k in range(1, 21)]  # nabc002_1 .. nabc002_20
xml_names004 = ['nabc004_{}.xml'.format(k) for k in range(1, 32)]  # nabc004_1 .. nabc004_31

### BLAST the ENTREZIDs using blastn against the 'nt' database

In [5]:
#Iterate through the EID's of abc001, abc002 and abc004 and BLAST each one
#against 'nt', saving the hits of row i into the i-th file name of that dataset
for label, eid_table, xml_names in (('001', abc001, xml_names001),
                                    ('002', abc002, xml_names002),
                                    ('004', abc004, xml_names004)):
    for i, save_file_xml in enumerate(xml_names):
        ENTREZID = int(eid_table[i,0]) #make eid an integer
        try:
            blast_eid_in_nt(ENTREZID, save_file_xml)
        except Exception as err:
            #Best effort: report the failure and move on to the next EID.
            #Catching Exception (not a bare except) keeps Ctrl-C working, and
            #printing the error shows WHY a query failed instead of hiding it.
            print('failed with ' + label + ' ENTREZID:', i, '-', repr(err))
        else:
            print('saved ' + label + ' ENTREZID', i)

failed with 001 ENTREZID: 0
failed with 001 ENTREZID: 1
failed with 001 ENTREZID: 2
failed with 001 ENTREZID: 3
failed with 001 ENTREZID: 4
failed with 001 ENTREZID: 5
failed with 001 ENTREZID: 6
failed with 001 ENTREZID: 7
failed with 001 ENTREZID: 8
failed with 001 ENTREZID: 9
failed with 001 ENTREZID: 10
failed with 001 ENTREZID: 11
failed with 001 ENTREZID: 12
failed with 001 ENTREZID: 13
failed with 001 ENTREZID: 14
failed with 001 ENTREZID: 15
failed with 001 ENTREZID: 16
failed with 001 ENTREZID: 17
failed with 001 ENTREZID: 18
failed with 001 ENTREZID: 19
failed with 001 ENTREZID: 20
failed with 001 ENTREZID: 21
failed with 001 ENTREZID: 22
failed with 001 ENTREZID: 23
failed with 001 ENTREZID: 24
failed with 001 ENTREZID: 25
failed with 001 ENTREZID: 26
failed with 001 ENTREZID: 27
failed with 001 ENTREZID: 28
failed with 001 ENTREZID: 29
failed with 001 ENTREZID: 30
failed with 001 ENTREZID: 31
failed with 001 ENTREZID: 32
failed with 001 ENTREZID: 33
failed with 001 ENTREZID