## Generate GI number for each REF_SEQ
DAVID (DAVID Functional Annotation Bioinformatics Microarray Analysis, https://david.ncifcrf.gov/conversion.jsp ) was used to convert REF_SEQ accession numbers to Protein_GI_Accession numbers. 

After this, the protein sequences for each GI number will be retrieved from NCBI (see **pull_protein_seq_AK.ipynb**).

In [1]:
# import relevant libraries
import numpy as np
import pandas as pd

In [2]:
# Read the compiled table; the preview below shows the columns GENENAME, ORG,
# ENTREZID, REFSEQ and LOG2FC.
data = pd.read_csv('compiled_final_REFSEQ.csv')
# read_csv already returns a DataFrame, so this re-wrap only gives it the name `df`
# that the following cells use.
# NOTE(review): whether `df` and `data` share underlying data depends on the pandas
# version -- do not rely on `data` staying unmodified.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC
0,ABC_transporter,PA,879411,NP_248876.1,-0.038469
1,ABC_transporter,PA,883108,NP_248894.1,-0.207718
2,ABC_transporter,PA,878380,NP_249014.1,-0.187309
3,ABC_transporter,PA,880771,NP_249293.1,0.085173
4,ABC_transporter,PA,879023,NP_249295.1,0.038834


In [3]:
# Remove the trailing version suffix (".1", ".2", ...) from each REFSEQ accession,
# e.g. "NP_248876.1" -> "NP_248876".
# The pattern is a raw string anchored to the end of the value, so it strips exactly
# one final ".<digits>" group: any version number is handled (not only .1/.2/.3) and
# a suffix such as ".10" is removed whole instead of leaving a stray digit behind.
# Re-running the cell is harmless because an already-stripped accession no longer matches.
df['REFSEQ'] = df['REFSEQ'].astype(str).str.replace(r'\.\d+$', '', regex=True)
df.head()

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC
0,ABC_transporter,PA,879411,NP_248876,-0.038469
1,ABC_transporter,PA,883108,NP_248894,-0.207718
2,ABC_transporter,PA,878380,NP_249014,-0.187309
3,ABC_transporter,PA,880771,NP_249293,0.085173
4,ABC_transporter,PA,879023,NP_249295,0.038834


In [4]:
# Pull the cleaned REFSEQ accessions out into their own single-column frame
# (copied, so it is independent of df)
DAVID = df.loc[:, ['REFSEQ']].copy()

DAVID.head()

Unnamed: 0,REFSEQ
0,NP_248876
1,NP_248894
2,NP_249014
3,NP_249293
4,NP_249295


In [6]:
# Write the REFSEQ accessions to DAVID.csv (without the index column) for input into DAVID
DAVID.to_csv('DAVID.csv', index = False)

# DAVID returns the PROTEIN_GI conversion as a .txt file, one per organism.
# Each REFSEQ comes back with two values, so every other value has to be pulled
# from the .txt file.

In [27]:
# Load the per-organism DAVID conversion tables (latest PROTEIN_GI per REFSEQ).
# NOTE(review): the .txt -> .csv conversion is not performed in this cell; the .csv
# files are presumably prepared beforehand -- confirm where that step lives.
PA_DAVID, EC_DAVID, BS_DAVID = (
    pd.read_csv(csv_name)
    for csv_name in ("pa_david.csv", "ec_david.csv", "bs_david.csv")
)

PA_DAVID.head()

Unnamed: 0.1,Unnamed: 0,From,To,Species,Gene Name
0,0,NP_253120,15599626,Pseudomonas aeruginosa PAO1,cytochrome b(PA4430)
1,1,NP_253761,15600267,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein(PA5074)
2,2,NP_254178,15600684,Pseudomonas aeruginosa PAO1,cytochrome(PA5491)
3,3,NP_249786,15596292,Pseudomonas aeruginosa PAO1,B-type flagellar protein FliS(PA1095)
4,4,NP_253918,15600424,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein/permease(P...


In [8]:
# Stack the three per-organism tables into one lookup table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each organism's
# rows keep their own 0-based labels, so a label such as 0 would refer to several
# rows and label-based lookups like DAVID_df['From'][j] would not return one value.
frames = [PA_DAVID, EC_DAVID, BS_DAVID]
DAVID_df = pd.concat(frames, ignore_index=True)
DAVID_df.head()

Unnamed: 0.1,Unnamed: 0,From,To,Species,Gene Name
0,0,NP_253120,15599626,Pseudomonas aeruginosa PAO1,cytochrome b(PA4430)
1,1,NP_253761,15600267,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein(PA5074)
2,2,NP_254178,15600684,Pseudomonas aeruginosa PAO1,cytochrome(PA5491)
3,3,NP_249786,15596292,Pseudomonas aeruginosa PAO1,B-type flagellar protein FliS(PA1095)
4,4,NP_253918,15600424,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein/permease(P...


### <span style="color:purple">Julia's solution</span>
<span style="color:purple">Add the 'real' ENTREZ IDs from DAVID_df to df, matched on REF_SEQ.</span>

In [31]:
# Read the three per-organism DAVID result tables and keep the usual names for them
frames = [pd.read_csv(csv_name) for csv_name in ("pa_david.csv", "ec_david.csv", "bs_david.csv")]
PA_DAVID, EC_DAVID, BS_DAVID = frames

# Stack them under one continuous 0..n-1 index, then discard the leftover
# 'Unnamed: 0' column that came in from the .csv files
stacked = pd.concat(frames, ignore_index=True)
DAVID_df = stacked.drop(columns='Unnamed: 0')
DAVID_df.head()

Unnamed: 0,From,To,Species,Gene Name
0,NP_253120,15599626,Pseudomonas aeruginosa PAO1,cytochrome b(PA4430)
1,NP_253761,15600267,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein(PA5074)
2,NP_254178,15600684,Pseudomonas aeruginosa PAO1,cytochrome(PA5491)
3,NP_249786,15596292,Pseudomonas aeruginosa PAO1,B-type flagellar protein FliS(PA1095)
4,NP_253918,15600424,Pseudomonas aeruginosa PAO1,ABC transporter ATP-binding protein/permease(P...


In [36]:
# NOTE(review): the saved output of this cell already shows True_EID, which is only
# assigned in a cell further down, so it was executed out of order; on a
# top-to-bottom run this displays df without that column.
df

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID
0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384
1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400
2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520
3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799
4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801
...,...,...,...,...,...,...
431,RNA_polymerase,BS,939937,NP_388354,-0.055848,728883360
432,RNA_polymerase,BS,936362,NP_391300,-0.140629,728886343
433,RNA_polymerase,BS,938729,NP_390226,-0.528350,728885268
434,RNA_polymerase,BS,939953,NP_389416,0.303125,728884442


In [34]:
# Attach the DAVID `To` value to every row of df by looking it up on the REFSEQ
# accession (DAVID's `From` column).
#
# The lookup is keyed, not positional: a REFSEQ with no DAVID entry gets a missing
# value in its own row instead of shifting every later value onto the wrong gene,
# and the work is a single pass rather than a loop over every df/DAVID_df pair.

# One `To` value per accession. Series.map needs unique keys, so if DAVID returned
# the same `From` more than once only the first occurrence is kept.
# NOTE(review): confirm that "first" is the entry wanted when duplicates exist.
gi_by_refseq = (
    DAVID_df
    .drop_duplicates(subset='From', keep='first')
    .set_index('From')['To']
)

df['True_EID'] = df['REFSEQ'].map(gi_by_refseq)

# Surface unmatched accessions instead of letting them pass silently.
n_unmatched = int(df['True_EID'].isna().sum())
if n_unmatched:
    print(f"{n_unmatched} REFSEQ accession(s) have no match in DAVID_df; True_EID is empty for them")

df.head()

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID
0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384
1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400
2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520
3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799
4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801


In [35]:
# Display the whole table (pandas truncates the middle rows) to check True_EID
# beyond the head() preview.
df

Unnamed: 0,GENENAME,ORG,ENTREZID,REFSEQ,LOG2FC,True_EID
0,ABC_transporter,PA,879411,NP_248876,-0.038469,15595384
1,ABC_transporter,PA,883108,NP_248894,-0.207718,15595400
2,ABC_transporter,PA,878380,NP_249014,-0.187309,15595520
3,ABC_transporter,PA,880771,NP_249293,0.085173,15595799
4,ABC_transporter,PA,879023,NP_249295,0.038834,15595801
...,...,...,...,...,...,...
431,RNA_polymerase,BS,939937,NP_388354,-0.055848,728883360
432,RNA_polymerase,BS,936362,NP_391300,-0.140629,728886343
433,RNA_polymerase,BS,938729,NP_390226,-0.528350,728885268
434,RNA_polymerase,BS,939953,NP_389416,0.303125,728884442


In [37]:
# Save the annotated table for the next step of the workflow.
# NOTE(review): to_csv writes the index by default, so the file gets an extra unnamed
# first column that shows up as 'Unnamed: 0' when read back; pass index=False if the
# downstream reader does not need it.
df.to_csv('GI_Numbers_combo.csv')