In [15]:
import os
import csv
import numpy as np
import scipy.spatial as sp

def fileread(gfile):
    """Parse the ATOM records of a gpdb file into four parallel lists.

    Only lines whose first six characters strip to "ATOM" are used. Every
    field is cut out by fixed character positions and kept as a stripped
    string (no numeric conversion happens here).

    Parameters
    ----------
    gfile : path of the gpdb file to read.

    Returns
    -------
    cg_nums : list of str      -- characters [12:16] of each ATOM line
    values  : list of [x, y, z] -- characters [31:38], [38:46], [46:54]
    aa_nums : list of str      -- characters [22:27]
    chain   : list of str      -- character  [21:22]
    """
    with open(gfile, 'r') as handle:
        records = handle.readlines()

    # Keep only ATOM records; all four outputs are index-aligned with this list.
    atom_rows = [rec for rec in records if rec[0:6].strip() == "ATOM"]

    cg_nums = [rec[12:16].strip() for rec in atom_rows]
    values = [
        [rec[31:38].strip(), rec[38:46].strip(), rec[46:54].strip()]
        for rec in atom_rows
    ]
    aa_nums = [rec[22:27].strip() for rec in atom_rows]
    chain = [rec[21:22].strip() for rec in atom_rows]
    return cg_nums, values, aa_nums, chain


def nn_in_dmax(values, dmax):
    """Find, for every point, the indices of all points within ``dmax`` of it.

    Parameters
    ----------
    values : sequence of coordinate rows, used both to build the k-d tree
             and as the query points.
    dmax   : search radius; anything ``float()`` accepts.

    Returns
    -------
    The result of ``cKDTree.query_ball_point`` for all input points: one
    list of neighbour indices per point, indexed like ``values``. Each point
    is within distance 0 of itself, so its own index is part of its list.
    """
    kd_tree = sp.cKDTree(values)
    radius = float(dmax)
    return kd_tree.query_ball_point(values, radius)


def make_NRaa(nn_inds, aa_nums, chain):
    """Drop neighbours that belong to the same amino acid as the central CG.

    A neighbour is dropped when both its chain and its amino-acid number
    equal those of the central entry ``i`` (this also drops ``i`` itself).

    Parameters
    ----------
    nn_inds : sequence of lists; ``nn_inds[i]`` holds neighbour indices of
              entry ``i``. Each inner list is filtered in place.
    aa_nums : amino-acid number per entry, indexed like ``nn_inds``.
    chain   : chain identifier per entry, indexed like ``nn_inds``.

    Returns
    -------
    The same ``nn_inds`` object, after filtering.
    """
    for i in range(len(nn_inds)):
        own_residue = (chain[i], aa_nums[i])
        # Build the filtered list first and assign it back through a slice.
        # Calling .remove() while iterating over the same list skips the
        # element that follows each removal, leaving same-residue neighbours
        # behind. Slice assignment keeps the inner list object identical.
        nn_inds[i][:] = [
            neighbour
            for neighbour in nn_inds[i]
            if (chain[int(neighbour)], aa_nums[int(neighbour)]) != own_residue
        ]

    return nn_inds
    
    
def write_db(csv_file, f_mode, gpdb_num, cg_nums, NRaa_nn_inds, values):
    """Write one CSV row per central group, followed by its neighbours.

    Row layout:
    ['GPDB #', 'Central Group', 'x', 'y', 'z',
     'Group1', 'x1', 'y1', 'z1', 'Group2', 'x2', 'y2', 'z2', ...]

    Parameters
    ----------
    csv_file     : path of the CSV file to write.
    f_mode       : mode string passed to ``open`` (e.g. 'w' or 'a+').
    gpdb_num     : identifier written in the first column of every row.
    cg_nums      : group label per entry.
    NRaa_nn_inds : per-entry iterable of neighbour indices into
                   ``cg_nums`` / ``values``.
    values       : per-entry coordinate triple.
    """
    # The context manager guarantees the rows are flushed and the handle is
    # closed even if a row raises. newline='' is what the csv module asks for
    # so the writer controls line endings itself.
    with open(csv_file, f_mode, newline='') as f:
        w = csv.writer(f)

        for i, neighbours in enumerate(NRaa_nn_inds):
            row = [gpdb_num, cg_nums[i]]
            row.extend(values[i])
            for neighbour in neighbours:
                j = int(neighbour)
                row.append(cg_nums[j])
                row.extend(values[j])

            w.writerow(row)
    

# Configuration for the feature-extraction run.
gpdb_dir = os.getcwd()          # directory scanned for *.gpdb files
csv_file = 'feat_sample.csv'    # output CSV, resolved against the working directory
dmax = 8                        # neighbour search radius passed to nn_in_dmax

# NOTE: rows are appended ('a+'), so re-running this cell without deleting
# csv_file first duplicates every row.
# sorted(): os.listdir returns names in arbitrary order; sorting makes the
# row order of the output reproducible.
for filename in sorted(os.listdir(gpdb_dir)):
    if not filename.endswith(".gpdb"):
        continue

    # File name without the ".gpdb" suffix identifies the structure.
    gpdb_num = filename[:-len(".gpdb")]
    # Join with gpdb_dir so the loop keeps working when gpdb_dir is not the
    # current working directory.
    cg_nums, values, aa_nums, chain = fileread(os.path.join(gpdb_dir, filename))
    nn_inds = nn_in_dmax(values, dmax)
    NRaa_nn_inds = make_NRaa(nn_inds, aa_nums, chain)
    write_db(csv_file, 'a+', gpdb_num, cg_nums, NRaa_nn_inds, values)
        
    

---
# Extract input features
---

In [1]:
# Count number of columns in the file
import csv
import sys

csv_file = 'feat_db_5_sample.csv'

mx = 0    # largest number of columns seen in any row so far
ind = 0   # 0-based index of the first row that has mx columns

# Stream the file row by row; the context manager closes it afterwards.
with open(csv_file, 'r', newline='') as f:
    r = csv.reader(f)
    for i, line in enumerate(r):
        if mx < len(line):
            mx = len(line)
            ind = i

print('Max # of col =', mx)
print('Row with max cols =', ind)


Max # of col = 137
Row with max cols = 775272


In [11]:
'''
Make input data of the form:
[Central CG, # of r1, # of r2, ..., # of r16]
'''

import csv
import numpy as np

csv_file = 'feat_5_sample.csv'

# Header: central group followed by one count column per group r1..r16.
ls = ['Central Group'] + ['# of r%d' % k for k in range(1, 17)]

# Both files are managed by one `with`, so they are closed even when a
# malformed row raises part-way through. newline='' lets the csv module
# handle line endings itself.
with open(csv_file, 'r', newline='') as f, open('feat_5_PSSM.csv', 'w', newline='') as g:
    r = csv.reader(f)
    w = csv.writer(g)
    w.writerow(ls)

    for line in r:
        temp = [0]*17
        # Column 1 is the central group label: one prefix character followed
        # by an integer. It is stored minus one.
        temp[0] = int(line[1][1:]) - 1
        # From column 5 on, every 4th field is a neighbour's group label
        # (the three fields after it are skipped). Count label n in temp[n].
        for ele in line[5::4]:
            temp[int(ele[1:])] += 1
        w.writerow(temp)


---
---
# Rough Work
---
---

In [2]:
# Scratch check: removing from an inner list changes the nested structure in place.
myList = [[2, 4, 6], [9, 11, 13]]
first_inner = myList[0]
first_inner.remove(2)   # same list object as myList[0]
print(myList)

[[4, 6], [9, 11, 13]]


In [36]:
# Scratch: generate the 20 PSSM column names 'PSSM col1' .. 'PSSM col20'.
a = ['PSSM col{}'.format(col) for col in range(1, 21)]
print(a)

['PSSM col1', 'PSSM col2', 'PSSM col3', 'PSSM col4', 'PSSM col5', 'PSSM col6', 'PSSM col7', 'PSSM col8', 'PSSM col9', 'PSSM col10', 'PSSM col11', 'PSSM col12', 'PSSM col13', 'PSSM col14', 'PSSM col15', 'PSSM col16', 'PSSM col17', 'PSSM col18', 'PSSM col19', 'PSSM col20']


In [1]:
import json

# Scratch: load one PSSM JSON file and look at its shape.
with open('PSSM/101M','r') as f:
    data = json.loads(f.read())

# Number of top-level keys, then the first entry stored under key 'A'.
n_top_level = len(data)
first_entry = data['A'][0]
print(n_top_level)
print(first_entry)

1
{'index': 1, 'res_id': 0, 'aa': 'M', 'iter': {'2': {'pssm': ['-3', '-4', '-5', '-6', '-4', '-3', '-5', '-5', '-4', '-1', '0', '-4', '10', '-2', '-5', '-4', '-3', '-4', '-3', '-2'], 'psfm': ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '100', '0', '0', '0', '0', '0', '0', '0'], 'a': '2.33', 'b': '0.52'}, '3': {'pssm': ['-3', '-3', '-4', '-5', '-3', '-2', '-4', '-5', '-4', '-1', '0', '-3', '10', '-2', '-5', '-4', '-3', '-3', '-3', '-1'], 'psfm': ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '100', '0', '0', '0', '0', '0', '0', '0'], 'a': '2.11', 'b': '0.44'}}}


In [35]:
# Scratch: lay the iteration-'3' PSSM rows of the first two 'A' entries end
# to end in one flat list (entries stay strings, as stored in the JSON).
pssm_first = data['A'][0]['iter']['3']['pssm']
pssm_second = data['A'][1]['iter']['3']['pssm']

arr = [0]*42
# Slice assignment replaces the slice with the new items, so the list can
# shrink: the second assignment swaps everything from index 20 onward.
arr[:20] = pssm_first
arr[20:] = pssm_second

print(len(arr))
print(arr)

40
['-3', '-3', '-4', '-5', '-3', '-2', '-4', '-5', '-4', '-1', '0', '-3', '10', '-2', '-5', '-4', '-3', '-3', '-3', '-1', '0', '-2', '-1', '-2', '-2', '-1', '-1', '3', '-2', '-2', '-2', '-2', '0', '-3', '-2', '3', '2', '-4', '-3', '0']


In [43]:
'''
Make input data of the form:
[Central CG, # of r1, # of r2, ..., # of r16, PSSM col1, PSSM col2, ..., PSSM col20]
'''

import csv
import json  # needed for json.load below; previously only imported by another cell
import numpy as np

csv_file = 'feat_db_5_NN_train.csv'

# Header: central group, 16 neighbour-count columns, 20 PSSM columns (37 total).
ls = (['Central Group']
      + ['# of r%d' % k for k in range(1, 17)]
      + ['PSSM col%d' % k for k in range(1, 21)])

# NOTE(review): this writes to 'feat_5_PSSM.csv', the same output name used
# by the neighbour-count-only cell -- confirm the overwrite is intended.
# One `with` for both files so they are closed even if a row or a PSSM
# lookup raises. newline='' lets the csv module handle line endings.
with open(csv_file, 'r', newline='') as f, open('feat_5_PSSM.csv', 'w', newline='') as g:
    r = csv.reader(f)
    w = csv.writer(g)
    w.writerow(ls)

    for line in r:
        temp = [0]*37
        # Column 3 is the central group label: one prefix character followed
        # by an integer. It is stored minus one.
        temp[0] = int(line[3][1:]) - 1
        # From column 7 on, every 4th field is a neighbour's group label.
        # Count label n in temp[n].
        for ele in line[7::4]:
            temp[int(ele[1:])] += 1

        # Columns 0..2 locate the PSSM record: file name, key, list index.
        pdb = line[0]
        chain = line[1]
        aa_ind = int(line[2])
        p_file = 'PSSM/%s'%pdb
        with open(p_file,'r') as p:
            data = json.load(p)
        pssv = data[chain][aa_ind]['iter']['3']['pssm']
        pssv = [int(k) for k in pssv]
        # Replaces everything from index 17 on with the PSSM values.
        temp[17:] = pssv

        w.writerow(temp)
        # NOTE(review): only the first data row is processed because of this
        # break. It looks like a debugging limit -- remove it for a full run.
        break



In [1]:
import pandas as pd
# Scratch: Series.idxmax() gives the index label of the largest value.
arr = pd.Series([1, 2, 3, 4])

max_label = arr.idxmax()
print(max_label)

3
