# 데이터 단순 추출

In [22]:
import os
import pandas as pd

def transform_u_to_t(seq):
    """Return ``seq`` with every uppercase 'U' swapped for 'T'.

    Only the uppercase letter is substituted; every other character
    (including a lowercase 'u') is passed through unchanged.
    """
    # Splitting on 'U' and re-joining with 'T' substitutes each occurrence.
    pieces = seq.split('U')
    return 'T'.join(pieces)

def get_base_list(rslt_lst, seq):
    """Append every concrete expansion of ``seq`` to ``rslt_lst``.

    Each 'N' in ``seq`` is a wildcard that is replaced by 'A', 'C', 'G'
    and 'T' in turn. Expansions are appended in order, with the leftmost
    'N' varying slowest. A sequence without any 'N' is appended as-is.
    The list is modified in place and nothing is returned.
    """
    # Depth-first walk with an explicit stack instead of recursion.
    todo = [seq]
    while todo:
        current = todo.pop()
        wildcard_at = current.find('N')

        if wildcard_at < 0:
            # Fully resolved: no wildcard left.
            rslt_lst.append(current)
            continue

        head = current[:wildcard_at]
        tail = current[wildcard_at + 1:]
        # Push in reverse so that 'A' is popped (and fully expanded) first.
        for letter in ('T', 'G', 'C', 'A'):
            todo.append(head + letter + tail)

def merge_data():
    """Collect the sequences of every file in ``./data`` into one DataFrame.

    For each directory entry the protein name is taken from the file name
    (all underscore-separated parts except the last four). The first line
    of the file is skipped as a header; every following line is split on
    tabs, its second-to-last field is read as the sequence and its last
    field as the bound label. 'U' is converted to 'T' and every 'N'
    wildcard is expanded, so one input line can yield several rows that
    share the same label.

    Returns:
        pandas.DataFrame with the columns ``protein``, ``seq`` and
        ``bound`` (``bound`` holds the raw text of the last field).

    Raises:
        IndexError: if a non-blank data line has fewer than two
            tab-separated fields.
    """
    name = []
    seq = []
    bound = []

    # NOTE(review): every entry of ./data is read. The batch-export cell
    # further down writes meta_data_*.csv into the same directory, so a
    # re-run after that cell will try to parse those files too -- confirm
    # whether the inputs should be filtered or the outputs moved.
    for d in os.listdir('./data'):
        # The context manager closes the file even if parsing fails.
        with open('./data/' + d, 'r') as f:
            protein = "_".join(d.split('_')[:-4])
            f.readline()  # skip the header line

            print(protein, 'progressing...')
            for l in f:  # stream the file instead of loading it whole
                l = l.rstrip()
                if not l:
                    # Tolerate blank lines (e.g. a trailing one at EOF).
                    continue
                t = l.split('\t')

                # Work out every concrete sequence this line stands for.
                seq_one = transform_u_to_t(t[-2])
                seq_lst = []
                get_base_list(seq_lst, seq_one)

                name.extend([protein] * len(seq_lst))
                seq.extend(seq_lst)
                bound.extend([t[-1]] * len(seq_lst))

        print(protein, 'clear!')

    bound_df = pd.DataFrame({'protein': name, 'seq': seq, 'bound': bound})
    return bound_df

In [23]:
# Build the merged table, show a preview after one blank line, and persist
# it so that the cleaning step can reload it from disk.
bound_df = merge_data()
print('', bound_df.head(), sep='\n')
bound_df.to_csv('raw_data.csv', index=False)

Alx1_DBD progressing...
Alx1_DBD clear!
Alx1_DBD progressing...
Alx1_DBD clear!
ALX3_FL progressing...
ALX3_FL clear!
ALX3_FL progressing...
ALX3_FL clear!
AR_DBD progressing...
AR_DBD clear!
AR_DBD progressing...
AR_DBD clear!
AR_FL progressing...
AR_FL clear!
AR_FL progressing...
AR_FL clear!

    protein                   seq bound
0  Alx1_DBD  GCAGATAATCTAATTACCCC     1
1  Alx1_DBD  CTCAGTCCTCGTCTCGATGG     1
2  Alx1_DBD  TCATAATCTAATTACGCTCG     1
3  Alx1_DBD  GACTTCCTCAATCTAATTAG     1
4  Alx1_DBD  GCAGTTAATCTAATTAACCG     1


# 데이터 정제

In [3]:
import pandas as pd

# Reload the merged table written by the extraction step.
bound_df = pd.read_csv('raw_data.csv')

# Preview the first rows, then a blank line, then the (rows, columns) shape.
print(bound_df.head(), '', bound_df.shape, sep='\n')

    protein                   seq  bound
0  Alx1_DBD  GCAGATAATCTAATTACCCC      1
1  Alx1_DBD  CTCAGTCCTCGTCTCGATGG      1
2  Alx1_DBD  TCATAATCTAATTACGCTCG      1
3  Alx1_DBD  GACTTCCTCAATCTAATTAG      1
4  Alx1_DBD  GCAGTTAATCTAATTAACCG      1

(1132457, 3)


In [4]:
from itertools import product

def kmer(seq, k):
    """Return the relative frequency of every DNA k-mer in ``seq``.

    All ``4**k`` strings over 'A', 'C', 'G', 'T' are used as keys, in
    lexicographic order. Each value is the number of (overlapping)
    length-``k`` windows of ``seq`` equal to that key, divided by the
    total number of windows, ``len(seq) - k + 1``.

    If ``seq`` is too short to contain a single window, every frequency
    is 0.0 (previously this raised ZeroDivisionError or divided by a
    negative window count).

    Args:
        seq: sequence string made up of 'A', 'C', 'G' and 'T'.
        k: window length.

    Returns:
        dict mapping each k-mer to its relative frequency.

    Raises:
        KeyError: if a window contains a character other than
            'A', 'C', 'G' or 'T'.
    """
    keys = [''.join(letters) for letters in product('ACGT', repeat=k)]

    n_windows = len(seq) - k + 1
    if n_windows <= 0:
        # No window fits: avoid dividing by zero or a negative count.
        return dict.fromkeys(keys, 0.0)

    counts = dict.fromkeys(keys, 0)
    for start in range(n_windows):
        counts[seq[start:start + k]] += 1

    # Normalise the counts to relative frequencies.
    for key in keys:
        counts[key] /= n_windows
    return counts

In [19]:
from time import sleep

# Compute k-mer frequencies (k = 1..4) for every sequence and write them out
# in batches, so that the full feature table never has to sit in memory.
split_by = 100  # number of regular batches
split_cnt = len(bound_df)//split_by  # rows per regular batch

for split_no in range(split_by+1):
    start = split_cnt*split_no
    # The final batch takes every remaining row. With a fixed-size last
    # batch, rows were silently dropped whenever the remainder exceeded
    # split_cnt (and nothing was processed for fewer than split_by rows).
    stop = start + split_cnt if split_no < split_by else len(bound_df)
    if start >= stop:
        # Nothing to do for this batch; do not write a header-only file.
        continue

    print('progressing... batch ' + str(split_no), end='   ')

    # Rows of this batch only.
    batch = bound_df.iloc[start:stop]

    # One DataFrame of k-mer frequencies per k, one row per sequence.
    meta_list = [
        pd.DataFrame([kmer(val, j) for val in batch['seq']])
        for j in range(1, 5)
    ]

    # Merge with the original columns and store the batch.
    # reset_index() aligns the rows with the 0-based k-mer frames; it also
    # keeps the previous index as an 'index' column in the output.
    # NOTE(review): the files go into ./data, the directory merge_data()
    # reads its inputs from -- confirm this is intended.
    bdorigin = batch.reset_index()
    concats = pd.concat([bdorigin, pd.concat(meta_list, axis=1)], axis=1)
    concats.to_csv('./data/meta_data_' + str(split_no) + '.csv', index=False)

    print('done!  -- wait 5 sec.')
    sleep(5)

progressing... batch 0   done!  -- wait 5 sec.
progressing... batch 1   done!  -- wait 5 sec.
progressing... batch 2   done!  -- wait 5 sec.
progressing... batch 3   done!  -- wait 5 sec.
progressing... batch 4   done!  -- wait 5 sec.
progressing... batch 5   done!  -- wait 5 sec.
progressing... batch 6   done!  -- wait 5 sec.
progressing... batch 7   done!  -- wait 5 sec.
progressing... batch 8   done!  -- wait 5 sec.
progressing... batch 9   done!  -- wait 5 sec.
progressing... batch 10   done!  -- wait 5 sec.
progressing... batch 11   done!  -- wait 5 sec.
progressing... batch 12   done!  -- wait 5 sec.
progressing... batch 13   done!  -- wait 5 sec.
progressing... batch 14   done!  -- wait 5 sec.
progressing... batch 15   done!  -- wait 5 sec.
progressing... batch 16   done!  -- wait 5 sec.
progressing... batch 17   done!  -- wait 5 sec.
progressing... batch 18   done!  -- wait 5 sec.
progressing... batch 19   done!  -- wait 5 sec.
progressing... batch 20   done!  -- wait 5 sec.
pr

KeyboardInterrupt: 

In [None]:
# 단일 데이터로 병합