In [1]:
import pandas as pd
import numpy as np
import csv
import os

### CRF genes in complexes: annotation

In [2]:
# Selection table of complexes: later cells read its 'complex', 'abbr' and 'use' columns.
# NOTE(review): this is an absolute path on a local volume, unlike the GitHub URLs used for
# the other two inputs, so the notebook only runs where that volume is mounted.
# TODO confirm whether the same file is available in the repository and point to it instead.
go_use_file = '/Volumes/Yolanda1TB/CRF_Screen/Ref/GO_terms/complex_count_rank_select_for_plotting.csv'
go_use_df = pd.read_csv(go_use_file)

# Per-gene membership table: later cells read 'gene_name' plus one column per complex,
# whose cells are compared against the string 'Yes'.
crf_go_file = 'https://raw.githubusercontent.com/ScrippsPipkinLab/CRF_Screen/master/Ref/GO_terms/CRM_complexes_count.csv'
crf_go_df = pd.read_csv(crf_go_file)

# Gene annotation table: later cells read its 'gene_name' and 'complexNames' columns.
crf_anno_file = 'https://raw.githubusercontent.com/ScrippsPipkinLab/CRF_Screen/master/Ref/HGSCore_only-CRF_anno.csv'
crf_anno_df = pd.read_csv(crf_anno_file)

*Find alternative gene names*

In [3]:
# Table pairing each gene symbol ('gene_name') with an alternative symbol ('Alternative').
crf_alt_names_file = 'https://raw.githubusercontent.com/ScrippsPipkinLab/CRF_Screen/master/Ref/CRF_alternative_gn.csv'
crf_alt_names_df = pd.read_csv(crf_alt_names_file)

# Forward lookup: gene_name -> Alternative (a later row wins if a gene_name repeats).
alt_names_dict = dict(zip(crf_alt_names_df['gene_name'],
                          crf_alt_names_df['Alternative']))
# Reverse lookup: Alternative -> gene_name.
alt_names_dict_rev = dict(zip(crf_alt_names_df['Alternative'],
                              crf_alt_names_df['gene_name']))

def alt_genes(in_list):
    """Return the alternative names known for the genes in ``in_list``.

    Each gene is looked up first in ``alt_names_dict`` (gene_name -> Alternative)
    and, only if absent there, in ``alt_names_dict_rev`` (Alternative -> gene_name).
    Genes found in neither table contribute nothing, so the result can be
    shorter than the input. The input list is not modified.
    """
    found = []
    for gene in in_list:
        # The forward table takes priority; stop at the first table that knows the gene.
        for table in (alt_names_dict, alt_names_dict_rev):
            if gene in table:
                found.append(table[gene])
                break
    return found

## Compile

In [4]:
def remove_genes_from_dict(genes_list, keep_key, in_dict):
    """Strip ``genes_list`` from every category of ``in_dict`` except ``keep_key``.

    ``in_dict`` maps a category name to a list of genes. It is modified in
    place and also returned. Every category other than ``keep_key`` is rebuilt
    through a set, so its duplicates are dropped and its order is not
    preserved. The ``keep_key`` entry, if present, is left untouched.
    """
    for category in in_dict:
        if category == keep_key:
            continue
        remaining = set(in_dict[category]) - set(genes_list)
        in_dict[category] = list(remaining)
    return in_dict

In [5]:
# Category name -> list of gene names; filled in by the cells below.
all_complexes_dict = dict()

In [6]:
# Distinct complex names from the annotation table; entries whose string form is 'nan'
# (presumably rows without a complex annotation) are skipped.
uniq_complexes = [x for x in np.unique(crf_anno_df['complexNames'].tolist()) if str(x) != 'nan']

# One category per annotated complex: its member genes (capitalized) followed by
# whatever alternative names alt_genes() knows for them.
for complex_name in uniq_complexes:
    in_complex = crf_anno_df['complexNames'] == complex_name
    members = [gene.capitalize() for gene in crf_anno_df.loc[in_complex, 'gene_name'].tolist()]
    all_complexes_dict[complex_name] = members + alt_genes(members)

In [7]:
# Complexes flagged with use == 'yes' in the selection table.
go_use_terms = go_use_df[go_use_df['use'] == 'yes']['complex'].tolist()

for term in go_use_terms:
    # Categories are stored under the abbreviated name, so several terms may share one key.
    short_name = go_use_df.loc[go_use_df['complex'] == term, 'abbr'].tolist()[0]
    # Genes marked 'Yes' in this term's column, capitalized, plus their alternative names.
    term_genes = [gene.capitalize()
                  for gene in crf_go_df.loc[crf_go_df[term] == 'Yes', 'gene_name'].tolist()]
    term_genes = term_genes + alt_genes(term_genes)
    # Create the category on first sight, otherwise append to the existing list.
    all_complexes_dict.setdefault(short_name, []).extend(term_genes)

### Special cases

In [8]:
def reserve_genes_for_key(genes_list, keep_key, in_dict):
    """Assign ``genes_list`` exclusively to the category ``keep_key``.

    The genes are added to ``in_dict[keep_key]`` (created if missing) and
    removed from every other category.

    Parameters
    ----------
    genes_list : iterable of gene names to reserve. It is never stored by
        reference, so later changes to the caller's list do not leak into
        ``in_dict``.
    keep_key : category that should own the genes.
    in_dict : dict mapping category name -> list of gene names. Modified in
        place.

    Returns
    -------
    The same ``in_dict`` object, for call sites that re-assign the result.

    Every category comes out free of duplicates and keeps first-occurrence
    order, so repeated runs produce the same ordering.
    """
    # Order-preserving de-duplication of the reserved genes.
    reserved = list(dict.fromkeys(genes_list))
    reserved_set = set(reserved)

    # Add genes_list to the specific category (existing members first, no duplicates).
    existing = list(in_dict.get(keep_key, []))
    in_dict[keep_key] = list(dict.fromkeys(existing + reserved))

    # Remove genes_list from the other categories.
    for key in list(in_dict):
        if key != keep_key:
            in_dict[key] = [gene for gene in dict.fromkeys(in_dict[key])
                            if gene not in reserved_set]
    return in_dict

In [9]:
# Every gene whose name contains 'Chd' is assigned exclusively to the 'Chd' category.
chd_genes = [gene for gene in crf_go_df['gene_name'] if 'Chd' in gene]
all_complexes_dict = reserve_genes_for_key(
    chd_genes, 'Chd', all_complexes_dict)

In [10]:
# Hand-picked names plus Brd1 ... Brd9, all assigned exclusively to 'Bromodomain'.
bromo_genes = ['Bptf', 'Brdt', 'Brpf1', 'Brpf3', 'Brwd1', 'Brwd3']
bromo_genes += ['Brd{}'.format(n) for n in range(1, 10)]
all_complexes_dict = reserve_genes_for_key(
    bromo_genes, 'Bromodomain', all_complexes_dict)

In [11]:
# Cbx1 ... Cbx8 are assigned exclusively to the 'Chromodomain' category.
cbx_genes = ['Cbx{}'.format(n) for n in range(1, 9)]
all_complexes_dict = reserve_genes_for_key(
    cbx_genes, 'Chromodomain', all_complexes_dict)

In [12]:
# Genes whose name contains 'Hdac' or 'Sirt' are assigned exclusively to 'HDAC'.
# 'Sirt' is matched against the gene table like every other family in this section;
# appending the bare literal 'Sirt' would write a non-existent gene symbol to the
# output and leave the actual Sirt* entries unreserved.
# NOTE(review): assumes the Sirt* genes are listed in crf_go_df['gene_name'] - confirm.
hdac_genes = ([x for x in crf_go_df['gene_name'] if 'Hdac' in x]
              + [x for x in crf_go_df['gene_name'] if 'Sirt' in x])
all_complexes_dict = reserve_genes_for_key(hdac_genes, 'HDAC', all_complexes_dict)

In [13]:
# Every gene whose name contains 'Mbd' is assigned exclusively to the
# 'DNA Methylation - Demethylation' category.
mbd_genes = [gene for gene in crf_go_df['gene_name'] if 'Mbd' in gene]
all_complexes_dict = reserve_genes_for_key(
    mbd_genes, 'DNA Methylation - Demethylation', all_complexes_dict)

In [14]:
# Genes whose name contains 'Kdm', followed by genes whose name contains 'Jmj',
# are assigned exclusively to the 'Histone Demethylation' category.
kdm_matches = [gene for gene in crf_go_df['gene_name'] if 'Kdm' in gene]
jmj_matches = [gene for gene in crf_go_df['gene_name'] if 'Jmj' in gene]
hdm_genes = kdm_matches + jmj_matches
all_complexes_dict = reserve_genes_for_key(
    hdm_genes, 'Histone Demethylation', all_complexes_dict)

In [15]:
# Combined category: ISWI members followed by Ino80 members. dict.fromkeys drops a
# gene that appears in both lists (keeping its first position) so that the output
# file does not get the same (complex, gene) row twice.
all_complexes_dict['ISWI - Ino80'] = list(dict.fromkeys(
    all_complexes_dict['ISWI'] + all_complexes_dict['Ino80']))

### Write only the selected complexes

In [16]:
# Categories of all_complexes_dict that are written to the output file.
use_complexes = [
    'BAF',
    'ISWI - Ino80',
    'Chd',
    'Histone Acetylation',
    'Histone Methylation',
    'Histone Ubiquitination',
    'Arginine Methylation',
    'DNA Methylation - Demethylation',
    'HDAC',
    'Histone Demethylation',
    'Bromodomain',
    'Chromodomain',
]

In [17]:
out_file = 'CRF_complexes.csv'
# newline="" hands line-ending control to the csv module; without it the writer's
# '\r\n' terminator is translated again on Windows and every row is followed by a
# blank line.
with open(out_file, "w", newline="") as fout:
    wfout = csv.writer(fout, delimiter=",")
    wfout.writerow(['complex', 'gene_name'])

    # One (complex, gene) row per member, in the dictionary's own order, restricted
    # to the categories listed in use_complexes.
    for i in all_complexes_dict.keys():
        if i in use_complexes:
            wfout.writerows([i, g] for g in all_complexes_dict[i])