In [53]:
import pandas as pd
import os
import glob
# Capture the notebook's starting directory only once per kernel session:
# if `code_dir` already exists it is left untouched, so re-running this cell
# after the working directory has changed does not move it.
try:
    code_dir
except NameError:
    code_dir = os.getcwd()
# Derive base_dir on every run (not only inside the except branch) so it is
# always defined, even when code_dir was set earlier without it.
# base_dir is code_dir with the "/codes_local" component removed.
base_dir = code_dir.replace("/codes_local", "")

### 0. Filter out counts that are not on the expected strand

In [73]:
def filterWrongDir(in_file):
    """Keep only the rows of a count table that lie on the expected strand.

    The table is read as tab-separated text with its first line skipped
    (presumably the featureCounts "# Program: ..." comment line -- confirm).
    The last column is renamed to ``count``. Rows whose ``Strand`` value
    equals the expected strand are written, as CSV, next to the input file
    with the extension replaced by ``_flt.csv``.

    The expected strand is "+" when the file *name* contains "neg" and "-"
    otherwise (presumably because the library is reverse-stranded -- confirm
    against the protocol).

    Parameters
    ----------
    in_file : str
        Path to the count table, e.g. ``.../337_1_neg_mRNA.txt``.

    Returns
    -------
    None
        The filtered table is written to disk and the percentage of counts
        kept is printed.
    """
    # Only inspect the file name: a parent directory containing "neg" must not
    # influence the strand choice.
    file_name = os.path.basename(in_file)
    # Swap the extension only; str.replace would touch every ".txt" in the path
    # and would leave a non-.txt path unchanged, overwriting the input.
    out_file = os.path.splitext(in_file)[0] + "_flt.csv"
    in_strand = "+" if "neg" in file_name else "-"

    in_df = pd.read_csv(in_file, sep="\t", skiprows=1)
    in_df.columns = in_df.columns.tolist()[:-1] + ["count"]
    in_df_flt = in_df[in_df["Strand"] == in_strand]
    in_df_flt.to_csv(out_file, index=False)

    # Report on the same "count" column that is written out, addressed by name
    # instead of by position.
    total_count = in_df["count"].sum()
    kept_count = in_df_flt["count"].sum()
    # An empty or all-zero table has no meaningful percentage; report NaN
    # instead of failing on a division by zero.
    kept_pctg = kept_count / total_count * 100 if total_count else float("nan")
    print(file_name, "kept:", kept_pctg, "%")

In [74]:
# Strand-filter every per-sample count table under 2_featureCounts.
# glob returns files in arbitrary order; sort so the printed log is
# reproducible between runs.
count_outs = sorted(glob.glob("%s/2_featureCounts/*mRNA.txt"%base_dir))
if not count_outs:
    # Fail here with a clear message instead of letting the compile step
    # fail later on a missing *_flt.csv file.
    raise FileNotFoundError("No *mRNA.txt files found in %s/2_featureCounts"%base_dir)
for count_out in count_outs:
    filterWrongDir(count_out)

337_23_pos_mRNA.txt kept: 84.34447720228316 %
337_3_pos_mRNA.txt kept: 85.40283124033745 %
337_6_neg_mRNA.txt kept: 76.36161405675732 %
337_5_neg_mRNA.txt kept: 77.47129121850405 %
337_11_pos_mRNA.txt kept: 85.23022122255053 %
337_14_pos_mRNA.txt kept: 77.48930695227538 %
337_8_neg_mRNA.txt kept: 79.36359057471279 %
337_2_neg_mRNA.txt kept: 77.20361643194484 %
337_24_pos_mRNA.txt kept: 85.09121280989696 %
337_13_pos_mRNA.txt kept: 84.54401398538445 %
337_9_pos_mRNA.txt kept: 83.97136366869874 %
337_11_neg_mRNA.txt kept: 88.7163544520093 %
337_14_neg_mRNA.txt kept: 85.09889999499491 %
337_12_neg_mRNA.txt kept: 82.14490804353326 %
337_20_pos_mRNA.txt kept: 86.00292040642825 %
337_13_neg_mRNA.txt kept: 84.32762514777356 %
337_3_neg_mRNA.txt kept: 85.66139362726531 %
337_20_neg_mRNA.txt kept: 84.93970285698569 %
337_9_neg_mRNA.txt kept: 86.18701457212585 %
337_18_neg_mRNA.txt kept: 84.85339124506687 %
337_4_pos_mRNA.txt kept: 85.87462199452047 %
337_16_neg_mRNA.txt kept: 82.44113990305554 %

### 1. Compile

In [157]:
### Get all genes in the reference to compile csv file
ref_gff = "%s/codes_hpc/Mus_musculus.GRCm38.102.mRNA.simp.rmdup.srt.gff3"%base_dir
gff_df = pd.read_csv(ref_gff, sep='\t', dtype={0:'str'}, header=None)
# Column 8 holds the attribute field; stripping "ID=" leaves the gene name
# (assumes the attribute field contains nothing but the ID -- confirm).
all_genes = [x.replace("ID=","") for x in gff_df[8]]

# Build one table: one row per reference gene, one column per sample.
compiled_df = pd.DataFrame({"gene_name": all_genes})
for i in range(1,25):
    # Sample 22 is deliberately skipped (the reason is not recorded in this notebook).
    if i == 22:
        continue
    i_pos = "%s/2_featureCounts/337_%s_pos_mRNA_flt.csv"%(base_dir, i)
    i_neg = "%s/2_featureCounts/337_%s_neg_mRNA_flt.csv"%(base_dir, i)
    i_pos_df = pd.read_csv(i_pos)
    i_neg_df = pd.read_csv(i_neg)
    # Stack the two strand-filtered tables of this sample.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    i_df = pd.concat([i_pos_df, i_neg_df])[['Geneid', 'count']]
    i_df.columns = ["gene_name", "337_%s"%i]
    # Left merge keeps every reference gene; genes absent from this sample get NaN.
    compiled_df = compiled_df.merge(i_df, how="left", on="gene_name")

In [158]:
# Map sample ids (sample sheet column "sp_order", with "-" replaced by "_")
# to sample names (column "sp_name").
info_file = "%s/info/sample_sheet.csv"%base_dir
info_df = pd.read_csv(info_file)
info_dict = {
    sp_order.replace("-","_"): sp_name
    for sp_order, sp_name in zip(info_df['sp_order'].tolist(), info_df['sp_name'].tolist())
}

In [159]:
# Index by gene and rename the sample-id columns to sample names.
compiled_df = compiled_df.set_index("gene_name")
# Still a KeyError for a sample missing from the sample sheet, but one that
# names every missing id instead of only the first.
unmapped_cols = [x for x in compiled_df.columns if x not in info_dict]
if unmapped_cols:
    raise KeyError("Samples missing from sample sheet: %s"%unmapped_cols)
compiled_df.columns = [info_dict[x] for x in compiled_df.columns]
# Nothing earlier in the notebook creates the output directory; create it so
# to_csv does not fail on a fresh checkout.
os.makedirs("%s/2_featureCounts_merged"%base_dir, exist_ok=True)
compiled_df.to_csv("%s/2_featureCounts_merged/Exp337_count.csv"%base_dir)

In [160]:
# Keep genes whose highest count across all samples reaches the threshold
# (5 or 10). Genes with no counts at all (max is NaN) are dropped, because
# NaN >= threshold evaluates to False.
gene_max_n = compiled_df.max(axis=1)
# .values gives a positional boolean mask, the same selection the original
# list of booleans made, without any index alignment.
compiled_df_c5 = compiled_df[(gene_max_n >= 5).values]
compiled_df_c10 = compiled_df[(gene_max_n >= 10).values]
compiled_df_c5.to_csv("%s/2_featureCounts_merged/Exp337_count_c5.csv"%base_dir)
compiled_df_c10.to_csv("%s/2_featureCounts_merged/Exp337_count_c10.csv"%base_dir)