## Make normalized counts of each position in the genome from a bed file
### Note: this was made specifically to work with bed files in which each record covers a single 1 bp position (in this case the 3' end of a read)

### input: 3' end bed file (or any bed file). If a record's range covers the whole read rather than 1 bp, only the end-coordinate position is counted and the rest of the positions will show a count of 0 (e.g. a read at positions 180-190 in the genome will only show a value for 190)

### output: will save dataframes to csv in the directory you are currently in

In [1]:
import pandas as pd
import numpy as np

In [7]:
def _count_per_position(bed_df, g_length):
    """Count how often each value of column 2 occurs, for positions 0..g_length-1."""
    counts = pd.DataFrame(bed_df[2].value_counts())
    # reindex both orders the rows by position and fills unseen positions with 0,
    # so no separate sort is needed. Values outside 0..g_length-1 are dropped.
    return counts.reindex(range(0, g_length), fill_value=0)


def make_counts(file_lis, g_length):
    """Build a per-position count table from one plus and one minus bed file.

    Each file is read as a header-less, tab-separated table and the values in
    its third column (index 2) are tallied.

    Parameters
    ----------
    file_lis : iterable of str
        File paths. A path containing 'plus' is read as the plus file;
        otherwise a path containing 'minus' is read as the minus file.
        If several paths match the same strand, the last one read is used.
    g_length : int
        Number of positions in the output; rows are indexed 0..g_length-1.

    Returns
    -------
    pandas.DataFrame
        Columns 'p_count' and 'm_count', one row per position, 0 where no
        record has that value in column 2.

    Raises
    ------
    ValueError
        If a path contains neither 'plus' nor 'minus', or if the plus or the
        minus file is missing from ``file_lis``.
    """
    p_3end = None
    m_3end = None
    for f in file_lis:
        if 'plus' in f:
            p_3end = pd.read_table(f, header=None)
        elif 'minus' in f:
            m_3end = pd.read_table(f, header=None)
        else:
            raise ValueError('There must be one plus file and one minus file. Did not find file with plus and/or minus in the file name')

    # Without this check a missing strand surfaced as an UnboundLocalError below.
    if p_3end is None or m_3end is None:
        missing = 'plus' if p_3end is None else 'minus'
        raise ValueError('There must be one plus file and one minus file. No {} file was given'.format(missing))

    p_counts = _count_per_position(p_3end, g_length)
    m_counts = _count_per_position(m_3end, g_length)
    df = pd.concat([p_counts, m_counts], axis=1)
    df.columns = ['p_count', 'm_count']

    return df

def make_norm_df(sample_lis, norm_lis, genome_length=None):
    """Build one normalized per-position count table per sample.

    For each sample name ``s`` the files ``s_plus_3end.bed`` and
    ``s_minus_3end.bed`` are passed to ``make_counts``; the resulting counts
    are divided by that sample's normalization value and multiplied by
    1,000,000.

    Parameters
    ----------
    sample_lis : sequence of str
        Sample name prefixes used to build the bed file names.
    norm_lis : sequence of numbers
        Normalization value for each sample, in the same order as
        ``sample_lis``.
    genome_length : int, optional
        Number of positions per table. Defaults to the module-level
        ``g_length`` when omitted.

    Returns
    -------
    list of pandas.DataFrame
        One normalized table per sample, in input order.

    Raises
    ------
    ValueError
        If ``sample_lis`` and ``norm_lis`` differ in length.
    """
    if len(sample_lis) != len(norm_lis):
        raise ValueError('sample_lis and norm_lis must have the same length')
    if genome_length is None:
        # Fall back to the notebook-level genome length, as the original code did.
        genome_length = g_length

    df_lis = []
    # The original indexed an undefined name `norm_value`; the normalization
    # values come from the `norm_lis` argument.
    for sample, norm_value in zip(sample_lis, norm_lis):
        df = make_counts(
            ['{}_plus_3end.bed'.format(sample), '{}_minus_3end.bed'.format(sample)],
            genome_length,
        )
        df = (df / norm_value) * 1000000
        df_lis.append(df)
        print('read ' + str(sample) + ' df')
    return df_lis
    



In [9]:
# Run parameters: reference genome length, sample prefixes, and the matching
# per-sample normalization values (number of aligned reads).
g_length = 4639675  # length of the reference genome
sample_lis = ['cv02', 'cv05', 'cv06', 'cv09', 'cv10']
norm_lis = [13550939, 8775189, 8917885, 13773751, 13743872]  # used number of aligned reads for norm value

# Build the normalized tables, then write each one to the current directory
# under its sample name (row index is not written).
df_lis = make_norm_df(sample_lis, norm_lis)
for sample_name, norm_df in zip(sample_lis, df_lis):
    norm_df.to_csv(sample_name + "_normcounts_perpos", index=False)

read cv02 df
read cv05 df
read cv06 df
read cv09 df
read cv10 df
