In [1]:
import pandas as pd
from weblogo import *

### Configuration

In [2]:
sea = "sea"  # change to specific directory of MEME/bin/sea
df_name = "nanopore.csv"  # input, be careful for multiindex
motif_col = "Flanking_10"
cluster_col = "Cluster"
meme_fn = "nanopore.meme"
fasta_fn = "nanopore.fa"
sea_out = "sea_out"

### Run

In [3]:
df = pd.read_csv(df_name, index_col=None, header=[0])
# df = pd.read_csv(df_name, index_col=[0,1,2], header=[0, 1]) # for dual header

In [4]:
df

Unnamed: 0,Sites (hg38),Flanking_10,Center base,X,Y,Cluster,Phase shift
0,6@ENSG00000124785@5998888,TTCTTTGGGGACGTTGTGATT,A,-9.886736,9.979774,2,-1
1,X@ENSG00000123562@103676621,CTTGTTGAGGACTGGGACTTA,A,-10.839322,8.247921,2,0
2,19@ENSG00000123144@12730812,ATTCTCCAGGACTCTTTTTTT,A,-8.630402,9.256767,2,0
3,15@ENSG00000117899@80979409,GAGATCAAGGACTTTTTGGTC,A,-9.439362,5.915771,2,0
4,15@ENSG00000167004@43771249,TTGTAAAAGGACTCTTCCATC,A,-10.553759,8.413573,2,0
...,...,...,...,...,...,...,...
3545,X@ENSG00000262919@153588256,AGCTGGGAGGACTGGTTGTGC,A,-9.962901,6.885458,2,-1
3546,17@ENSG00000170689@48623014,CTCACCAGGGACCGTAGGCAC,A,-9.299487,8.017858,2,-1
3547,20@ENSG00000171222@35953701,CTCTGTGCGGACTGGGGCCAT,A,-10.861468,6.478575,2,-1
3548,17@ENSG00000127774@3669524,GATCGGGAGGACTGTGGCCAG,A,-9.411388,6.425912,2,-1


In [5]:
def extact_all_fasta(df_in, column, fn_out, rna=True):
    N = 0
    with open(fn_out, "w") as output:
        for idx, row in df_in.iterrows():
            if rna == True:
                output.write(">{}\n{}\n".format(N, row[column].replace("T", "U")))
            else:
                output.write(">{}\n{}\n".format(N, row[column]))
            N += 1        

In [6]:
extact_all_fasta(df, motif_col, fasta_fn)

In [7]:
def generate_meme_file(df_in, id_column, motif_column, fn_out, rna=True):
    all_ids = set(df_in[id_column].tolist())
    temp_mat_name = fn_out+".temp.mat"
    print(all_ids)
    with open(temp_mat_name, "w") as output:
        for ID in all_ids:
            print(ID)
            subdf = df_in[df_in[id_column]==ID]
            count_data = {}
            for _, row in subdf.iterrows():
                if "N" in row[motif_column]:
                    continue
                if rna == True:
                    iterseq = row[motif_column].replace("T", "U")
                else:
                    iterseq = row[motif_column]
                for idx, base in enumerate(list(iterseq)):
                    if idx not in count_data:
                        count_data[idx] = {"A":0, "C": 0, "G": 0, "U":0}
                    count_data[idx][base] += 1
            count_df = pd.DataFrame.from_dict(count_data).T
            seqs = count_df.values
            seqs = np.array(seqs)
            logodata = LogoData.from_counts(counts=seqs, alphabet='ACGU')

            temp = []
            for i in range(logodata.counts.shape[0]):
                # temp.extend(list(logodata.entropy[i]*logodata.counts[i]/logodata.counts[i].sum()))
                output.write("{}\t{}\t{}\t{}\n".format(logodata.counts[i][0], logodata.counts[i][1], logodata.counts[i][2], logodata.counts[i][3]))
            output.write("\n")
        
    !matrix2meme -rna < $temp_mat_name > $fn_out

In [8]:
generate_meme_file(df, cluster_col, motif_col, meme_fn)

{1, 2}
1
2


In [9]:
!$sea -oc $sea_out -p $fasta_fn -m $meme_fn

# Checking alphabets in 1 motif files.
# Loading motifs from file 'nanopore.meme'
# Alphabet: RNA
# NOTE: Will convert any DNA sequences to RNA.
# Positive sequences "nanopore.fa" - training: 3195 hold-out: 355
# Negative sequences are shuffled primary sequences (2-order) - training: 3195 hold-out: 355
# Estimating background model from control sequences.
# Background: A 0.257 C 0.249 G 0.274 U 0.22
# Background order: 2 Background size: 84
# Using Fisher Exact test for p-values.
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 2. Using pi_zero = 1.0.
# Freeing storage...


### It is better to read the html.

In [10]:
df_sea = pd.read_csv("./{}/sea.tsv".format(sea_out), header=0, sep="\t")

In [11]:
df_sea

Unnamed: 0,RANK,DB,ID,ALT_ID,CONSENSUS,TP,TP%,FP,FP%,ENR_RATIO,SCORE_THR,PVALUE,LOG_PVALUE,EVALUE,LOG_EVALUE,QVALUE,LOG_QVALUE
0,1,nanopore.meme,2.0,NNNNNNNNRGACHNNNNNNNN,NNNNNNNDGGACWNNNNNNNN,2484.0,77.75,493.0,15.43,5.03,0.24,0.0,-1349.84,0.0,-1349.15,0.0,-1349.15
1,2,nanopore.meme,1.0,NNNNNNNNNCARNNNNNNNNN,NNBNNNNMNCAGDNNNHNNNN,705.0,22.07,368.0,11.52,1.91,0.15,4.98e-30,-67.47,9.95e-30,-66.78,4.98e-30,-67.47
2,# SEA (Simple Enrichment Analysis): Version 5....,,,,,,,,,,,,,,,,
3,# The format of this file is described at http...,,,,,,,,,,,,,,,,
4,# sea -oc sea_out -p nanopore.fa -m nanopore.meme,,,,,,,,,,,,,,,,
