In [13]:
import pandas as pd
from weblogo import *

### Configuration

In [2]:
sea = "sea"  # change to specific directory of MEME/bin/sea
df_name = "HeLa_Noc.csv"  # input, be careful for multiindex
motif_col = "Flanking_10"
cluster_col = "Cluster"
meme_fn = "HeLa.meme"
fasta_fn = "HeLa.fa"
sea_out = "sea_out"

### Run

In [3]:
df = pd.read_csv(df_name, index_col=[0,1,2], header=[0, 1]) # for dual header

In [4]:
df

Unnamed: 0.1,Unnamed: 0,HeLa (Noc),NSUN2 KO,NSUN6 KO,NSUN5 KO,siNop2,Flanking_10,X,Y,Cluster
0,10@100143502@-,False,False,False,True,False,ATTCCAAGTGCTCCATTGGAT,-0.373057,9.218974,Type II
1,10@101370486@-,False,False,True,False,False,GGCTCTCCCCCAGGATTTTGG,-3.283076,7.512163,Type I
2,10@101464351@+,False,True,False,False,False,ACGCCGAATTCACCACCGACA,-0.676019,9.043642,Type II
3,10@101610402@+,False,False,False,False,True,GGCTCTGCTTCGGAAATCCAA,-4.309327,8.119048,Type I
4,10@102046383@-,True,False,False,True,True,ACTGGCGACCCGGAGTGATGA,-4.697515,6.445199,Type I
...,...,...,...,...,...,...,...,...,...,...
8212,X@74282176@-,True,False,False,False,False,TCGTTAGATTCGATTACTGAA,-1.934125,2.511517,Type IV
8213,X@86958997@-,True,True,False,True,True,GTGTTATATCCTCCAGGAATA,-0.350233,8.720179,Type II
8214,X@9433409@+,True,False,False,False,False,GGCCGGGACTCGGAGGCTGCC,-4.914104,6.357834,Type I
8215,X@9685665@+,True,False,False,False,True,ACTGCAGAAGCCGTCGAAACC,-5.400805,7.249480,Type I


In [5]:
def extact_all_fasta(df_in, column, fn_out, rna=True):
    N = 0
    with open(fn_out, "w") as output:
        for idx, row in df_in.iterrows():
            if rna == True:
                output.write(">{}\n{}\n".format(N, row[column].replace("T", "U")))
            else:
                output.write(">{}\n{}\n".format(N, row[column]))
            N += 1        

In [6]:
extact_all_fasta(df, motif_col, fasta_fn)

In [7]:
def generate_meme_file(df_in, id_column, motif_column, fn_out, rna=True):
    all_ids = set(df_in[id_column].tolist())
    temp_mat_name = fn_out+".temp.mat"
    print(all_ids)
    with open(temp_mat_name, "w") as output:
        for ID in all_ids:
            print(ID)
            subdf = df_in[df_in[id_column]==ID]
            count_data = {}
            for _, row in subdf.iterrows():
                if "N" in row[motif_column]:
                    continue
                if rna == True:
                    iterseq = row[motif_column].replace("T", "U")
                else:
                    iterseq = row[motif_column]
                for idx, base in enumerate(list(iterseq)):
                    if idx not in count_data:
                        count_data[idx] = {"A":0, "C": 0, "G": 0, "U":0}
                    count_data[idx][base] += 1
            count_df = pd.DataFrame.from_dict(count_data).T
            seqs = count_df.values
            seqs = np.array(seqs)
            logodata = LogoData.from_counts(counts=seqs, alphabet='ACGU')

            temp = []
            for i in range(logodata.counts.shape[0]):
                # temp.extend(list(logodata.entropy[i]*logodata.counts[i]/logodata.counts[i].sum()))
                output.write("{}\t{}\t{}\t{}\n".format(logodata.counts[i][0], logodata.counts[i][1], logodata.counts[i][2], logodata.counts[i][3]))
            output.write("\n")
        
    !matrix2meme -rna < $temp_mat_name > $fn_out

In [8]:
generate_meme_file(df, cluster_col, motif_col, meme_fn)

{'Type IV', 'Type II', 'Type III', 'Type I'}
Type IV
Type II
Type III
Type I


In [9]:
!$sea -oc $sea_out -p $fasta_fn -m $meme_fn

# Checking alphabets in 1 motif files.
# Loading motifs from file 'HeLa.meme'
# Alphabet: RNA
# NOTE: Will convert any DNA sequences to RNA.
# Positive sequences "HeLa.fa" - training: 7396 hold-out: 821
# Negative sequences are shuffled primary sequences (2-order) - training: 7396 hold-out: 821
# Estimating background model from control sequences.
# Background: A 0.18 C 0.319 G 0.332 U 0.17
# Background order: 2 Background size: 84
# Using Fisher Exact test for p-values.
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 4. Using pi_zero = 1.0.
# Freeing storage...


### It is better to read the html.

In [10]:
df_sea = pd.read_csv("./{}/sea.tsv".format(sea_out), header=0, sep="\t")

In [11]:
df_sea

Unnamed: 0,RANK,DB,ID,ALT_ID,CONSENSUS,TP,TP%,FP,FP%,ENR_RATIO,SCORE_THR,PVALUE,LOG_PVALUE,EVALUE,LOG_EVALUE,QVALUE,LOG_QVALUE
0,1,HeLa.meme,4.0,NNNNNNNNNNCVGGVNNNNNN,SSSSSNNNHBCRGGGSNNNSN,6051.0,81.81,1487.0,20.11,4.07,0.0044,0.0,-3035.19,0.0,-3033.81,0.0,-3033.81
1,2,HeLa.meme,1.0,NNNNNNNNUUCGANGUBNNNN,NNNNNNNCUUCGADGUBNNNN,1225.0,16.56,243.0,3.29,5.02,0.53,9.88e-174,-398.36,3.9500000000000004e-173,-396.97,1.98e-173,-397.67
2,3,HeLa.meme,2.0,NNNNBNNNNHCUCCANNNNNN,NVBBSVVNNYCUCCANSSBSN,235.0,3.18,35.0,0.47,6.56,5.1,1.8999999999999997e-38,-86.86,7.59e-38,-85.47,2.5299999999999997e-38,-86.57
3,4,HeLa.meme,3.0,NNNNNKUDGCCAMMUGNNNNN,NNNNDGUKGCCAMAUGNNNNN,308.0,4.16,72.0,0.97,4.23,1.4,5.2399999999999996e-37,-83.54,2.09e-36,-82.15,5.2399999999999996e-37,-83.54
4,# SEA (Simple Enrichment Analysis): Version 5....,,,,,,,,,,,,,,,,
5,# The format of this file is described at http...,,,,,,,,,,,,,,,,
6,# sea -oc sea_out -p HeLa.fa -m HeLa.meme,,,,,,,,,,,,,,,,
