In [1]:
import pandas as pd
from weblogo import *

### Configuration

In [2]:
sea = "sea"  # change to specific directory of MEME/bin/sea
df_name = "variant_G.csv"  # input, be careful for multiindex
motif_col = "motif_F10"
cluster_col = "Cluster"
meme_fn = "variant.meme"
fasta_fn = "variant.fa"
sea_out = "sea_out"

### Run

In [3]:
df = pd.read_csv(df_name, index_col=None, header=[0])
# df = pd.read_csv(df_name, index_col=[0,1,2], header=[0, 1]) # for dual header

In [4]:
df

Unnamed: 0.1,Unnamed: 0,Tan et al,motif_F10,base,num_of_sample,known_mod,X,Y,Cluster
0,chr14@102228202@+,True,CGGCAGGGGCGGCGGCGGCGG,G,428,,3.113396,5.159237,3
1,chr17@8129600@-,True,TGGTTAAAGCGCCTGTCTAGT,G,11,m22G,0.459602,7.089931,13
2,chr12@56991420@-,True,TGTGTGTGTCGGCCTCAGAGC,G,259,,-0.983974,7.385086,1
3,chr17@8129984@-,True,TGGTTAAGGCGATGGACTAGA,G,168,m22G,0.414791,7.292553,13
4,chrX@53589096@-,True,AGGAGGAGGAGGAGGAAGATG,G,221,,-1.303561,7.057323,1
...,...,...,...,...,...,...,...,...,...
7220,chr12@22213806@+,True,GCTGAACATAGTGTGGATATA,G,6,,3.221249,7.254860,6
7221,chr19@36128080@+,True,GATCTTCTGTGGGGATCTGGG,G,6,,2.868535,7.738324,6
7222,chr4@99983466@+,True,ATAGTGATGAGAAAGGAATTT,G,6,,0.464026,10.502323,4
7223,chr7@135613051@-,True,GTGTGTGGGTGTGTGTGTGTA,G,6,,3.890571,7.946128,2


In [5]:
def extact_all_fasta(df_in, column, fn_out, rna=True):
    N = 0
    with open(fn_out, "w") as output:
        for idx, row in df_in.iterrows():
            if rna == True:
                output.write(">{}\n{}\n".format(N, row[column].replace("T", "U")))
            else:
                output.write(">{}\n{}\n".format(N, row[column]))
            N += 1        

In [6]:
extact_all_fasta(df, motif_col, fasta_fn)

In [7]:
def generate_meme_file(df_in, id_column, motif_column, fn_out, rna=True):
    all_ids = set(df_in[id_column].tolist())
    temp_mat_name = fn_out+".temp.mat"
    print(all_ids)
    with open(temp_mat_name, "w") as output:
        for ID in all_ids:
            print(ID)
            subdf = df_in[df_in[id_column]==ID]
            count_data = {}
            for _, row in subdf.iterrows():
                if "N" in row[motif_column]:
                    continue
                if rna == True:
                    iterseq = row[motif_column].replace("T", "U")
                else:
                    iterseq = row[motif_column]
                for idx, base in enumerate(list(iterseq)):
                    if idx not in count_data:
                        count_data[idx] = {"A":0, "C": 0, "G": 0, "U":0}
                    count_data[idx][base] += 1
            count_df = pd.DataFrame.from_dict(count_data).T
            seqs = count_df.values
            seqs = np.array(seqs)
            logodata = LogoData.from_counts(counts=seqs, alphabet='ACGU')

            temp = []
            for i in range(logodata.counts.shape[0]):
                # temp.extend(list(logodata.entropy[i]*logodata.counts[i]/logodata.counts[i].sum()))
                output.write("{}\t{}\t{}\t{}\n".format(logodata.counts[i][0], logodata.counts[i][1], logodata.counts[i][2], logodata.counts[i][3]))
            output.write("\n")
        
    !matrix2meme -rna < $temp_mat_name > $fn_out

In [8]:
generate_meme_file(df, cluster_col, motif_col, meme_fn)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [9]:
!$sea -oc $sea_out -p $fasta_fn -m $meme_fn

# Checking alphabets in 1 motif files.
# Loading motifs from file 'variant.meme'
# Alphabet: RNA
# NOTE: Will convert any DNA sequences to RNA.
# Positive sequences "variant.fa" - training: 6503 hold-out: 722
# Negative sequences are shuffled primary sequences (2-order) - training: 6503 hold-out: 722
# Estimating background model from control sequences.
# Background: A 0.245 C 0.236 G 0.281 U 0.239
# Background order: 2 Background size: 84
# Using Fisher Exact test for p-values.
# Computing q-values.
#   Cannot estimate pi_0 accurately from fewer than 100 p-values.
#   Total p-values = 25. Using pi_zero = 1.0.
# Freeing storage...


### It is better to read the html.

In [10]:
df_sea = pd.read_csv("./{}/sea.tsv".format(sea_out), header=0, sep="\t")

In [11]:
df_sea

Unnamed: 0,RANK,DB,ID,ALT_ID,CONSENSUS,TP,TP%,FP,FP%,ENR_RATIO,SCORE_THR,PVALUE,LOG_PVALUE,EVALUE,LOG_EVALUE,QVALUE,LOG_QVALUE
0,1,variant.meme,20.0,NNNNNNNNBUGNNNSHNNNRN,NDNNHBBUBUGNBNCHNNNRN,2631.0,40.46,774.0,11.9,3.4,0.016,3.34e-313,-719.5,8.34e-312,-716.29,8.34e-312,-716.29
1,2,variant.meme,21.0,NNNHNNDNYUGNNNNNNNVDN,NVVHNVUBYUGNKCYNNNVDN,2422.0,37.24,716.0,11.01,3.38,0.0038,2.03e-279,-641.71,5.0699999999999997e-278,-638.49,2.54e-278,-639.19
2,3,variant.meme,19.0,NNNNNNDBBUGNNNCNNNNNN,NNBNNNWSYUGHNCCUBVNNV,2400.0,36.91,704.0,10.83,3.41,0.02,2.1799999999999998e-278,-639.34,5.450000000000001e-277,-636.12,1.82e-277,-637.22
3,4,variant.meme,12.0,NNNDDNNNBUGNNNNNNNDNN,NNNDDNHBUUGHWGVDNARDR,2468.0,37.95,758.0,11.66,3.25,0.073,7.33e-275,-631.22,1.83e-273,-628.0,4.58e-274,-629.39
4,5,variant.meme,11.0,NNNDNNNNNKGVDVNNNDDNN,NVADDNNNUUGMWGNDNADNN,2601.0,40.0,852.0,13.1,3.05,0.0026,7.54e-274,-628.89,1.8800000000000002e-272,-625.67,3.7699999999999995e-273,-627.28
5,6,variant.meme,14.0,BNNNNNNNBYGNNNSNHNBNN,SNNVNSCUSUGYCNSYHBBNS,2414.0,37.12,742.0,11.41,3.25,0.03,8.44e-267,-612.66,2.1100000000000002e-265,-609.44,3.52e-266,-611.23
6,7,variant.meme,17.0,NNNNNNNNBUGNNNNNNNVNN,NNNNNWYHBUGBUKSWARRRW,2259.0,34.74,672.0,10.33,3.36,0.048,8.249999999999999e-254,-582.75,2.0599999999999999e-252,-579.53,2.95e-253,-581.47
7,8,variant.meme,13.0,NVNNNNNNBKGVNVVNSNNNN,DSNNNVDSBKGMWGSWSVRRV,2490.0,38.29,841.0,12.93,2.96,0.018,5.46e-249,-571.65,1.3700000000000001e-247,-568.43,1.7100000000000002e-248,-570.51
8,9,variant.meme,5.0,NNNNNHDNNKGDRWNNHNNNN,DNNUNHDNDUGAAWSHADNWN,2344.0,36.04,748.0,11.5,3.13,0.037,3.4999999999999997e-246,-565.18,8.74e-245,-561.97,9.71e-246,-564.16
9,10,variant.meme,15.0,NNNNNNNNBUGDDNNNNNNNN,HNHNHAWNUUGDDDDNWWDND,2337.0,35.94,749.0,11.52,3.12,0.0018,5.24e-244,-560.17,1.3099999999999999e-242,-556.96,1.31e-243,-559.26
