# 7-mer Word Chains
This notebook builds a graph in which each node is a 7-mer sequence and each edge connects two sequences that differ by a single-character change (e.g., AAA -> AAC, shown with 3-mers for brevity).

First, we generate every 7-mer sequence over the letters A, G, T and C. The sequences are written, one per line, to the `7-mer dict.txt` file.

In [108]:
# Standard library
import itertools

# Third-party
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pydot
from networkx.drawing.nx_pydot import from_pydot

# Base name (without the ".txt" extension) of the word-list file that make_dict reads.
WORD_LIST = '7-mer dict'
# Number of characters per word; make_dict ignores lines of any other length.
WORD_LEN = 7
# Upper bound on the number of search levels get_edges expands.
MAX_ITERS = 100

In [3]:
def seq_gen(word_len=7, out_path="7-mer dict.txt"):
    """Write every fixed-length word over the letters A, G, T, C to a text file.

    Words are produced in itertools.product order over 'AGTC' and written one
    per line. Called without arguments it writes all 7-character words to
    "7-mer dict.txt", exactly as before the parameters were added.

    Args:
        word_len: Number of characters in each generated word.
        out_path: Path of the file to create or overwrite.

    Returns:
        None. The number of words and a completion message are printed.
    """
    # Generate all word_len-character combinations of A, G, T, C
    combinations = [''.join(p) for p in itertools.product('AGTC', repeat=word_len)]
    print(f"Total number of combinations: {len(combinations)}")
    # Write to file, one word per line
    with open(out_path, 'w') as file:
        file.writelines(word + "\n" for word in combinations)
    print("done")
   
        

In [31]:
# Write every 7-mer to "7-mer dict.txt", the file that make_dict reads.
seq_gen()

Total number of combinations: 16384
done


## Creating 7-mer Network
This code creates a graph whose nodes are 7-mers and whose edges link two 7-mers that differ in a single character (for example, 'AAAAAAA'---'AAAAAAT').

In [17]:
# Convert a string to a number
def make_number(word):
    """Encode a string as an integer, one 256-wide slot per character.

    The character at position i contributes (ord(char) - ord('A')) * 256**i,
    so the first character occupies the lowest slot. An empty string encodes
    to 0.
    """
    base = ord('A')
    return sum((ord(ch) - base) * 256 ** pos for pos, ch in enumerate(word))

# Convert a number to a string
def make_word(number, length=None):
    """Decode an integer produced by make_number back into a string.

    Each character is read from one 8-bit slot of the number, lowest slot
    first, and shifted back up by ord('A').

    Args:
        number: Encoded word.
        length: Number of characters to decode. When omitted, the
            module-level WORD_LEN is used, which is the original behaviour.

    Returns:
        The decoded string of exactly `length` characters.
    """
    if length is None:
        length = WORD_LEN
    base = ord('A')
    # Build all characters first and join once instead of growing a string.
    return ''.join(
        chr(((number >> (8 * pos)) & 0xFF) + base) for pos in range(length)
    )

#makes dictionary of connections
def make_dict():
    """Load the word list, link words that differ in one slot, write graph.dot.

    Reads WORD_LIST + '.txt', keeps the stripped, upper-cased lines that are
    exactly WORD_LEN characters long and encodes each with make_number. Two
    words are linked when the XOR of their codes is in the lookup table, i.e.
    the codes differ in at most one 8-bit character slot and the XOR of that
    slot is below 32. Nodes and links are written to "graph.dot" as an
    undirected graph named "words".

    Returns:
        (all_words, pair_lut): the encoded words in file order, and the set
        of XOR masks used as the adjacency test.
    """
    # Create a lookup table for 1 letter diffs (fastest)
    print('Creating Diff Lookup Table...')
    pair_lut = set()
    for i in range(WORD_LEN):
        for j in range(32):
            # 5-bit difference j placed in character slot i (j == 0 gives mask 0).
            pair_lut.add(j << (i * 8))
    print('Loading Dictionary...')
    all_words = []
    with open(WORD_LIST + '.txt', 'r') as fin:
        for word in fin:
            word = word.strip().upper()
            if len(word) == WORD_LEN:
                all_words.append(make_number(word))
    print('Loaded ' + str(len(all_words)) + ' words.')
    print('Finding All Connections...')
    # Positions at which each code occurs. A list per code keeps duplicate
    # lines of the word list paired exactly as a pairwise scan would pair them.
    positions = {}
    for idx, code in enumerate(all_words):
        positions.setdefault(code, []).append(idx)
    all_pairs = []
    for i, w1 in enumerate(all_words):
        word1 = make_word(w1)
        # w2 is linked to w1 iff (w1 ^ w2) is a mask, i.e. w2 == w1 ^ mask.
        # Probing every mask replaces a scan over all earlier words; sorting
        # the hits keeps the edges in the order that scan would produce.
        earlier = sorted(
            j
            for mask in pair_lut
            for j in positions.get(w1 ^ mask, ())
            if j < i
        )
        for j in earlier:
            all_pairs.append((word1, make_word(all_words[j])))
    print("Found " + str(len(all_pairs)) + " connections.")
    print("Writing file...")
    with open("graph.dot", 'w') as fout:
        fout.write('graph words {\n')
        for w in all_words:
            fout.write('  "' + make_word(w) + '";\n')
        for w1, w2 in all_pairs:
            fout.write('  "' + w1 + '" -- "' + w2 + '";\n')
        fout.write('}\n')
    print("done")
    return all_words, pair_lut
    
# Returns the number of edges on the shortest path from one node to another
def get_edges(node1, node2, all_words, pair_lut):
    """Return the number of edges on a shortest path between two words.

    The search starts at node2 and expands one level at a time, for at most
    MAX_ITERS levels, until node1 is reached. Neighbours of a word are found
    by XOR-ing its code with every mask in pair_lut and keeping the results
    that are dictionary words.

    Args:
        node1: Source word. Any non-empty word, including the one that
            encodes to 0 (all 'A's), is looked up in the dictionary like
            every other word. An empty string selects the legacy "no source"
            mode described under Returns.
        node2: Target word.
        all_words: Encoded dictionary words, as returned by make_dict.
        pair_lut: Iterable collection of XOR masks; two codes are adjacent
            when their XOR equals one of the masks.

    Returns:
        The path length in edges (0 when node1 and node2 are the same word),
        or -1 when a word is missing from all_words or node1 is not reached
        within MAX_ITERS levels. With an empty node1, everything reachable
        from node2 is explored and the distance of the first entry of
        all_words with a positive distance is returned (None if there is
        none).
    """
    from_word = make_number(node1)
    to_word = make_number(node2)
    # Decide "no source" from the argument itself: an all-'A' word also
    # encodes to 0, so the code cannot serve as the sentinel.
    has_source = len(node1) > 0
    # -1 marks "not reached yet"; the dict doubles as a fast membership test.
    dist = {word: -1 for word in all_words}
    if has_source and from_word not in dist:
        print("No connections to " + make_word(from_word))
        return -1
    if to_word not in dist:
        print("No connections to " + make_word(to_word))
        return -1
    if has_source and from_word == to_word:
        return 0
    dist[to_word] = 0
    frontier = [to_word]
    for depth in range(MAX_ITERS):
        next_frontier = []
        for w1 in frontier:
            for mask in pair_lut:
                w2 = w1 ^ mask
                # Skip codes that are not dictionary words (default 0) and
                # words that already have a distance.
                if dist.get(w2, 0) != -1:
                    continue
                dist[w2] = depth + 1
                if has_source and w2 == from_word:
                    return depth + 1
                next_frontier.append(w2)
        if not next_frontier:
            break
        frontier = next_frontier
    if has_source:
        print('Can not connect!')
        return -1
    for word in all_words:
        if dist[word] > 0:
            return dist[word]
            
def get_k_nearest(node, k, graph):
    """Return the nodes reached from `node` after k rounds of neighbour expansion.

    The pydot graph is converted to networkx first. Each round replaces the
    current node list with the de-duplicated union of its members'
    neighbours, so with k == 0 the result is just [node]. The order of the
    returned list is whatever set iteration yields.
    """
    nx_graph = from_pydot(graph)
    frontier = [node]
    for _ in range(k):
        reached = []
        for member in frontier:
            reached.extend(nx_graph.neighbors(member))
        frontier = list(set(reached))
    return frontier

In [5]:
# Build the word graph; make_dict also writes it to "graph.dot".
all_words, pair_lut = make_dict()
# Parse the file back with pydot; the tuple pattern unpacks the single graph it holds.
(graph,) = pydot.graph_from_dot_file("graph.dot")

Creating Diff Lookup Table...
Loading Dictionary...
Loaded 16384 words.
Finding All Connections...
Found 172032 connections.
Writing file...
done


# Utilizing Data
We now combine the network data with mean RNA-binding protein (RBP) values. Here we calculate the mean RBP for a set of 7-mers that is selected using the network data.

In [196]:
def meanRBP_values(motif_set, data_df):
    """Add up the 'Mean_RBP' values of the given motifs.

    For each motif, the first row of data_df whose 'Motif' column equals it
    supplies the value. An empty motif_set gives 0; a motif with no matching
    row raises IndexError.
    """
    def first_value(motif):
        matching = data_df.loc[data_df['Motif'] == motif, 'Mean_RBP']
        return matching.values[0]

    return sum(first_value(motif) for motif in motif_set)

def get_density(motif_set, graph):
    """Return the networkx density of the part of `graph` spanned by motif_set.

    A pydot subgraph receives one node per motif and every edge of `graph`
    whose two endpoints (with double quotes removed) are both in motif_set.
    The subgraph is converted to networkx and its density is returned.
    """
    induced = pydot.Subgraph()
    for motif in motif_set:
        induced.add_node(pydot.Node(motif))
    for link in graph.get_edges():
        # Endpoint names may carry double quotes; remove them before the lookup.
        endpoints = (
            link.get_source().replace('"', ''),
            link.get_destination().replace('"', ''),
        )
        if all(name in motif_set for name in endpoints):
            induced.add_edge(pydot.Edge(endpoints[0], endpoints[1]))
    return nx.density(nx.nx_pydot.from_pydot(induced))

def get_connectivity(motif_set):
    """Return len(motif_set) divided by one plus the summed pairwise distances.

    Every ordered pair of unequal motifs is measured with get_edges, using
    the module-level all_words and pair_lut. A distance of -1 is reported
    with a printed message and is still added to the total.
    """
    pair_total = 1
    for origin in motif_set:
        for target in motif_set:
            if origin == target:
                continue
            hops = get_edges(origin, target, all_words, pair_lut)
            if hops == -1:
                print("ERROR: Distance = -1")
            pair_total = pair_total + hops
    return len(motif_set) / pair_total

def prep_c_rbp(in_file):
    """Create the working CSV that calc_c_rbp later fills in.

    Reads "data/<in_file>.csv", keeps the "Motif" and "Mean_RBP" columns,
    sorts the rows by Mean_RBP in descending order, adds the empty columns
    "Connectivity * Mean_RBP" and "Connectivity", renumbers the rows and
    writes the result (index included) to "data/<in_file>_Connectivity.csv".
    """
    source_path = "data/" + in_file + ".csv"
    target_path = "data/" + in_file + "_Connectivity.csv"

    ranked = pd.read_csv(source_path)[["Motif", "Mean_RBP"]]
    ranked = ranked.sort_values(by=['Mean_RBP'], ascending=False)
    row_count = len(ranked)
    # Both inserts target position 2, so the column added second lands first.
    ranked.insert(2, "Connectivity", [None] * row_count, True)
    ranked.insert(2, "Connectivity * Mean_RBP", [None] * row_count, True)
    ranked.reset_index(drop=True).to_csv(target_path)

def calc_c_rbp(in_file):
    """Fill in the connectivity columns of the working CSV, one row at a time.

    Reads "data/<in_file>_Connectivity.csv". For every row whose
    "Connectivity * Mean_RBP" cell is empty, in file order, the rows from the
    top of the table down to and including that row are taken; their motifs
    are passed to get_connectivity and their Mean_RBP values are summed. The
    connectivity and its product with the sum are stored in the row. Rows
    that already hold a value are left untouched.

    The file is rewritten after every row so an interrupted run can be
    resumed by calling the function again. It is written without the pandas
    index so repeated runs do not keep adding "Unnamed: 0" columns.

    Args:
        in_file: Base name of the data set (no directory, no extension).
    """
    path = "data/" + in_file + "_Connectivity.csv"
    target_col = "Connectivity * Mean_RBP"
    df = pd.read_csv(path)
    # Decide the work list once, from the target column only: NaN in another
    # column, or a NaN result, can then neither crash nor stall the loop.
    pending = df.index[df[target_col].isna()]
    for row in pending:
        print(row)
        # read_csv produced a 0..n-1 index, so the label is also the position.
        # The same rows (0..row inclusive) feed both the sum and the motif set.
        rbp_sum = sum(df['Mean_RBP'].iloc[:row + 1])
        motif_set = df['Motif'].iloc[:row + 1].tolist()
        connectivity = get_connectivity(motif_set)
        df.at[row, 'Connectivity'] = connectivity
        df.at[row, target_col] = connectivity * rbp_sum
        # Checkpoint after every row; each row can take a long time.
        df.to_csv(path, index=False)
    print("done")

In [198]:
# One-time setup: prep_c_rbp creates data/ZF_7N_PCBP2_vs_Adar-1_Connectivity.csv.
#prep_c_rbp("ZF_7N_PCBP2_vs_Adar-1")
# Fill in the empty rows of that file; progress is saved after every row.
calc_c_rbp("ZF_7N_PCBP2_vs_Adar-1")

13


KeyboardInterrupt: 