In [3]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

%matplotlib inline
# Notebook-wide matplotlib defaults: figure size (width, height) and resolution.
plt.rcParams['figure.figsize'] = (14,8)
plt.rcParams['figure.dpi'] = 150
# Apply seaborn's default theme, then switch to the larger "talk" plotting context.
# NOTE(review): depending on the seaborn version, sns.set() may overwrite some of the
# rcParams assigned above -- confirm the figure size still applies after this call.
sns.set()
sns.set_context("talk")

In [19]:
# functions for data processing

def extract_quadrumers(aptamer_sequence, k=4):
    """Return every overlapping k-mer (quadrumer by default) of a sequence.

    Parameters
    ----------
    aptamer_sequence : str
        The aptamer sequence to slice, e.g. a trimmed 18-mer.
    k : int, default 4
        Window length; the default of 4 yields quadrumers.

    Returns
    -------
    numpy.ndarray
        1-D array of strings whose element ``j`` is
        ``aptamer_sequence[j:j + k]``. An 18-mer yields 15 quadrumers; a
        sequence shorter than ``k`` yields an empty array.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Derive the number of windows from the sequence itself instead of
    # hard-coding 15, so short inputs never emit truncated or empty fragments.
    n_windows = max(len(aptamer_sequence) - k + 1, 0)
    windows = [aptamer_sequence[start:start + k] for start in range(n_windows)]

    # Build the array once (no repeated np.append copies); the explicit str
    # dtype keeps an empty result a string array as well.
    return np.array(windows, dtype=str)

def quadrumer_with_position(aptamer_sequence):
    """Tabulate the quadrumers of one aptamer sequence with their positions.

    Parameters
    ----------
    aptamer_sequence : str
        The aptamer sequence, passed straight to ``extract_quadrumers``.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Position' (1-based index of the quadrumer within the
        sequence) and 'Quadrumer' (the quadrumer found at that position).
    """
    quadrumers = extract_quadrumers(aptamer_sequence)
    # Number the positions from what was actually extracted rather than a
    # hard-coded 1..15, so the two columns can never disagree in length.
    positions = np.arange(1, len(quadrumers) + 1)
    return pd.DataFrame({'Position': positions, 'Quadrumer': quadrumers})

def full_table_weighted(tbl, sequence_col='Trimed'):
    """Expand a round table into one RPM-weighted row per (sequence, position).

    Parameters
    ----------
    tbl : pandas.DataFrame
        A round table such as RnE or RnC. Must contain an 'RPM' column and the
        column named by ``sequence_col``.
    sequence_col : str, default 'Trimed'
        Name of the column holding the trimmed sequence to split into
        quadrumers. Tables without a 'Trimed' column can pass another name.

    Returns
    -------
    pandas.DataFrame
        Three columns: 'Position' (int64), 'Quadrumer' and 'Weighted Count'.
        Every quadrumer of a sequence carries that sequence's weight,
        ``RPM / sum(RPM) * 1000``. Rows follow the order of ``tbl`` and the
        index is a fresh 0..n-1 range.
    """
    output_columns = ['Position', 'Quadrumer', 'Weighted Count']

    # Normalise RPM so the weights of all sequences in the table sum to 1000.
    rpm_total = tbl['RPM'].sum()

    per_sequence = []
    # Iterate over the values, not over integer labels: tbl[col][i] looks rows
    # up by index label and breaks on any table whose index is not 0..n-1
    # (for example after filtering or sorting).
    for sequence, rpm in zip(tbl[sequence_col], tbl['RPM']):
        quads = quadrumer_with_position(sequence)
        # Each position holds exactly one quadrumer, so its count within a
        # single sequence is always 1 and the weighted count is the weight.
        quads['Weighted Count'] = rpm / rpm_total * 1000
        per_sequence.append(quads[output_columns])

    if not per_sequence:
        # Empty input: return an empty table that still has the typed columns.
        return pd.DataFrame({
            'Position': pd.Series(dtype='int64'),
            'Quadrumer': pd.Series(dtype='object'),
            'Weighted Count': pd.Series(dtype='float64'),
        })

    # Concatenate once at the end instead of growing arrays inside the loop.
    return pd.concat(per_sequence, ignore_index=True).astype({'Position': 'int64'})

def weighted_freq_only(full_weighted_tbl):
    """Collapse a weighted per-position table into one total per quadrumer.

    Takes the output of ``full_table_weighted`` and returns a table indexed by
    'Quadrumer' with a single column, 'Weighted frequency', holding the sum of
    'Weighted Count' over all rows of that quadrumer.
    """
    totals = full_weighted_tbl.groupby('Quadrumer')[['Weighted Count']].sum()
    return totals.rename(columns={'Weighted Count': 'Weighted frequency'})

def wise_append(original_list, temp_list):
    """Append the first element of ``temp_list`` to ``original_list``, or 0.

    Parameters
    ----------
    original_list : array-like
        Values collected so far.
    temp_list : array-like or pandas.Series
        Candidate values; only the first one (by position) is used.

    Returns
    -------
    numpy.ndarray
        A new array: ``original_list`` followed by the first element of
        ``temp_list``, or by 0 when ``temp_list`` is empty.
    """
    # Go through a plain array so "first" always means first by position.
    # Indexing a pandas Series with [0] is a label lookup, which fails (or
    # relies on a deprecated fallback) when the index is not 0..n-1.
    values = np.asarray(temp_list).ravel()
    first = values[0] if values.size else 0
    return np.append(original_list, first)
    

def _frequency_for_quads(freq_tbl, quads):
    """Return the 'Weighted frequency' of each quadrumer in ``quads`` (0 if absent)."""
    # Accept the quadrumer either as a regular column or as the index.
    if 'Quadrumer' in freq_tbl.columns:
        freq = freq_tbl.set_index('Quadrumer')['Weighted frequency']
    else:
        freq = freq_tbl['Weighted frequency']
    # If a quadrumer is listed more than once, keep its first entry.
    freq = freq[~freq.index.duplicated(keep='first')]
    # One vectorised lookup for all quadrumers; missing ones are filled with 0.
    return freq.reindex(quads, fill_value=0).to_numpy(dtype=float)


def _expand_rounds(df, suffix, middle_round_tables):
    """Build the R2..R6 table for one arm ('E' or 'C') from ``df`` and the R3-R5 tables."""
    columns = {'R2' + suffix: df['R2{} count'.format(suffix)]}
    for round_number, freq_tbl in zip((3, 4, 5), middle_round_tables):
        columns['R{}{}'.format(round_number, suffix)] = _frequency_for_quads(freq_tbl, df.index)
    columns['R6' + suffix] = df['R6{} count'.format(suffix)]
    return pd.DataFrame(columns)


def expand_df(df):
    """Expand an R2E/R6E table to rounds 2-6 using the R3E-R5E frequency tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by quadrumer, with an 'R2E count' and an 'R6E count' column.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns 'R2E', 'R3E', 'R4E', 'R5E', 'R6E'.
        R3E-R5E hold the 'Weighted frequency' of each quadrumer looked up in
        the notebook-level tables ``R3E_frequency``, ``R4E_frequency`` and
        ``R5E_frequency``; a quadrumer missing from a table gets 0.
    """
    return _expand_rounds(df, 'E', (R3E_frequency, R4E_frequency, R5E_frequency))


def expand_df_ctrl(df):
    """Expand an R2C/R6C table to rounds 2-6 using the R3C-R5C frequency tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by quadrumer, with an 'R2C count' and an 'R6C count' column.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df`` with columns 'R2C', 'R3C', 'R4C', 'R5C', 'R6C'.
        R3C-R5C hold the 'Weighted frequency' of each quadrumer looked up in
        the notebook-level tables ``R3C_frequency``, ``R4C_frequency`` and
        ``R5C_frequency``; a quadrumer missing from a table gets 0.
    """
    return _expand_rounds(df, 'C', (R3C_frequency, R4C_frequency, R5C_frequency))

**Import raw data**

In [5]:
# Base URL of the GitHub repository's raw files; every read_csv call in this
# notebook appends a folder/file path to it.
repo_url = 'https://raw.githubusercontent.com/Xiaoqi-Sun/aptamer_scoring/main/'

Serotonin

In [8]:
# Load the raw serotonin tables for rounds 2-6 into the notebook variables
# R2E..R6E and R2C..R6C (presumably E = experimental and C = control -- confirm).
# The names are assigned through globals() rather than exec() on a string-built
# statement, so no source code is generated at runtime.
for i in range(2, 7):
    globals()['R{}E'.format(i)] = pd.read_csv(repo_url + 'serotonin%20raw%20data/{}RE.csv'.format(i))
    globals()['R{}C'.format(i)] = pd.read_csv(repo_url + 'serotonin%20raw%20data/{}RC.csv'.format(i))

In [11]:
# Preview the round-6 'E' serotonin table (bare last expression -> rich display).
R6E

Unnamed: 0,Sequence,Rank,Reads,RPM,Trimed
0,CCCCCCACACACACAACGACGCGGCCCCCC,39570,17,0.52,ACACACACAACGACGCGG
1,CCCCCCAGCACAACACGGCAACCTCCCCCC,39570,17,0.52,AGCACAACACGGCAACCT
2,CCCCCCAACACACCACAGACTCTGCCCCCC,42054,16,0.49,AACACACCACAGACTCTG
3,CCCCCCACACACCATCAGACGCCGCCCCCC,48466,14,0.43,ACACACCATCAGACGCCG
4,CCCCCCAGCAGCACACGACACACTCCCCCC,48466,14,0.43,AGCAGCACACGACACACT
...,...,...,...,...,...
995,CCCCCCACGCCACAAACACCGGTGCCCCCC,247029,4,0.12,ACGCCACAAACACCGGTG
996,CCCCCCACACACCTCAGCCGCCTGCCCCCC,247029,4,0.12,ACACACCTCAGCCGCCTG
997,CCCCCCACCGATCCAAACAGCACGCCCCCC,247029,4,0.12,ACCGATCCAAACAGCACG
998,CCCCCCACACAGCTCACTTCCGCTCCCCCC,247029,4,0.12,ACACAGCTCACTTCCGCT


Oxytocin

In [9]:
# Load the raw oxytocin tables for rounds 3-6 into the notebook variables
# R3E_O..R6E_O. The names are assigned through globals() rather than exec()
# on a string-built statement, so no source code is generated at runtime.
for i in range(3, 7):
    globals()['R{}E_O'.format(i)] = pd.read_csv(repo_url + 'Oxytocin%20raw%20data/R{}E_O.csv'.format(i))

In [10]:
# Preview the round-3 oxytocin table (bare last expression -> rich display).
R3E_O

Unnamed: 0,Sequence,Reads,RPM
0,GGGGTTACTATATGACAT,20,0.39
1,CCCCCCCCCCCCCCCCCC,12,0.23
2,AGGGGAGCGTGCGGAGGC,12,0.23
3,GACTTGGGCTCATGCTGT,10,0.19
4,TAGACAGGGCTGACTGTG,10,0.19
...,...,...,...
995,ACATCAAAGGGTGTGGAG,5,0.10
996,TAGCGGTCGTCTTCTCAG,5,0.10
997,ATACCAGCACGGTCAGGT,5,0.10
998,GGTGTACAACGAGCACGG,5,0.10


**Data processing:** compute the weighted quadrumer tables for every round; the processed data are saved to CSV files.

In [6]:
# Disabled cell: computes the weighted tables and quadrumer frequencies for
# rounds 2-6 from the raw RnE / RnC tables. The notebook instead re-imports
# these results from CSV files in a later cell; uncomment to recompute them.
# for i in np.arange(2,7):
#    exec('R{}E_full_table_weighted = full_table_weighted(R{}E)'.format(i, i))
#    exec('R{}E_frequency = weighted_freq_only(R{}E_full_table_weighted)'.format(i, i))
#    exec('R{}C_full_table_weighted = full_table_weighted(R{}C)'.format(i, i))
#    exec('R{}C_frequency = weighted_freq_only(R{}C_full_table_weighted)'.format(i, i))

**Export processed data:** all RnE_frequency and full_table_weighted

In [7]:
# Disabled cell: writes every RnE/RnC frequency table and full weighted table
# for rounds 2-6 to a CSV file (relative paths, i.e. the current working
# directory). It needs the tables from the processing cell to exist first.
#for i in np.arange(2,7):
#    exec("R{}E_frequency.to_csv('R{}E_frequency.csv', index=True)".format(i, i))
#    exec("R{}E_full_table_weighted.to_csv('R{}E_full_table_weighted.csv', index=True)".format(i, i))
#    exec("R{}C_frequency.to_csv('R{}C_frequency.csv', index=True)".format(i, i))
#    exec("R{}C_full_table_weighted.to_csv('R{}C_full_table_weighted.csv', index=True)".format(i, i))

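**Re-import processed data:** the processed CSV files have been moved into the repository's `serotonin data processed/` folder and are loaded from there.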

In [30]:
# Reload the processed serotonin tables for rounds 2-6 from the repository into
# RnE_frequency / RnC_frequency (indexed by 'Quadrumer') and
# RnE_full_table_weighted / RnC_full_table_weighted (first CSV column as index).
# The names are assigned through globals() rather than exec() on a string-built
# statement, so no source code is generated at runtime.
processed_url = repo_url + 'serotonin%20data%20processed/'
for i in range(2, 7):
    for arm in ('E', 'C'):
        frequency_name = 'R{}{}_frequency'.format(i, arm)
        weighted_name = 'R{}{}_full_table_weighted'.format(i, arm)
        globals()[frequency_name] = pd.read_csv(processed_url + frequency_name + '.csv', index_col='Quadrumer')
        globals()[weighted_name] = pd.read_csv(processed_url + weighted_name + '.csv', index_col=0)

In [31]:
# Report how many distinct quadrumers appear in the index of each round-6
# frequency table.
print('There are', len(R6E_frequency.index.unique()),'unique quadrumers in R6E')
print('There are', len(R6C_frequency.index.unique()),'unique quadrumers in R6C')

There are 228 unique quadrumers in R6E
There are 253 unique quadrumers in R6C


In [9]:
# Ten quadrumers with the highest weighted frequency in R6E, highest first.
R6E_top10 = (
    R6E_frequency
    .sort_values(by='Weighted frequency', ascending=False)
    .head(10)
)
R6E_top10

Unnamed: 0,Quadrumer,Weighted frequency
17,ACAC,955.754268
66,CACA,675.415167
67,CACC,518.755081
20,ACCA,411.682731
79,CCAC,359.133666
197,TCCG,319.881547
63,CAAC,276.390663
69,CACT,271.687377
86,CCGA,260.36465
141,GCAC,258.738822


In [30]:
# Ten quadrumers with the highest weighted frequency in R6C, highest first.
R6C_top10 = (
    R6C_frequency
    .sort_values(by='Weighted frequency', ascending=False)
    .head(10)
)
R6C_top10

Unnamed: 0,Quadrumer,Weighted frequency
109,CGTG,276.067596
17,ACAC,269.934825
133,GACG,240.693876
237,TGTG,225.031924
27,ACGT,213.980331
96,CGAC,205.721488
184,GTGC,203.953631
69,CACG,193.253619
67,CACA,190.54379
22,ACCG,190.076121
