In [1]:
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.models.ocsvm import OCSVM


In [2]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import ipywidgets as widgets
%matplotlib inline


# Default size for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (14,8)
#plt.rcParams['figure.dpi'] = 150
# Apply seaborn's default theme first, then switch to the "talk" plotting context.
sns.set()
sns.set_context("talk")

In [3]:
repo_url = 'https://raw.githubusercontent.com/Xiaoqi-Sun/aptamer_scoring/main/'

# Load the raw and processed tables for rounds 2-6 of both arms
# ('E' and 'C' suffixes, e.g. R2E / R2C).
#
# The tables are bound to the same module-level names as before
# (R2E, R2C, R2E_frequency, R2E_full_table_weighted, ...), but through an
# explicit globals() assignment instead of exec() on formatted source strings,
# so no code is built from strings.
raw_data_url = repo_url + 'serotonin%20raw%20data/'
processed_data_url = repo_url + 'serotonin%20processed%20data/'

for round_no in range(2, 7):
    for arm in ('E', 'C'):
        table_name = 'R{}{}'.format(round_no, arm)

        # raw data: files are named "<round>R<arm>.csv", e.g. "2RE.csv"
        globals()[table_name] = pd.read_csv(
            raw_data_url + '{}R{}.csv'.format(round_no, arm))

        # processed data: quadrumer frequencies, indexed by the 'Quadrumer' column
        globals()[table_name + '_frequency'] = pd.read_csv(
            processed_data_url + table_name + '_frequency.csv',
            index_col='Quadrumer')

        # processed data: full weighted table, first column used as the index
        globals()[table_name + '_full_table_weighted'] = pd.read_csv(
            processed_data_url + table_name + '_full_table_weighted.csv',
            index_col=0)

In [4]:
# Experimental dF/F tables, one per validation round.
dff_data_url = repo_url + 'dFF%20data/'

# round 1: Sanghwa's 80 sequences (only the first three columns are read)
dFF = pd.read_csv(dff_data_url + 'dFF_r1.csv', usecols=[0, 1, 2])

# round 2: Xiaoqi's prediction
dFF2_new = pd.read_csv(dff_data_url + 'dFF_r2_new.csv')
#dFF2 = pd.read_excel('dFF2.xlsx').loc[:,['df/f','Trimed']].rename(columns={'Trimed':'Sequence'})
#dFF2_old = pd.DataFrame({'Name':["N/A" for x in range(10)], 'Sequence':dFF2['Sequence'], 'dFF':dFF2['df/f']})

# round 3: Payam's prediction; the dFF value at 1195nm (column 'dFF_1195') is exposed as 'dFF'
#dFF3_old = pd.read_csv(repo_url+'dFF%20data/dFF_r3_old.csv',usecols=[0,1,3]).rename(columns={'dFF_1195':'dFF'}) #use dFF value at 1195nm
dFF3_new = (
    pd.read_csv(dff_data_url + 'dFF_r3_new.csv', usecols=[0, 1, 3])
    .rename(columns={'dFF_1195': 'dFF'})
)



In [5]:
# general function for calculating quad score

def _weighted_frequency(frequency_table, quad_seq):
    """Return the 'Weighted frequency' of ``quad_seq`` in ``frequency_table``, or 0 if it is absent."""
    matches = frequency_table.loc[frequency_table.index == quad_seq, 'Weighted frequency']
    # .iloc[0] is explicitly positional; plain [0] on a string-indexed Series
    # relies on a deprecated positional fallback.
    return matches.iloc[0] if len(matches) else 0


def _max_consecutive_round_ratio(quad_seq, frequency_tables):
    """Return the largest round-over-previous-round weighted-frequency ratio of ``quad_seq``.

    ``frequency_tables`` is an ordered sequence of per-round frequency tables.
    A ratio whose denominator (the earlier round) is 0 is counted as 0 rather
    than infinity.
    """
    freqs = [_weighted_frequency(table, quad_seq) for table in frequency_tables]
    ratios = [0 if earlier == 0 else later / earlier
              for earlier, later in zip(freqs, freqs[1:])]
    return max(ratios, default=0)


def max_freq_ratio(quad_seq, frequency_tables=None):
    """Last term of the quadrumer score: max of R3/R2, R4/R3, R5/R4, R6/R5.

    Parameters
    ----------
    quad_seq : str
        Quadrumer to look up in the index of each frequency table.
    frequency_tables : sequence of DataFrame, optional
        Ordered per-round tables with a 'Weighted frequency' column. Defaults
        to the module-level R2E_frequency ... R6E_frequency tables.

    Returns
    -------
    The largest ratio between consecutive rounds. A quadrumer missing from a
    round has frequency 0 there, and a ratio with a zero denominator is 0
    (previously the R3/R2 ratio checked R3 instead of R2 and could divide by zero).
    """
    if frequency_tables is None:
        frequency_tables = (R2E_frequency, R3E_frequency, R4E_frequency,
                            R5E_frequency, R6E_frequency)
    return _max_consecutive_round_ratio(quad_seq, frequency_tables)


def max_freq_ratio_ctrl(quad_seq, frequency_tables=None):
    """Same as ``max_freq_ratio`` but defaulting to the R2C_frequency ... R6C_frequency tables.

    Parameters
    ----------
    quad_seq : str
        Quadrumer to look up in the index of each frequency table.
    frequency_tables : sequence of DataFrame, optional
        Ordered per-round tables with a 'Weighted frequency' column.

    Returns
    -------
    The largest ratio between consecutive rounds; zero-denominator ratios count as 0.
    """
    if frequency_tables is None:
        frequency_tables = (R2C_frequency, R3C_frequency, R4C_frequency,
                            R5C_frequency, R6C_frequency)
    return _max_consecutive_round_ratio(quad_seq, frequency_tables)

def extract_quadrumers(aptamer_sequence):
    """Return the 15 length-4 windows of ``aptamer_sequence`` as a 1-D numpy array.

    Window ``i`` is ``aptamer_sequence[i:i+4]`` for i = 0..14, which covers every
    quadrumer of an 18-mer. For shorter inputs the trailing windows are
    truncated or empty strings.
    """
    windows = [aptamer_sequence[start:start + 4] for start in range(15)]
    return np.append([], windows)


def _indicator_quad_score(first_round, last_round, ratio_fn, set_1_percentile, ind_weight, column):
    """Score every quadrumer of ``last_round`` and return a copy with the score in ``column``.

    score = ind_weight * [quad in set 1 or set 2]
          + ind_weight * [quad in set 1]
          + ratio_fn(quad)

    set 1: quadrumers whose ``last_round`` weighted frequency exceeds the
           ``set_1_percentile``-th percentile of the ``first_round`` weighted frequencies.
    set 2: same size as set 1; the quadrumers with the largest amplification fold
           (``last_round`` frequency / ``first_round`` frequency) among those present in both.
    """
    threshold = np.percentile(first_round['Weighted frequency'], set_1_percentile)
    set1 = last_round[last_round['Weighted frequency'] > threshold]
    set1_quads = set1.index

    amplification = first_round.merge(last_round, on='Quadrumer').rename(
        columns={'Weighted frequency_x': 'first freq', 'Weighted frequency_y': 'last freq'})
    amplification['amp-fold value'] = amplification['last freq'] / amplification['first freq']
    set2_quads = (amplification
                  .sort_values('amp-fold value', ascending=False)
                  .head(len(set1))
                  .index)

    # Collect in a plain list; np.append inside the loop re-copies the whole array each time.
    scores = []
    for quad in last_round.index:
        in_set1 = quad in set1_quads
        in_set1_or_set2 = in_set1 or (quad in set2_quads)
        scores.append(in_set1_or_set2 * ind_weight + in_set1 * ind_weight + ratio_fn(quad))

    scored = last_round.copy()
    scored[column] = np.asarray(scores, dtype=float)
    return scored


def quad_score_exp(set_1_percentile, exp_ind_weight, name):
    """Quadrumer scores for the 'E' rounds.

    Parameters
    ----------
    set_1_percentile : float
        Percentile of the R2E weighted frequencies used as the set-1 threshold.
    exp_ind_weight : number
        Weight applied to each of the two set-membership indicator terms.
    name : str
        Score name; the score column is called ``name + '_exp'``.

    Returns
    -------
    DataFrame
        Copy of R6E_frequency (quadrumer index) with the extra score column.
    """
    return _indicator_quad_score(R2E_frequency, R6E_frequency, max_freq_ratio,
                                 set_1_percentile, exp_ind_weight, name + '_exp')


def quad_score_ctrl(set_1_percentile, ctrl_ind_weight, name):
    """Quadrumer scores for the 'C' rounds.

    Parameters
    ----------
    set_1_percentile : float
        Percentile of the R2C weighted frequencies used as the set-1 threshold.
        NOTE (from the original author): the 99.5th percentile only gives 3 quadrumers.
    ctrl_ind_weight : number
        Weight applied to each of the two set-membership indicator terms.
    name : str
        Score name; the score column is called ``name + '_ctrl'``.

    Returns
    -------
    DataFrame
        Copy of R6C_frequency (quadrumer index) with the extra score column.
    """
    return _indicator_quad_score(R2C_frequency, R6C_frequency, max_freq_ratio_ctrl,
                                 set_1_percentile, ctrl_ind_weight, name + '_ctrl')

def quad_score_full(set_1_percentile_exp, set_1_percentile_ctrl, exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight, name):
    """Combine the 'E' and 'C' quadrumer scores into one net score per quadrumer.

    net score = exp_weight * <name>_exp - ctrl_weight * <name>_ctrl

    Returns a two-column DataFrame indexed like the ``quad_score_exp`` result:
    'Weighted frequency' (taken from the ``quad_score_exp`` side of the merge)
    and ``name`` (the net score).
    """
    exp_column = name + '_exp'
    ctrl_column = name + '_ctrl'

    exp_scores = quad_score_exp(set_1_percentile_exp, exp_ind_weight, name)
    ctrl_scores = quad_score_ctrl(set_1_percentile_ctrl, ctrl_ind_weight, name)

    # Left join on the index keeps every row of exp_scores.
    # NOTE(review): a quadrumer missing from ctrl_scores gets a NaN net score — confirm this is intended.
    combined = exp_scores.merge(ctrl_scores, how='left', left_index=True, right_index=True)
    combined[name] = exp_weight * combined[exp_column] - ctrl_weight * combined[ctrl_column]

    result_columns = {
        'Weighted frequency': combined['Weighted frequency_x'],  # the exp_scores side
        name: combined[name],
    }
    return pd.DataFrame(result_columns)

In [6]:
# general functions for calculating aptamer score 

def _sum_quadrumer_values(sequences, value_by_quad):
    """For each sequence, sum ``value_by_quad`` over its quadrumers (unknown quadrumers add nothing)."""
    totals = [
        sum(value_by_quad[quad] for quad in extract_quadrumers(seq) if quad in value_by_quad)
        for seq in sequences
    ]
    return np.asarray(totals, dtype=float)


def _add_normalised_columns(tbl_with_score, name):
    """Add ``name + ' percent'`` (percent of the maximum) and ``name + ' su'`` (z-score) in place."""
    raw = tbl_with_score[name]
    # Series.max() skips NaN; the builtin max() gives order-dependent results when NaN is present.
    tbl_with_score[name + ' percent'] = 100 * raw / raw.max()
    tbl_with_score[name + ' su'] = (raw - np.mean(raw)) / np.std(raw)
    return tbl_with_score


def aptamer_score(RnE, quad_score, name):
    """Score every sequence in ``RnE['Trimed']`` by summing its quadrumer scores.

    Parameters
    ----------
    RnE : DataFrame
        Table with a 'Trimed' column of aptamer sequences.
    quad_score : DataFrame
        Output of ``quad_score_full``: quadrumer index, column 0 = weighted
        frequency, column 1 = quadrumer score.
    name : str
        Name of the score column to create.

    Returns
    -------
    DataFrame
        Columns 'Trimed', ``name``, 'Weighted frequency', ``name + ' percent'``
        and ``name + ' su'``; the index is the index of ``RnE`` shifted by +1.
    """
    # Explicit positional column access (.iloc) instead of row[0] / row[1] on a labelled Series.
    freq_by_quad = dict(zip(quad_score.index, quad_score.iloc[:, 0]))
    score_by_quad = dict(zip(quad_score.index, quad_score.iloc[:, 1]))

    tbl_with_score = RnE.loc[:, ['Trimed']].copy()
    tbl_with_score[name] = _sum_quadrumer_values(RnE['Trimed'], score_by_quad)
    tbl_with_score['Weighted frequency'] = _sum_quadrumer_values(RnE['Trimed'], freq_by_quad)
    tbl_with_score.index = tbl_with_score.index + 1  # shift the index by one

    return _add_normalised_columns(tbl_with_score, name)


def aptamer_score_dFF(dFF_tbl, quad_score, name):
    """Score every sequence in ``dFF_tbl['Sequence']`` by summing its quadrumer scores.

    Parameters
    ----------
    dFF_tbl : DataFrame
        Table with 'Name', 'Sequence' and 'dFF' columns.
    quad_score : DataFrame
        Output of ``quad_score_full``: quadrumer index, column 1 = quadrumer score.
    name : str
        Name of the score column to create.

    Returns
    -------
    DataFrame
        Columns 'Name', 'Sequence', 'dFF', ``name``, ``name + ' percent'`` and
        ``name + ' su'``; the index is the index of ``dFF_tbl`` shifted by +1.
    """
    score_by_quad = dict(zip(quad_score.index, quad_score.iloc[:, 1]))

    tbl_with_score = dFF_tbl.loc[:, ['Name', 'Sequence', 'dFF']].copy()
    tbl_with_score[name] = _sum_quadrumer_values(dFF_tbl['Sequence'], score_by_quad)
    tbl_with_score.index = tbl_with_score.index + 1  # shift the index by one

    return _add_normalised_columns(tbl_with_score, name)

def dFF_with_score(threshold, dFF_tbl, quad_score, score_name):
    """Score ``dFF_tbl`` with ``aptamer_score_dFF`` and flag rows whose dFF exceeds ``threshold``.

    The returned table has an extra 'Y/N' column: 'Y' where dFF > threshold, otherwise 'N'.
    """
    scored = aptamer_score_dFF(dFF_tbl, quad_score, name=score_name)
    scored['Y/N'] = ['Y' if value > threshold else 'N' for value in scored['dFF']]
    return scored

## Iterations
### Parameters (integers, each swept from 1 to 10)
- exp_ind_weight: 1 - 10
- ctrl_ind_weight: 1 - 10
- exp_weight: 1 - 10
- ctrl_weight: 1 - 10

In [7]:
def r_OD(dFF_with_score_tbl, X_train, clf):
    """Correlation between the first two numeric columns after dropping detected outliers.

    Parameters
    ----------
    dFF_with_score_tbl : DataFrame
        Scored dFF table; for the tables built in this notebook the first two
        numeric columns are 'dFF' and the raw score.
    X_train : array-like
        Feature matrix, one row per row of ``dFF_with_score_tbl``.
    clf : detector
        Object exposing ``fit(X)`` (returning the fitted detector) and
        ``predict(X)`` (0 = inlier, anything else = outlier).

    Returns
    -------
    float
        Pearson correlation of the first two numeric columns over the inlier rows.
    """
    outlier_label = clf.fit(X_train).predict(X_train)
    labelled = dFF_with_score_tbl.assign(outlier=outlier_label)
    inliers = labelled[labelled['outlier'] == 0]
    # Correlate numeric columns only: DataFrame.corr() raises on string columns
    # (e.g. 'Name', 'Sequence', 'Y/N') in recent pandas versions.
    numeric = inliers.select_dtypes(include=['number', 'bool'])
    return numeric.corr().iloc[0, 1]  # dFF vs. raw score

# Version 1 
    all_exp_ind_weights.append(exp_ind_weight)
                all_ctrl_ind_weights.append(ctrl_ind_weight)
                all_exp_weights.append(exp_weight)
                all_ctrl_weights.append(ctrl_weight)
                all_overall_r.append(overall_r)
                all_ABOD_r.append(ABOD_r)
                all_KNN_r.append(KNN_r)
                all_OCSVM_r.append(OCSVM_r)

# Version 2

%%time
all_exp_ind_weights = []
all_ctrl_ind_weights = []
all_exp_weights = []
all_ctrl_weights = []
all_overall_r = []
all_ABOD_r = []
all_KNN_r = []
all_OCSVM_r = []
n = 3 # range 1-10

for exp_ind_weight in np.arange(1,n):
    for ctrl_ind_weight in np.arange(1,n):
        for exp_weight in np.arange(1,n):
            for ctrl_weight in np.arange(1,n):
                
                # Set up DataFrames
                temp_quads_score = quad_score_full(99.5, 99.5, exp_ind_weight, ctrl_ind_weight,exp_weight, ctrl_weight, 'temp')
                dFF_with_score_tbl = dFF_with_score(threshold=1.5, quad_score=temp_quads_score, score_name='temp')
                X_train = dFF_with_score_tbl.loc[:,['dFF','temp su']].values

                # overall_r
                overall_r = dFF_with_score_tbl.corr().iloc[0,3]
                
                # Outlier detecting r
                ABOD_r = r_OD(dFF_with_score_tbl, X_train, ABOD())
                KNN_r = r_OD(dFF_with_score_tbl, X_train, KNN())
                OCSVM_r = r_OD(dFF_with_score_tbl, X_train, OCSVM())
                
                # append to list
                all_exp_ind_weights = np.append(all_exp_ind_weights, exp_ind_weight)
                all_ctrl_ind_weights = np.append(all_ctrl_ind_weights, ctrl_ind_weight)
                all_exp_weights = np.append(all_exp_weights, exp_weight)
                all_ctrl_weights = np.append(all_ctrl_weights,ctrl_weight)
                all_overall_r = np.append(all_overall_r, overall_r)
                all_ABOD_r = np.append(all_ABOD_r, ABOD_r)
                all_KNN_r = np.append(all_KNN_r, KNN_r)
                all_OCSVM_r = np.append(all_OCSVM_r,OCSVM_r)
                
                if len(all_exp_ind_weights) == 1000:
                    print(1000)
                if len(all_exp_ind_weights) == 5000:
                    print(5000)
                    
                    
full_table = pd.DataFrame({'exp_ind_weight': all_exp_ind_weights,
                             'ctrl_ind_weight': all_ctrl_ind_weights,
                             'exp_weight':all_exp_weights,
                             'ctrl_weight':all_ctrl_weights,
                             'overall_r':all_overall_r,
                             'ABOD_r':all_ABOD_r,
                             'KNN_r': all_KNN_r,
                             'OCSVM_r':all_OCSVM_r})

full_table

# Version 3

# Grid sweep over the four integer weights, scored against all three dFF rounds.
# Each parameter set is written to the CSV as soon as it is computed.
filename = "iteration_round_2(3).csv"
# newline='' is what the csv module expects for files handed to csv.writer.
with open(filename, 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["exp_ind_weight", "ctrl_ind_weight", "exp_weight", "ctrl_weight", "overall_r", "ABOD_all_r", "KNN_all_r", "overall_r_r1_only"])

    n = 11

    # NOTE(review): exp_ind_weight starts at 7 while the other weights start at 1 —
    # presumably this run resumes an earlier partial sweep; confirm before reusing.
    for exp_ind_weight in np.arange(7,n):
        for ctrl_ind_weight in np.arange(1,n):
            for exp_weight in np.arange(1,n):
                for ctrl_weight in np.arange(1,n):

                    # set up quadrumer score
                    temp_quads_score = quad_score_full(99.5, 99.5, exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight, 'temp')

                    # compute dFF score for each round
                    dFF_1_with_score_tbl = dFF_with_score(threshold=1.5, dFF_tbl=dFF, quad_score=temp_quads_score, score_name='temp')
                    dFF_2_with_score_tbl = dFF_with_score(threshold=1.5, dFF_tbl=dFF2_new, quad_score=temp_quads_score, score_name='temp')
                    dFF_3_with_score_tbl = dFF_with_score(threshold=1.5, dFF_tbl=dFF3_new, quad_score=temp_quads_score, score_name='temp')
                    # pd.concat stacks the rounds in the same order as the chained
                    # DataFrame.append calls did (DataFrame.append no longer exists in pandas 2.x).
                    dFF_combined = pd.concat([dFF_1_with_score_tbl, dFF_2_with_score_tbl, dFF_3_with_score_tbl])

                    X_train = dFF_combined.loc[:, ['dFF', 'temp']].values

                    # overall_r: dFF vs. raw score, selected by name so the string
                    # columns never reach corr()
                    overall_r = dFF_combined[['dFF', 'temp']].corr().iloc[0, 1]

                    # Outlier detecting r for all sequences
                    ABOD_all_r = r_OD(dFF_combined, X_train, ABOD())
                    KNN_all_r = r_OD(dFF_combined, X_train, KNN())

                    # Original round 1 r
                    overall_r_r1_only = dFF_1_with_score_tbl[['dFF', 'temp']].corr().iloc[0, 1]

                    # write to file
                    csv_writer.writerow([exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight, overall_r, ABOD_all_r, KNN_all_r, overall_r_r1_only])


In [9]:
# Load the saved parameter-sweep results; usecols=range(1,9) keeps columns 1-8 and skips column 0.
# NOTE(review): this file name differs from the "iteration_round_2(3).csv" written by the
# Version 3 sweep — presumably it was assembled from several partial runs; confirm its provenance.
full = pd.read_csv('parameter_iteration_r2.csv',usecols=range(1,9))

In [10]:
full

Unnamed: 0,exp_ind_weight,ctrl_ind_weight,exp_weight,ctrl_weight,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
0,1,1,1,1,0.304091,0.254804,0.245538,0.280207
1,1,1,1,2,0.264548,0.317705,0.261740,0.253958
2,1,1,1,3,0.219242,0.210967,0.185119,0.210071
3,1,1,1,4,0.190164,0.158849,0.092924,0.181792
4,1,1,1,5,0.171383,0.224947,0.201532,0.163789
...,...,...,...,...,...,...,...,...
9995,10,10,10,6,0.357141,0.336895,0.384163,0.263082
9996,10,10,10,7,0.366562,0.338025,0.358248,0.272101
9997,10,10,10,8,0.375308,0.336893,0.358276,0.280783
9998,10,10,10,9,0.383301,0.380356,0.382147,0.289000


# Inspection

In [22]:
# Summary statistics restricted to the correlation columns (overall_r through overall_r_r1_only).
full.describe().loc[:,"overall_r":"overall_r_r1_only"]

Unnamed: 0,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
count,10000.0,10000.0,10000.0,10000.0
mean,0.340003,0.339126,0.343897,0.259414
std,0.050533,0.065246,0.061458,0.051428
min,0.132555,0.075273,0.092924,0.076124
25%,0.310098,0.307615,0.324368,0.229676
50%,0.34279,0.346305,0.355089,0.26364
75%,0.380317,0.381867,0.381792,0.300924
max,0.416727,0.5205,0.470309,0.334257


In [23]:
# top 15 parameter sets ranked by ABOD_all_r, highest first
full.sort_values('ABOD_all_r',ascending=False).head(15)

Unnamed: 0,exp_ind_weight,ctrl_ind_weight,exp_weight,ctrl_weight,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
5946,6,10,5,7,0.413116,0.5205,0.43607,0.319109
2400,3,5,1,1,0.396621,0.511922,0.373239,0.325829
910,1,10,2,1,0.403112,0.501168,0.435477,0.320134
6947,7,10,5,8,0.412704,0.498537,0.443828,0.317777
6647,7,7,5,8,0.409501,0.497307,0.411381,0.326547
4945,5,10,5,6,0.413601,0.494132,0.438645,0.320811
7502,8,6,1,3,0.393713,0.493484,0.416108,0.308348
5969,6,10,7,10,0.412187,0.49285,0.462283,0.317531
7636,8,7,4,7,0.409147,0.48885,0.409606,0.325542
3900,4,10,1,1,0.41415,0.4888,0.440614,0.323042


In [25]:
# top 15 parameter sets ranked by KNN_all_r, highest first
full.sort_values('KNN_all_r',ascending=False).head(15)

Unnamed: 0,exp_ind_weight,ctrl_ind_weight,exp_weight,ctrl_weight,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
9938,10,10,4,9,0.410678,0.449454,0.470309,0.313212
8838,9,9,4,9,0.408696,0.444581,0.464035,0.314024
6725,7,8,3,6,0.406397,0.423202,0.462408,0.315796
6749,7,8,5,10,0.406397,0.423202,0.462408,0.315796
6737,7,8,4,8,0.406397,0.423202,0.462408,0.315796
6701,7,8,1,2,0.406397,0.423202,0.462408,0.315796
6713,7,8,2,4,0.406397,0.423202,0.462408,0.315796
5969,6,10,7,10,0.412187,0.49285,0.462283,0.317531
7978,8,10,8,9,0.408896,0.38159,0.460113,0.317799
5748,6,8,5,9,0.40515,0.425984,0.459922,0.314612


In [24]:
# top 15 parameter sets ranked by overall_r, highest first
full.sort_values('overall_r',ascending=False).head(15)

Unnamed: 0,exp_ind_weight,ctrl_ind_weight,exp_weight,ctrl_weight,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
5956,6,10,6,7,0.416727,0.460111,0.439653,0.326567
3976,4,10,8,7,0.416714,0.447952,0.430703,0.328397
5945,6,10,5,6,0.416713,0.450652,0.446033,0.326198
4988,5,10,9,9,0.416697,0.435461,0.45486,0.327534
4977,5,10,8,8,0.416697,0.435461,0.45486,0.327534
4933,5,10,4,4,0.416697,0.435461,0.45486,0.327534
4911,5,10,2,2,0.416697,0.445178,0.45486,0.327534
4900,5,10,1,1,0.416697,0.445178,0.45486,0.327534
4955,5,10,6,6,0.416697,0.435461,0.45486,0.327534
4922,5,10,3,3,0.416697,0.435461,0.45486,0.327534


In [19]:
# top 10 parameter sets ranked by overall_r_r1_only, highest first
full.sort_values('overall_r_r1_only',ascending=False).head(10)

Unnamed: 0,exp_ind_weight,ctrl_ind_weight,exp_weight,ctrl_weight,overall_r,ABOD_all_r,KNN_all_r,overall_r_r1_only
684,1,7,9,5,0.401593,0.399043,0.382117,0.334257
595,1,6,10,6,0.396566,0.404292,0.34168,0.33416
542,1,6,5,3,0.396566,0.404292,0.34168,0.33416
794,1,8,10,5,0.405422,0.42954,0.420615,0.334149
752,1,8,6,3,0.405422,0.426326,0.420615,0.334149
710,1,8,2,1,0.405422,0.456821,0.420615,0.334149
773,1,8,8,4,0.405422,0.42954,0.420615,0.334149
731,1,8,4,2,0.405422,0.426326,0.420615,0.334149
862,1,9,7,3,0.407964,0.417089,0.42958,0.334105
652,1,7,6,3,0.400819,0.407702,0.36626,0.334067


## TOP 10 Predictions (iterated version)


In [13]:
# The four weight columns (exp_ind_weight through ctrl_weight) of the ten best
# rows under each outlier-filtered correlation.
weight_columns = slice('exp_ind_weight', 'ctrl_weight')

ABOD_ranked = full.sort_values('ABOD_all_r', ascending=False)
ABOD_top10_para = ABOD_ranked.head(10).loc[:, weight_columns]

KNN_ranked = full.sort_values('KNN_all_r', ascending=False)
KNN_top10_para = KNN_ranked.head(10).loc[:, weight_columns]

In [52]:
# For each of the best ABOD parameter sets, score every R6E sequence and keep
# its ten highest-scoring sequences.
all_top_10_ABOD = []

for i in range(len(ABOD_top10_para)):
    # parameters
    exp_percentile, ctrl_percentile = [99.5, 99.5]
    exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight = ABOD_top10_para.iloc[i]
    name = 'score name'

    # temp tables
    temp_quads_score = quad_score_full(exp_percentile, ctrl_percentile, exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight, name)
    temp_aptamer_score = aptamer_score(R6E, temp_quads_score, name)
    # dFF_tbl is a required argument of dFF_with_score; without it this call raises TypeError.
    # NOTE(review): the round-1 table is assumed here — confirm which round was intended.
    dFF_with_score_tbl = dFF_with_score(threshold=1.5, dFF_tbl=dFF, quad_score=temp_quads_score, score_name=name)

    # get top 10 by standardised score (column name derived from `name`, not hard-coded)
    temp_top_10 = temp_aptamer_score.sort_values(name + ' su', ascending=False).head(10)['Trimed']
    all_top_10_ABOD.extend(temp_top_10)

# Same container type as before: a flat numpy array of sequences.
all_top_10_ABOD = np.array(all_top_10_ABOD)
    

In [61]:
# For each of the best KNN parameter sets, score every R6E sequence and keep
# its ten highest-scoring sequences.
all_top_10_KNN = []

for i in range(len(KNN_top10_para)):
    # parameters
    exp_percentile, ctrl_percentile = [99.5, 99.5]
    exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight = KNN_top10_para.iloc[i]
    name = 'score name'

    # temp tables
    temp_quads_score = quad_score_full(exp_percentile, ctrl_percentile, exp_ind_weight, ctrl_ind_weight, exp_weight, ctrl_weight, name)
    temp_aptamer_score = aptamer_score(R6E, temp_quads_score, name)
    # dFF_tbl is a required argument of dFF_with_score; without it this call raises TypeError.
    # NOTE(review): the round-1 table is assumed here — confirm which round was intended.
    dFF_with_score_tbl = dFF_with_score(threshold=1.5, dFF_tbl=dFF, quad_score=temp_quads_score, score_name=name)

    # get top 10 by standardised score (column name derived from `name`, not hard-coded)
    temp_top_10 = temp_aptamer_score.sort_values(name + ' su', ascending=False).head(10)['Trimed']
    all_top_10_KNN.extend(temp_top_10)

# Same container type as before: a flat numpy array of sequences.
all_top_10_KNN = np.array(all_top_10_KNN)
    

Look at the patterns of the 100 sequences from each method

In [9]:
# Count how often each sequence appears across the ABOD top-10 lists.
# NOTE: the cell that builds all_top_10_ABOD must be run first, otherwise this raises NameError.
pd.Series(all_top_10_ABOD).value_counts()

NameError: name 'all_top_10_ABOD' is not defined

In [62]:
# Count how often each sequence appears across the KNN top-10 lists.
pd.Series(all_top_10_KNN).value_counts()

ACAACCGCTCACTCCGAT    10
ACCGACCACAACTCCGCT    10
ACCAAGCACTCCGATCCT    10
ACAACCCAACTCCGCTCG    10
AACCGATCCAACCACTCG    10
ACCAACACTCCGCTCGAT    10
TACCACTCCAACTCCGCT    10
ACTCCGAACCACTCCGCT    10
ACCGCACAATCCTCCGAT     8
CAACCAGAGCACTCCGAT     6
AGCACTCCGATCCTCACA     4
GGCACGCACCGATCCGAT     1
ATCCGCAACTCATCCGCT     1
dtype: int64

In [None]:
all_top_10_KNN = 