# <span style="color:green"> Sequence match </span>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import statistics as stat
import time
from Bio.Seq import Seq
from termcolor import colored

In [2]:
import gpcr_package as gpcr


>>>>>>>>>>>>>>>>> Testing dependencies...
`gpcr_package` depends on the following packages:
['ProgressBar', 'Seq', 'SeqIO', 'ceil', 'clr', 'colored', 'datetime', 'display', 'floor', 'go', 'it', 'kurtosis', 'norm', 'np', 'os', 'pd', 'pickle', 'plt', 'skew', 'stat', 'style']
<<<<<<<<<<<<<<<<< Dependencies test successful!


>>>>>>>>>>>>>>>>> Testing constants...
`gpcr_package` uses the following constants:
['AA', 'AA_ABBREVATIONS', 'AA_ABBRE_LONG_NAMES', 'AA_ABBRE_SHORT_NAMES', 'AA_HYDRO_PHOBICITY_PH2', 'AA_HYDRO_PHOBICITY_PH7', 'AA_LONG_NAMES', 'AA_SHORT_NAMES', 'BINS_PLOT_HIST', 'COLOR_COLORBAR_CMAP', 'COLOR_COLORBAR_CMAP_RED_BLUE', 'COLOR_CONFI_LINE', 'COLOR_FIT_GAUSSIAN', 'COLOR_GPCR_CLASSES', 'COLOR_GPCR_CLASSES_ALL', 'COLOR_MEAN_LINE', 'COLOR_MEDIAN_LINE', 'COLOR_PLOT_HIST_DEFAULT', 'COLOR_PLOT_LINE', 'COLOR_PLOT_SCATTER', 'COLOR_QUANTILE_LINE', 'COLOR_SEQ_COMPARISION_LENGTH_COMPARISION', 'COLOR_SEQ_COMPARISION_LENGTH_NON_COMPARISION', 'COLOR_SEQ_COMPARISION_MATCH', 'COLOR_SEQ_COM

# <span style="color:orange"> Jaccard Similarity </span>

### <span style="color:skyblue"> Logic: Probability of intersection of two texts </span>
Jaccard similarity or intersection over union is defined as size of intersection divided by size of union of two sets.  
It doesn't care about repeatition of words

In [13]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    print(f"Text 1:       {a}")
    b = set(str2.split())
    print(f"Text 2:       {b}")
    c = a.intersection(b)
    print(f"Intersection: {c}")
    return float(len(c)) / (len(a) + len(b) - len(c))

In [15]:
get_jaccard_sim("ABC ijk abc mno ABC", "ABC xyz MNO ABC ijk")

Text 1:       {'mno', 'ABC', 'ijk', 'abc'}
Text 2:       {'MNO', 'ABC', 'ijk', 'xyz'}
Intersection: {'ABC', 'ijk'}


0.3333333333333333

In [16]:
get_jaccard_sim("ABC ijk abc mno ABC", "xyz ABC MNO ABC ijk")

Text 1:       {'mno', 'ABC', 'ijk', 'abc'}
Text 2:       {'MNO', 'ABC', 'ijk', 'xyz'}
Intersection: {'ABC', 'ijk'}


0.3333333333333333

# <span style="color:orange"> Cosine Similarity </span>

### <span style="color:skyblue"> Logic: Cosine similarity calculates similarity by measuring the cosine of angle between two vectors. </span>

We need to convert sentences into vectors.  
1. One way to do that is to use bag of words with either TF (term frequency) or TF-IDF (term frequency- inverse document frequency).  
The choice of TF or TF-IDF depends on application and is immaterial to how cosine similarity is actually performed — which just needs vectors.
TF is good for text similarity in general, but TF-IDF is good for search query relevance.  
2. Another way is to use Word2Vec or our own custom word embeddings to convert words into vectors.  

There are two main difference between tf/ tf-idf with bag of words and word embeddings:
1. tf / tf-idf creates one number per word, word embeddings typically creates one vector per word.
2. tf / tf-idf is good for classification documents as a whole, but word embeddings is good for identifying contextual content.

In [18]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    warnings.filterwarnings("ignore")
    text       = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

In [19]:
get_vectors("ABC ijk abd mno ABC abc") 

array([[3, 1, 1, 1]], dtype=int64)

In [20]:
get_vectors("ABC ijk abd mno ABC", "ABC xyz MNO ABC ijk") 

array([[2, 1, 1, 1, 0],
       [2, 0, 1, 1, 1]], dtype=int64)

In [21]:
get_cosine_sim("ABC ijk abc mno ABC", "ABC xyz MNO ABC ijk")

array([[1.        , 0.91168461],
       [0.91168461, 1.        ]])

In [22]:
get_cosine_sim("ABC ijk abc mno ABC", "ABC xyz MNO ijk ABC ABC")

array([[1.        , 0.91168461],
       [0.91168461, 1.        ]])

In [10]:
get_cosine_sim("ABC ijk abc mno ABC", "ABC xyz MNO ABC ijk", "ABC xyz MNO ABC ijk")

array([[1.        , 0.91168461, 0.91168461],
       [0.91168461, 1.        , 1.        ],
       [0.91168461, 1.        , 1.        ]])

# <span style="color:orange"> Cosine Similarity for Sequences </span>

In [15]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

def get_cosine_sim(*strs):
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
# OLD >>
# def get_vectors(*sequences):
#     warnings.filterwarnings("ignore")
#     text       = [t for t in sequences]
#     vectorizer = CountVectorizer(text)
#     vectorizer.fit(text)
#     return vectorizer.transform(text).toarray()

def get_vectors(sequences):
    warnings.filterwarnings("ignore")
    vectorizer = CountVectorizer(sequences)
    vectorizer.fit(sequences)
    return vectorizer.transform(sequences).toarray()

# OLD >>
# def rep_aa_in_seq(seq):
#     aa_list    = [char*2 + " " for char in seq]
#     rep_aa_seq = ''.join(aa_list)[:-1]
#     return rep_aa_seq

def rep_aa_in_seq(seqs):
    rep_aa_seqs = [ ''.join([char*2 + " " for char in seq])[:-1] for seq in seqs ]
    return rep_aa_seqs

In [16]:
get_vectors( rep_aa_in_seq(["ABCDE", "ABDE"]) )

array([[1, 1, 1, 1, 1],
       [1, 1, 0, 1, 1]], dtype=int64)

In [17]:
rep_aa_in_seq(["ABCDE", "ABDE"])

['AA BB CC DD EE', 'AA BB DD EE']

In [18]:
seq_1 = 'ABCDEA'
seq_2 = 'ABCXF'
get_cosine_sim( rep_aa_in_seq( ["ABCDE", "ABDE"] ) )

array([[1.        , 0.89442719],
       [0.89442719, 1.        ]])

In [19]:
seq_1 = gpcr.GPCR_DF.loc[0, "seq"]
seq_2 = gpcr.GPCR_DF.loc[2, "seq"]
print(seq_1)
print(seq_2)
get_cosine_sim( rep_aa_in_seq( [seq_1, seq_2] ) )

EFRKAFLKILHC
SKTRNHSTAYLTK


array([[1.        , 0.44232587],
       [0.44232587, 1.        ]])

In [20]:
cos_sim_array = get_cosine_sim( rep_aa_in_seq( gpcr.GPCR_DF["seq"] ) )

In [23]:
cos_sim_list      = [ list(array) for array in cos_sim ]
cos_sim_elements  = sum(cos_sim_list, [])
cos_sim_elements.sort( reverse = True )
len( [ element for element in cos_sim_elements if element > 0.999] )

NameError: name 'cos_sim' is not defined

In [23]:
def get_cosine_sim_in_sequences( df              = gpcr.GPCR_DF,
                                 lim_cos_sim_min = 0,
                                 lim_cos_sim_max = 1,
                                 show_details    = False ):
    sequences = gpcr.GPCR_DF["seq"]
    index     = gpcr.GPCR_DF.index
    cosine_sim_in_sequences = []
    for (i, seq_1), (j, seq_2) in zip( zip(index[:-1], sequences[:-1]), zip(index[1:], sequences[1:]) ):
        cos_sim_mat = get_cosine_sim( rep_aa_in_seq([seq_1, seq_2]) )
        cosine_sim_in_sequences.append( (i, j, cos_sim_mat[0,1]) )
    
    # Applying limits >>
    cosine_sim_in_sequences = [ cos_sim for cos_sim in cosine_sim_in_sequences if (lim_cos_sim_min <= cos_sim[2] and cos_sim[2] <= lim_cos_sim_max) ]
    if show_details:
        for cosine_sims in cosine_sim_in_sequences:
            print( f"Cosine similarity: {cosine_sims[2]}" )
            print_comparision_seqs(   seq_1       = sequences[cosine_sims[0]],
                                      seq_2       = sequences[cosine_sims[1]],
                                      seq_1_index = cosine_sims[0],
                                      seq_2_index = cosine_sims[1],
                                  )
            print()
    return cosine_sim_in_sequences

In [24]:
cosine_sim_in_sequences = get_cosine_sim_in_sequences( lim_cos_sim_min = 0.999,
                                                       # lim_cos_sim_max = 1, 
                                                       show_details    = True )
cosine_sim_in_sequences

Cosine similarity: 1.0
Seq 82           : [37m[0m[32mN[0m[32mR[0m[32mQ[0m[32mF[0m[32mR[0m[32mN[0m[32mC[0m[32mI[0m[32mL[0m[32mQ[0m[32mL[0m[32mF[0m[32mG[0m[32mK[0m[32mK[0m[32mV[0m[32mD[0m[32mD[0m[32mG[0m[32mS[0m[32mE[0m[32mL[0m[32mS[0m[32mS[0m[32mA[0m[32mS[0m[32mK[0m[32mT[0m[32mE[0m[32mV[0m[32mS[0m[32mS[0m[32mV[0m[32mS[0m[32mS[0m[32mV[0m[32mS[0m[32mP[0m[32mA[0m[37m[0m
Seq 83           : [37m[0m[32mN[0m[32mR[0m[32mQ[0m[32mF[0m[32mR[0m[32mN[0m[32mC[0m[32mI[0m[32mL[0m[32mQ[0m[32mL[0m[32mF[0m[32mG[0m[32mK[0m[32mK[0m[32mV[0m[32mD[0m[32mD[0m[32mG[0m[32mS[0m[32mE[0m[32mL[0m[32mS[0m[32mS[0m[32mA[0m[32mS[0m[32mK[0m[32mT[0m[32mE[0m[32mV[0m[32mS[0m[32mS[0m[32mV[0m[32mS[0m[32mS[0m[32mV[0m[32mS[0m[32mP[0m[32mA[0m[37m[0m

Cosine similarity: 0.9999999999999999
Seq 101           : [37m[0m[32mS[0m[32mK[0m[32mQ[0m[32mF[0m[32mQ[0m[3

[(82, 83, 1.0),
 (101, 102, 0.9999999999999999),
 (175, 176, 0.9999999999999999),
 (243, 244, 0.9999999999999998)]

In [25]:
def get_match_score_in_sequences( df              = gpcr.GPCR_DF,
                                  lim_cos_sim_min = 0,
                                  lim_cos_sim_max = 1,
                                  show_details    = False ):
    sequences = gpcr.GPCR_DF["seq"]
    index     = gpcr.GPCR_DF.index
    cosine_sim_in_sequences = []
    for (i, seq_1), (j, seq_2) in zip( zip(index[:-1], sequences[:-1]), zip(index[1:], sequences[1:]) ):
        cos_sim_mat = get_cosine_sim( rep_aa_in_seq([seq_1, seq_2]) )
        cosine_sim_in_sequences.append( (i, j, cos_sim_mat[0,1]) )
    
    # Applying limits >>
    cosine_sim_in_sequences = [ cos_sim for cos_sim in cosine_sim_in_sequences if (lim_cos_sim_min <= cos_sim[2] and cos_sim[2] <= lim_cos_sim_max) ]
    if show_details:
        for cosine_sims in cosine_sim_in_sequences:
            print( f"Cosine similarity: {cosine_sims[2]}" )
            print_comparision_seqs(   seq_1       = sequences[cosine_sims[0]],
                                      seq_2       = sequences[cosine_sims[1]],
                                      seq_1_start = 0,
                                      seq_2_start = 0,
                                      seq_1_index = 1,
                                      seq_2_index = 2,
                                      length      = None   )
            print()
    return cosine_sim_in_sequences

In [11]:
# --------------------------------------------------------------------------------------------------------

In [26]:
one_sequence     = 'actgatcgattgatcgatcgatcg'
another_sequence =    'tttagatcgatctttgatc'

In [27]:
# def score_match(subject, query, subject_start, query_start, length):
#     score = 0
#     for i in range(0,length):
#         subject_base = subject[subject_start + i]
#         query_base = query[query_start + i]
#         if subject_base == query_base:
#             score = score + 1
#         else:
#             score = score
#     return score

In [28]:
print(my_score_match(one_sequence, another_sequence, 7, 4, 8, True))
print(my_score_match(one_sequence, another_sequence, 7, 4, 4, True))
print(my_score_match(one_sequence, another_sequence, 7, 4, 12, True))
print(my_score_match(one_sequence, another_sequence, 10, 1, 5, True))
print(my_score_match(one_sequence, another_sequence, 0, 1, 0, True))

NameError: name 'my_score_match' is not defined

In [1]:
# new
def my_score_match_new( seq_1       = "1ST_TEST_SEQ",
                        seq_2       = "2ND_TEST_SEQUENCE",
                    seq_1_start  = 0,
                    seq_2_start  = 0,
                    length       = None,
                    show_details = False
                  ):
        
    # defult length is of smaller length sequence >>
    if length is None: length = min( [len(seq_1) - seq_1_start, len(seq_2) - seq_1_start] )
    
    score    = 0
    mismatch = 0
    if show_details:
        color_def = "white"
        out_seq_1 = colored( "".join( [" "]*( (seq_2_start - seq_1_start) if seq_2_start > seq_1_start else 0 ) ) + seq_1[0 : seq_1_start], color_def )
        out_seq_2 = colored( "".join( [" "]*( (seq_1_start - seq_2_start) if seq_1_start > seq_2_start else 0 ) ) + seq_2[0 : seq_2_start], color_def )
        for i in range(length):
            if seq_1[seq_1_start + i] == seq_2[seq_2_start + i]:
                score    = score + 1
                color    = "green" # >> green color for match
            else:
                mismatch = mismatch + 1
                color    = "red"   # >> red color for mis-match
            out_seq_1 = out_seq_1 + colored( seq_1[seq_1_start + i], color)
            out_seq_2 = out_seq_2 + colored( seq_2[seq_2_start + i], color)
        out_seq_1 = out_seq_1 + colored( seq_1[seq_1_start + length : ], color_def )
        out_seq_2 = out_seq_2 + colored( seq_2[seq_2_start + length : ], color_def )
        print( f"Seq 1      : {out_seq_1}")
        print( f"Seq 2      : {out_seq_2}")
        print( f"Match              : {colored(str(score), 'green')}" )
        print( f"Mis-match          : {colored(str(mismatch), 'red')}" )
        print( f"Comparision length : {length}" )
        print( f"Match probability  : {2*score / ( len(seq_1) + len(seq_2) ):0.10f}    [ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]" )
        non_compared_length = len(seq_1) + len(seq_2) - 2 * length
        print( "Non compared length: " + str(non_compared_length) )
        
    else:
        for i in range(length):
            if seq_1[seq_1_start + i] == seq_2[seq_2_start + i]: score = score + 1
        mismatch            = None
        non_compared_length = None
    
    return score, mismatch, non_compared_length

In [35]:
my_score_match_new(show_details=True)

Seq 1      : [37m[0m[31m1[0m[31mS[0m[31mT[0m[32m_[0m[32mT[0m[32mE[0m[32mS[0m[32mT[0m[32m_[0m[32mS[0m[32mE[0m[32mQ[0m[37m[0m
Seq 2      : [37m[0m[31m2[0m[31mN[0m[31mD[0m[32m_[0m[32mT[0m[32mE[0m[32mS[0m[32mT[0m[32m_[0m[32mS[0m[32mE[0m[32mQ[0m[37mUENCE[0m
Match              : [32m9[0m
Mis-match          : [31m3[0m
Comparision length : 12
Match probability  : 0.6206896552    [ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
Non compared length: 5


(9, 3, 5)

In [30]:
# new
def my_try_all_matches_new( seq_1,
                            seq_2,
                            score_limit    = 0,
                            max_score_only = False,
                            show_details   = False):
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    score_max        = 0
    length_max       = 1
    non_compared_min = 1000000000
    for seq_1_start in range(0, len(seq_1)):
        for seq_2_start in range(0, len(seq_2)):
            for length in range(1, len(seq_2)+1):
                if (seq_1_start + length < len(seq_1) + 1 and seq_2_start + length < len(seq_2) + 1):
                    score, mismatch, non_compared_length = my_score_match_new(seq_1, seq_2, seq_1_start, seq_2_start, length)
                    # Print details for score above score-limit >>
                    if (score >= score_limit):
                        score, mismatch, non_compared_length = my_score_match_new(seq_1, seq_2, seq_1_start, seq_2_start, length, show_details)
                        print(seq_1_start, seq_2_start, length, score)
                        if show_details: print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
                    if score > score_max:
                        score_max       = score
                        length_max      = length
                        seq_1_start_max = seq_1_start
                        seq_2_start_max = seq_2_start
                    elif score == score_max and length > length_max:
                        length_max      = length
                        seq_1_start_max = seq_1_start
                        seq_2_start_max = seq_2_start
                    elif score == score_max and length > length_max:
                        length_max      = length
                        seq_1_start_max = seq_1_start
                        seq_2_start_max = seq_2_start

    # Print details for max score and min mis-match >>
    print("                      Max score with: 1. min mis-match & 2. min non-compared length")
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    _ = my_score_match_new(seq_1, seq_2, seq_1_start_max, seq_2_start_max, length_max, show_details)
    print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

    return score_max

In [27]:
seq_1   = "XXXABCDEFFGHG"
seq_2   = "YYABCDEFFGHR"
min_len = min([len(seq_1), len(seq_2)])
my_try_all_matches_new( seq_1, seq_2,
                        score_limit  = min_len - 3,
                        show_details = True
                      )

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Seq 1      : [37mX[0m[31mX[0m[31mX[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[32mF[0m[32mG[0m[32mH[0m[37mG[0m
Seq 2      : [37m [0m[31mY[0m[31mY[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[32mF[0m[32mG[0m[32mH[0m[37mR[0m
Match              : [32m9[0m
Mis-match          : [31m2[0m
Comparision length : 11
Match probability  : 0.7200000000    [ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
Non compared length: 3
1 0 11 9
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Seq 1      : [37mX[0m[31mX[0m[31mX[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[32mF[0m[32mG[0m[32mH[0m[31mG[0m[37m[0m
Seq 2      : [37m [0m[31mY[0m[31mY[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[32mF[0m[3

9

In [31]:
def my_score_match( seq_1,
                    seq_2,
                    seq_1_start  = 0,
                    seq_2_start  = 0,
                    length       = None,
                    show_details = False
                  ):
        
    # defult length is of smaller length sequence >>
    if length is None: length = min( [len(seq_1) - seq_1_start, len(seq_2) - seq_1_start] )
    
    score    = 0
    mismatch = 0
    if show_details:
        color_def = "white"
        out_seq_1 = colored( "".join( [" "]*( (seq_2_start - seq_1_start) if seq_2_start > seq_1_start else 0 ) ) + seq_1[0 : seq_1_start], color_def )
        out_seq_2 = colored( "".join( [" "]*( (seq_1_start - seq_2_start) if seq_1_start > seq_2_start else 0 ) ) + seq_2[0 : seq_2_start], color_def )
        for i in range(length):
            if seq_1[seq_1_start + i] == seq_2[seq_2_start + i]:
                score    = score + 1
                color    = "green" # >> green color for match
            else:
                mismatch = mismatch + 1
                color    = "red"   # >> red color for mis-match
            out_seq_1 = out_seq_1 + colored( seq_1[seq_1_start + i], color)
            out_seq_2 = out_seq_2 + colored( seq_2[seq_2_start + i], color)
        out_seq_1 = out_seq_1 + colored( seq_1[seq_1_start + length : ], color_def )
        out_seq_2 = out_seq_2 + colored( seq_2[seq_2_start + length : ], color_def )
        print( "Seq 1      : " + out_seq_1)
        print( "Seq 2      : " + out_seq_2)
        print( "Match              : " + colored(str(score), "green") )
        print( "Mis-match          : " + colored(str(mismatch), "red") )
        print( "Comparision length : " + str(length) )
        print( "Match probability  : " + str( 2*score / len(seq_1) + len(seq_2) ) )
        print( "[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]" )
    else:
        for i in range(length):
            if seq_1[seq_1_start + i] == seq_2[seq_2_start + i]: score = score + 1
    
    return score

In [39]:
my_score_match( seq_1        = "ABCDEFGH",
                seq_2        = "ABCDEF",
                show_details = True )

Seq 1      : [37m[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[37mGH[0m
Seq 2      : [37m[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[37m[0m
Match              : [32m6[0m
Mis-match          : [31m0[0m
Comparision length : 6
Match probability  : 7.5
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]


6

In [32]:
def my_try_all_matches(seq_1, seq_2, score_limit = 0, show_details = False):
    for seq_1_start in range(0, len(seq_1)):
        for seq_2_start in range(0, len(seq_2)):
            for length in range(1, len(seq_2)+1):
                if (seq_1_start + length < len(seq_1) + 1 and seq_2_start + length < len(seq_2) + 1):
                    score = my_score_match(seq_1, seq_2, seq_1_start, seq_2_start, length)                    
                    # only print a line of output if the score is better than scare limit
                    if (score >= score_limit):
                        score = my_score_match(seq_1, seq_2, seq_1_start, seq_2_start, length, show_details)
                        print(seq_1_start, seq_2_start, length, score)
                        if show_details: print("-----------------------------------------------------------------------------------") 

In [10]:
for i in gpcr.GPCR_DF.index[ gpcr.GPCR_DF["seq_len"] == 15 ]:
    print(i)

5


In [11]:
gpcr.GPCR_DF[ gpcr.GPCR_DF.index == 5 ]

Unnamed: 0,gene,class,name,seq_len,seq
5,ADGRG3,AD,Adhesion G protein-coupled receptor G3,15,SSTARLDQAHSASQE


In [33]:
# my_try_all_matches(one_sequence, another_sequence, score_limit = 10, show_details = True)
seq_1   = "XXXABCDEFG"
seq_2   = "YYABCDEFR"
min_len = min([len(seq_1), len(seq_2)])
# my_try_all_matches(seq_1, seq_2, score_limit = min_len - 3,  show_details = True)
my_try_all_matches(seq_1, seq_2, show_details = True) 

Seq 1      : [37m[0m[31mX[0m[37mXXABCDEFG[0m
Seq 2      : [37m[0m[31mY[0m[37mYABCDEFR[0m
Match              : [32m0[0m
Mis-match          : [31m1[0m
Comparision length : 1
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
0 0 1 0
-----------------------------------------------------------------------------------
Seq 1      : [37m[0m[31mX[0m[31mX[0m[37mXABCDEFG[0m
Seq 2      : [37m[0m[31mY[0m[31mY[0m[37mABCDEFR[0m
Match              : [32m0[0m
Mis-match          : [31m2[0m
Comparision length : 2
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
0 0 2 0
-----------------------------------------------------------------------------------
Seq 1      : [37m[0m[31mX[0m[31mX[0m[31mX[0m[37mABCDEFG[0m
Seq 2      : [37m[0m[31mY[0m[31mY[0m[31mA[0m[37mBCDEFR[0m
Match              : [32m0[0m
Mis-match          : [31m3[0m
Comparision length : 3
Match probability  :

Mis-match          : [31m4[0m
Comparision length : 4
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
3 0 4 0
-----------------------------------------------------------------------------------
Seq 1      : [37mXXX[0m[31mA[0m[31mB[0m[31mC[0m[31mD[0m[31mE[0m[37mFG[0m
Seq 2      : [37m   [0m[31mY[0m[31mY[0m[31mA[0m[31mB[0m[31mC[0m[37mDEFR[0m
Match              : [32m0[0m
Mis-match          : [31m5[0m
Comparision length : 5
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
3 0 5 0
-----------------------------------------------------------------------------------
Seq 1      : [37mXXX[0m[31mA[0m[31mB[0m[31mC[0m[31mD[0m[31mE[0m[31mF[0m[37mG[0m
Seq 2      : [37m   [0m[31mY[0m[31mY[0m[31mA[0m[31mB[0m[31mC[0m[31mD[0m[37mEFR[0m
Match              : [32m0[0m
Mis-match          : [31m6[0m
Comparision length : 6
Match probability  : 9.0
[ Match probab

7 7 2 0
-----------------------------------------------------------------------------------
Seq 1      : [37m XXXABCD[0m[31mE[0m[37mFG[0m
Seq 2      : [37mYYABCDEF[0m[31mR[0m[37m[0m
Match              : [32m0[0m
Mis-match          : [31m1[0m
Comparision length : 1
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
7 8 1 0
-----------------------------------------------------------------------------------
Seq 1      : [37mXXXABCDE[0m[31mF[0m[37mG[0m
Seq 2      : [37m        [0m[31mY[0m[37mYABCDEFR[0m
Match              : [32m0[0m
Mis-match          : [31m1[0m
Comparision length : 1
Match probability  : 9.0
[ Match probability = 2 * Match / ( len(seq_1) + len(seq_2) ) ]
8 0 1 0
-----------------------------------------------------------------------------------
Seq 1      : [37mXXXABCDE[0m[31mF[0m[31mG[0m[37m[0m
Seq 2      : [37m        [0m[31mY[0m[31mY[0m[37mABCDEFR[0m
Match              : [32m0[0m
Mi

In [231]:
seq_1 = "XXXABCDEFCCCXX"
seq_2 = "YABCDEFYCCYYYYYYY"

# <span style="color:orange"> Score Match </span>

In [107]:
# my_score_match( seq_1, seq_2, seq_1_start = 0, seq_2_start = 0, length = None, show_details = True ); print()
my_score_match( seq_1, seq_2, seq_1_start = 3, seq_2_start = 1, length = None, show_details = True ); print()
# my_score_match( seq_1, seq_2, seq_1_start = 5, seq_2_start = 3, length = None, show_details = True ); print()

Seq 1      : [37mXXX[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[31mC[0m[32mC[0m[32mC[0m[31mX[0m[31mX[0m[37m[0m
Seq 2      : [37m  Y[0m[32mA[0m[32mB[0m[32mC[0m[32mD[0m[32mE[0m[32mF[0m[31mY[0m[32mC[0m[32mC[0m[31mY[0m[31mY[0m[37mYYYYY[0m
Score      : [34m8[0m



# <span style="color:orange"> All score match </span>

In [114]:
seq_1 = "QDFRERLIHALPASLERALTEDSTQTSDTATNSTLPSAEVEL"
seq_2 = "QDFRERLIHSLPTSLERALSEDSAPTNDTAANSASPPAETEL"
print(seq_1, len(seq_1))
print(seq_2, len(seq_2)) 

QDFRERLIHALPASLERALTEDSTQTSDTATNSTLPSAEVEL 42
QDFRERLIHSLPTSLERALSEDSAPTNDTAANSASPPAETEL 42


In [115]:
my_try_all_matches(seq_1, seq_2, score_limit = 30, show_details = True)

Seq 1      : [37m[0m[32mQ[0m[32mD[0m[32mF[0m[32mR[0m[32mE[0m[32mR[0m[32mL[0m[32mI[0m[32mH[0m[31mA[0m[32mL[0m[32mP[0m[31mA[0m[32mS[0m[32mL[0m[32mE[0m[32mR[0m[32mA[0m[32mL[0m[31mT[0m[32mE[0m[32mD[0m[32mS[0m[31mT[0m[31mQ[0m[32mT[0m[31mS[0m[32mD[0m[32mT[0m[32mA[0m[31mT[0m[32mN[0m[32mS[0m[31mT[0m[31mL[0m[32mP[0m[31mS[0m[32mA[0m[32mE[0m[31mV[0m[32mE[0m[37mL[0m
Seq 2      : [37m[0m[32mQ[0m[32mD[0m[32mF[0m[32mR[0m[32mE[0m[32mR[0m[32mL[0m[32mI[0m[32mH[0m[31mS[0m[32mL[0m[32mP[0m[31mT[0m[32mS[0m[32mL[0m[32mE[0m[32mR[0m[32mA[0m[32mL[0m[31mS[0m[32mE[0m[32mD[0m[32mS[0m[31mA[0m[31mP[0m[32mT[0m[31mN[0m[32mD[0m[32mT[0m[32mA[0m[31mA[0m[32mN[0m[32mS[0m[31mA[0m[31mS[0m[32mP[0m[31mP[0m[32mA[0m[32mE[0m[31mT[0m[32mE[0m[37mL[0m
Score      : [34m30[0m
0 0 41 30
-------------------------------------------------------------------------------

In [116]:
my_try_all_matches(seq_1, seq_2, score_limit = 31, show_details = True) 

Seq 1      : [37m[0m[32mQ[0m[32mD[0m[32mF[0m[32mR[0m[32mE[0m[32mR[0m[32mL[0m[32mI[0m[32mH[0m[31mA[0m[32mL[0m[32mP[0m[31mA[0m[32mS[0m[32mL[0m[32mE[0m[32mR[0m[32mA[0m[32mL[0m[31mT[0m[32mE[0m[32mD[0m[32mS[0m[31mT[0m[31mQ[0m[32mT[0m[31mS[0m[32mD[0m[32mT[0m[32mA[0m[31mT[0m[32mN[0m[32mS[0m[31mT[0m[31mL[0m[32mP[0m[31mS[0m[32mA[0m[32mE[0m[31mV[0m[32mE[0m[32mL[0m[37m[0m
Seq 2      : [37m[0m[32mQ[0m[32mD[0m[32mF[0m[32mR[0m[32mE[0m[32mR[0m[32mL[0m[32mI[0m[32mH[0m[31mS[0m[32mL[0m[32mP[0m[31mT[0m[32mS[0m[32mL[0m[32mE[0m[32mR[0m[32mA[0m[32mL[0m[31mS[0m[32mE[0m[32mD[0m[32mS[0m[31mA[0m[31mP[0m[32mT[0m[31mN[0m[32mD[0m[32mT[0m[32mA[0m[31mA[0m[32mN[0m[32mS[0m[31mA[0m[31mS[0m[32mP[0m[31mP[0m[32mA[0m[32mE[0m[31mT[0m[32mE[0m[32mL[0m[37m[0m
Score      : [34m31[0m
0 0 42 31
-------------------------------------------------------------

In [1]:
pip install notebook-as-pdf

Collecting notebook-as-pdf
  Downloading notebook_as_pdf-0.4.0-py3-none-any.whl (6.2 kB)
Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
Collecting pyppeteer
  Downloading pyppeteer-0.2.5-py3-none-any.whl (87 kB)
Collecting pyee<9.0.0,>=8.1.0
  Downloading pyee-8.1.0-py2.py3-none-any.whl (12 kB)
Collecting appdirs<2.0.0,>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting websockets<9.0,>=8.1
  Downloading websockets-8.1-cp38-cp38-win_amd64.whl (66 kB)
Building wheels for collected packages: PyPDF2

  Building wheel for PyPDF2 (setup.py): started
  Building wheel for PyPDF2 (setup.py): finished with status 'done'
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61087 sha256=f8446a537b6ccefa54f0bb5a9ea945f9d2f445cdb8db855985544e0f7863f15b
  Stored in directory: c:\users\shashank\appdata\local\pip\cache\wheels\b1\1a\8f\a4c34be976825a2f7948d0fa40907598d69834f8ab5889de11
Successfully built PyPDF2
Installing collected packages: Py

In [3]:
!jupyter-nbconvert --to PDFviaHTML hydrophobicity_in_sequences.ipynb

Traceback (most recent call last):
  File "C:\Users\Shashank\anaconda3\lib\site-packages\traitlets\traitlets.py", line 535, in get
    value = obj._trait_values[self.name]
KeyError: 'template_paths'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Shashank\anaconda3\Scripts\jupyter-nbconvert-script.py", line 10, in <module>
    sys.exit(main())
  File "C:\Users\Shashank\anaconda3\lib\site-packages\jupyter_core\application.py", line 270, in launch_instance
    return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
  File "C:\Users\Shashank\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\Shashank\anaconda3\lib\site-packages\nbconvert\nbconvertapp.py", line 350, in start
    self.convert_notebooks()
  File "C:\Users\Shashank\anaconda3\lib\site-packages\nbconvert\nbconvertapp.py", line 518, in convert_notebooks
    cls = get_export

In [4]:
pyppeteer-install

NameError: name 'pyppeteer' is not defined