In [1]:
from Bio import Align
import csv
import itertools
import pandas as pd
import numpy as np

#### Psuedo Code:

- read csv column containing sequences as a list of strings
- use single for loop with `itertools.combinations(<listname>, 2)` and `Align.PairwiseAligner()` to get new list of alignement scores.

- Issues to solve:
    - Limit each sequence pair to 1 alignment (sometimes it likes to give 2)
    - Ensure scoring params are same as blastn (https://www.ncbi.nlm.nih.gov/books/NBK279684/#_appendices_BLAST_Substitution_Matrices_)


In [2]:
with open('Alignment_RM_Test.csv', 'r') as csv_file:
    lines = csv_file.readlines()
    
headers = lines[0].rstrip().split(',') # rstrip removes end-of-line chars
numLines = len(lines)

# create lineList to include only numLines elements
linelist = [x.rstrip().split(',')
           for x in lines[1:numLines+1]] 

# list comprehension within dictionary comprehension to split each element by its headers and create dictionary of lists
outputDict = {keyVal:[x[idx] 
                       for x in linelist 
                       if len(x) == len(headers)] 
              for idx, keyVal in enumerate (headers)} 



In [3]:
outputDict # print dict to make sure everything worked

{'x': ['1', '2', '3', '4', '5', '6', '7'],
 'y': ['atgcatgc',
  'attgctgc',
  'tttgccgg',
  'atgctttt',
  'cgggctaa',
  'tgtgtgta',
  'tagggccc'],
 '': ['', '', '', '', '', '', '']}

In [4]:
# convert our desired column(s) from dict values into list
seq_list = list(outputDict.get('y'))
print(seq_list)

['atgcatgc', 'attgctgc', 'tttgccgg', 'atgctttt', 'cgggctaa', 'tgtgtgta', 'tagggccc']


In [5]:
pair_list = []

for a, b in itertools.combinations(seq_list, 2): 
    pair_list.append([a, b])
    
print(pair_list)


[['atgcatgc', 'attgctgc'], ['atgcatgc', 'tttgccgg'], ['atgcatgc', 'atgctttt'], ['atgcatgc', 'cgggctaa'], ['atgcatgc', 'tgtgtgta'], ['atgcatgc', 'tagggccc'], ['attgctgc', 'tttgccgg'], ['attgctgc', 'atgctttt'], ['attgctgc', 'cgggctaa'], ['attgctgc', 'tgtgtgta'], ['attgctgc', 'tagggccc'], ['tttgccgg', 'atgctttt'], ['tttgccgg', 'cgggctaa'], ['tttgccgg', 'tgtgtgta'], ['tttgccgg', 'tagggccc'], ['atgctttt', 'cgggctaa'], ['atgctttt', 'tgtgtgta'], ['atgctttt', 'tagggccc'], ['cgggctaa', 'tgtgtgta'], ['cgggctaa', 'tagggccc'], ['tgtgtgta', 'tagggccc']]


In [6]:
aligner = Align.PairwiseAligner()
aligner.mode = 'local'

alignments_list = []

for pair in pair_list:
    alignments_list.append(aligner.align(pair[0], pair[1]))
    
alignments_list # check to make sure correct number of alignments generated

[<Bio.Align.PairwiseAlignments at 0x7fed0745a190>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b820>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b7c0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b4c0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b6d0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b130>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b520>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b1f0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b3a0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b4f0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b730>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b250>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b2e0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b040>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b0a0>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b310>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b220>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b610>,
 <Bio.Align.PairwiseAlignments at 0x7fed0842b8b0>,
 <Bio.Align.PairwiseAlignments 

In [77]:
'''play code (not necessary for running entire notebook)'''

def avg_score(alignments_list):
    
    # Finding length of the list
    length = len(alignments_list) 
    i = 0
    y = []
    
    # While Loop to iterate through list
    while i < length:
        x = []
        for alignment in test_alignment:
            x.append(alignment.score)
        y.append(np.mean(x))
        i += 1
    return y

'''this returns the correct number of scores, but the values are wrong.
  averaged values should not all be four.  this is obvious when you print the scores
  for the first alignment only, which returns two alignment scores, both of 7.'''
avg_score(alignments_list)  

[4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0,
 4.0]

In [78]:
'''Need to figure out how to get one score value back for every alignment'''




score_list = []
avg_score_list = []

for alignments in alignments_list:
    for alignment in alignments:
        score_list.append(alignment.score)
        avg_score_list = np.mean(score_list)
    #avg_score_list.append(np.mean(score_list))
    

print(score_list)
#print(avg_score_list)

[7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0,

In [79]:
pair_list
print(len(pair_list))

21


In [81]:
'''waaaaaaaaay too big.  we need to get this value down to 21'''


score_list
print(len(score_list))

1173


In [82]:
alignment_df = pd.DataFrame(pair_list)
alignment_df

Unnamed: 0,0,1
0,atgcatgc,attgctgc
1,atgcatgc,tttgccgg
2,atgcatgc,atgctttt
3,atgcatgc,cgggctaa
4,atgcatgc,tgtgtgta
5,atgcatgc,tagggccc
6,attgctgc,tttgccgg
7,attgctgc,atgctttt
8,attgctgc,cgggctaa
9,attgctgc,tgtgtgta
