<a href="https://colab.research.google.com/github/Nubiancodingdelight/ACS-Project-Repository-/blob/main/Generate_Matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebooks generates matrices as numpy arrays.
Created By: Lorrayya Williams
Updated On: 4/13/2025

In [1]:
#mounts google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
#set path
%cd /content/drive/MyDrive/ACS_Research/VISDB_Data/

/content/drive/MyDrive/ACS_Research/VISDB_Data


INSTALL

In [3]:
!pip install pysam

Collecting pysam
  Downloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.6 kB)
Downloading pysam-0.23.0-cp311-cp311-manylinux_2_28_x86_64.whl (26.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.0/26.0 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pysam
Successfully installed pysam-0.23.0


#Imports

In [4]:
#imports
import numpy as np
import subprocess
import shlex
import pysam
import re
import tensorflow as tf
import re
import math
import statistics as stat


# Functions




## Generate Cigar

In [5]:
def generate_cigar(ref, seq):
    cigar = []
    count = 0
    op = ''

    for r, s in zip(ref, seq):
        # Determine operation
        if r == s and r != '-':
            current_op = 'M'
        elif r != s and r != '-' and s != '-':
            current_op = 'X'
        elif r == '-' and s != '-':
            current_op = 'I'
        elif s == '-' and r != '-':
            current_op = 'D'
        else:
            continue  # skip if both are gaps

        # Group by same operation
        if current_op == op:
            count += 1
        else:
            if op:
                cigar.append(f"{count}{op}")
            op = current_op
            count = 1

    if op:
        cigar.append(f"{count}{op}")

    return ''.join(cigar)

# Example usage (aligned input)
#ref = "ACCGT-A"
#seq = "AC-GTGA"
#print(generate_cigar(ref, seq))  # Output: 2=1D2=1I1=


In [6]:
def reverse_complement(seq):
    complement = str.maketrans('ACGTNacgtn', 'TGCANtgcan')
    return seq.translate(complement)[::-1]

## Create Bam from Sequence

In [7]:
import pysam
from pysam import AlignmentHeader, AlignedSegment
def create_bam_from_sequence(output_bam_path, dna_sequence,ref_seq, reference_name, start_pos):
  """
  Creates a BAM file from a DNA sequence and its coordinates.

  Args:
      output_bam_path (str): Path to save the BAM file (e.g., "output.bam").
      dna_sequence (str): DNA sequence (e.g., "ATCGATCG").
      reference_name (str): Reference sequence name (e.g., "chr1").
      start_pos (int): 1-based start position on the reference.
  """
  # 1. Create a BAM header
  header = AlignmentHeader.from_references(
      [reference_name],  # List of reference names
      [1]  # Lengths of references
      )

  # 2. Open a BAM file for writing
  with pysam.AlignmentFile(output_bam_path, "wb", header=header) as bam_file:
    # 3. Create an aligned segment (read)
    read = AlignedSegment(header)
    read.query_name = "read1"  # Read ID
    read.query_sequence = dna_sequence  # DNA sequence
    read.flag = 0  # No flags set (0 means mapped)
    read.reference_id = 0  # Index of reference in header (0 = first one)
    read.reference_start = start_pos - 1  # 0-based position
    read.mapping_quality = 60  # High mapping quality
    read.cigarstring = generate_cigar(dna_sequence,ref_seq)  # CIGAR string (exact match)

    # 4. Write the read to the BAM file
    bam_file.write(read)

     # Reverse strand read
    read_rev = AlignedSegment(header)
    read_rev.query_name = "read2"
    rev_seq = reverse_complement(dna_sequence)
    rev_ref = reverse_complement(ref_seq)
    read_rev.query_sequence = rev_seq
    read_rev.flag = 16  # Reverse strand flag
    read_rev.reference_id = 0
    read_rev.reference_start = start_pos - 1
    read_rev.mapping_quality = 60
    read_rev.cigarstring = generate_cigar(rev_seq,rev_ref)
    bam_file.write(read_rev)

    return read, read_rev


Generate SAM file

#Matrix Generator 1 -- DeepHBV


In [8]:
def extract_virus(samfile,start, end):
  for line in samfile:
    line_list = str(line).split("\t")
    CIGAR = line_list[5]
    SEQ = line_list[9]
    POS = line_list[3]
    SEQ = line_list[9]
    if int(POS) in range(start,end):
      POS_SEQ.append(SEQ)
    else:
      NEG_SEQ.append(SEQ)
  return POS_SEQ, NEG_SEQ


In [9]:
import numpy as np
import os
from scipy.io import loadmat
# from array import array
# from util import seq_matrix

def matrix_generator_uno(seq_list, dim, isvirus=True):  # One Hot Encoding
    tensor = np.zeros((len(seq_list), dim, 4))
    if isvirus:
      label = np.ones(len(seq_list))
    else:
      label = np.zeros(len(seq_list))
    for i in range(len(seq_list)):
        seq = seq_list[i]
        j = 0
        for s in seq:
            if s == 'A' or s == 'a':
                tensor[i][j] = [1, 0, 0, 0]
            if s == 'T' or s == 't':
                tensor[i][j] = [0, 1, 0, 0]
            if s == 'C' or s == 'c':
                tensor[i][j] = [0, 0, 1, 0]
            if s == 'G' or s == 'g':
                tensor[i][j] = [0, 0, 0, 1]
            if s == 'N':
                tensor[i][j] = [0, 0, 0, 0]
            j += 1
    return tensor, label

# Matrix Generator 2 -- Novel Matrix

In [12]:
#matrix row generator from CIGAR, SEQuence, Position of Insert
def matrix_row(CIG, SEQ, POS, start, end):
  matrix_list= []
  label_list =[]
  bases = ['A','T','C','G']
  seq_list= list(SEQ)
  cigar_list = re.split(r'(\d+)', CIG)[1::]
  tensor= np.zeros((1,2000, 4))
  if int(POS) in range(start,end):
    label_list.append(1)
  else:
    label_list.append(0)

  #print(len(seq_list))
  #FIX LOOP
  if len(cigar_list) ==2:
    #if it is a matched a sequence of 1's based on the base are added
    if cigar_list[1] == 'M':
      i= 0
      for j in range(int(cigar_list[0])):
        num_list = [0,0,0,0]
        try:
          num_list[bases.index(seq_list[j])] = 1
          #print(num_list)
          tensor[i][j]= num_list
          #print(tensor[i][j])
        except ValueError:
          tensor[i][j]= num_list


    else:
      for j in range(len(seq_list)):
        num_list = [0,0,0,0]

#  return tensor, label_list

  else:
    length = 0
    i = 0
    for time in range(int(len(cigar_list)/2)):
      if cigar_list[(2*time)+1] == 'M':
        for j in range(length,length + int(cigar_list[(2*time)])):
          num_list = [0,0,0,0]
          try:
            num_list[bases.index(seq_list[j])] = 1
            tensor[i][j]= num_list
          except ValueError:
            tensor[i][j]= num_list
        length += int(cigar_list[(2*time)])
  return tensor, label_list


In [13]:
from os.path import samefile
def matrix_generator_dos (read1,read2, start, end, Matrix_Name ='Matrix_data', Label_Name='Label_data', save_file_path=os.getcwd()):
  final_matrix = []
  label_list = []
  reads =[read1, read2]
  for read in reads:
    CIGAR = read.cigarstring
    SEQ = read.query_sequence
    POS = read.reference_start
  final_matrix_temp, label_list_temp = matrix_row(CIGAR, SEQ, POS, start,end)
  if len(final_matrix) == 0:
    final_matrix = final_matrix_temp
  else:
    final_matrix = np.concatenate([final_matrix, final_matrix_temp])

  label_list += label_list_temp

  #saves matrix to specified path
  os.chdir(save_file_path)
  Data = np.asarray(final_matrix)
  Label = np.asarray(label_list)
  return Data, Label

# Matrix Generator 3 -- 3D Matrix

> Add blockquote



In [14]:
def generate_mapping_list(read1,read2, int_start, int_end, direction):
  reads = [read1, read2]
  for read in reads:
    CIGAR = re.split(r'(\d+)', read.cigarstring)[1::]
    SEQ =  read.query_sequence
    POS = int(read.reference_start)
    DIR = 0 if direction == '-' or direction == 0 else 1
    mapping_list = []
    COUNT= 0
    if len(CIGAR) ==2:
    #if it is a matched a sequence of 1's based on the base are added
      if CIGAR[1] == 'M':
        mat = 1
        for i in range(len(SEQ)):
          mapping_list.append([SEQ[i], mat, int(POS) + i ,DIR, COUNT])
          if dir ==0:
            POS +=1
          else: POS -=1
      else:
        mat=0
        for i in range(len(SEQ)):
          mapping_list.append([SEQ[i], mat, int(POS) + i ,DIR, COUNT])
          if dir ==0:
            POS +=1
          else: POS -=1
    else:
      length = 0
      i = 0
      for time in range(int(len(CIGAR)/2)):
        if CIGAR[(2*time)+1] == 'M':
          mat =1
          for j in range(length,length + int(CIGAR[(2*time)])):
            mapping_list.append([SEQ[i], mat, int(POS) + i ,DIR, COUNT])
            if dir ==0:
              POS +=1
            else: POS -=1
        else:
          mat =0
          for j in range(length,length + int(CIGAR[(2*time)])):
            mapping_list.append([SEQ[i], mat, int(POS) + i ,DIR, COUNT])
            if dir ==0:
              POS +=1
            else: POS -=1
        COUNT += 1

  #order a list lists
  df = pd.DataFrame(mapping_list, columns = ['Base', 'Match', 'Position', 'Direction', 'Row_count'])
  df = df.sort_values(by=['Position'])
  #df = df.reset_index(drop=True)
  df = df.tail(-1).drop_duplicates()
  df = df.reset_index(drop=True)
  three_five = df[df['Direction'] ==0]
  five_three = df[df['Direction'] ==1]
  return df, three_five, five_three

In [20]:
def matrix_generator_tres(read1, read2, int_start, int_end, direction, Matrix_Name ='Matrix_data', Label_Name='Label_data', save_file_path=os.getcwd()):
  df, three_five, five_three = generate_mapping_list(read1,read2, int_start, int_end, direction)
  #create tensor
  tensor = np.zeros((len(df),2000,  4))
  label = np.zeros(len(df))
  label_list= [0] * len(df)

  col_count = 0
  curr_pos = 0
  end_pos = 0
  bases = ['A','T','C','G']
  match_dict ={'A':'T', 'T':'A', 'C':'G', 'G':'C'}
  for row in range(1,len(df)):
    curr_pos = df['Position'][row]

  #3' -->5' Direction
    if df['Direction'][row] == 0:
      #checks to see if it matches with alignment genome
      if three_five['Base'][row//2] in bases and three_five['Match'][row] == 1:
        tensor[row][col_count][bases.index(df['Base'][row].upper())] = 1

        #checks if the bases match with each other
        #print(df[df['Position']== df['Position'][row]].Base.values[0])
        if str(five_three[five_three['Position']== three_five['Position'][row//2]].Base).upper() == match_dict[str(three_five['Base'][row//2]).upper()]:
          tensor[row][col_count][bases.index(df['Base'][row//2].upper())] =3

      #checks if it matches with each other but not with alignment genome
      elif  str(five_three[five_three['Position']== three_five['Position'][row//2]].Base).upper() == match_dict[str(three_five['Base'][row//2]).upper()]:
          tensor[row][col_count][bases.index(df['Base'][row//2].upper())]=2

  #5'-->3' Direction
    else:
      #checks to see if it matches with alignment genome
      if five_three['Base'][row] in bases and five_three['Match'][row//2] == 1:
        tensor[row][col_count][bases.index(five_three['Base'][row//2].upper())] = 1

        #checks if the bases match with each other
        #print( match_dict[df['Base'][row]])

        #print(df[df['Position']== df['Position'][row]].Base.item())
        if three_five[three_five['Position']== five_three['Position'][row//2]].Base.upper() == match_dict[five_three['Base'][row//2].upper()]:
          tensor[row][col_count][bases.index(df['Base'][row//2].upper())] =3

      #checks if it matches with each other but not with alignment genome
      elif three_five[three_five['Position']== five_three['Position'][row//2]].Base.upper() == match_dict[five_three['Base'][row//2].upper()]:
          tensor[row][col_count][bases.index(df['Base'][row//2].upper())]=2
  if curr_pos in range(int_start,int_end):
    label[row] =1
  else:
    label[row] =0
  return tensor, label
'''
    if curr_pos == end_pos:
      if int(curr_pos) in range(int_start,int_end):
        label_list.append(1)
      else:
        label_list.append(0)
      end_pos +=75
      col_count +=1
      if col_count == 150:
        col_count =0

    elif end_pos == 0:
      end_pos= curr_pos +75
      if int(curr_pos) in range(int_start,int_end):
        label_list.append(1)
      else:
        label_list.append(0)
 '''





'\n    if curr_pos == end_pos:\n      if int(curr_pos) in range(int_start,int_end):\n        label_list.append(1)\n      else:\n        label_list.append(0)\n      end_pos +=75\n      col_count +=1\n      if col_count == 150:\n        col_count =0\n\n    elif end_pos == 0:\n      end_pos= curr_pos +75\n      if int(curr_pos) in range(int_start,int_end):\n        label_list.append(1)\n      else:\n        label_list.append(0)\n '

# HBV

## Generate Bam File

In [None]:
#load hbv data
import pandas as pd
hbv_data = pd.read_csv('Spliced_Data_HBV.csv')
hbv_data.head()

Unnamed: 0.1,Unnamed: 0,virus,virus_ref,begin_ref,stop_ref,human_ref,begin_breakpoint,stop_breakpoint,spliced_seq,viral_seq,human_ref_sequence,human_seq_upstream,human_seq_downstream
0,0,HBV,X70185.1,420,437,GRCh37/hg19,1720,1603,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,,,,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1,1,HBV,X70185.1,97732036,97732056,GRCh37/hg19,1246,1390,TACTTTTAAAGTCACATGAATTTAGACAAATAGTACTACACAATAA...,TTTGTGGCTCCTCTGCCGATCCATACTGCGGAACTCCTAGCCGCTT...,TACTTTTAAAGTCACATGAATTTAGACAAATAGTACTACACAATAA...,TACTTTTAAAGTCACATGAATTTAGACAAATAGTACTACACAATAA...,gtaatttgtaatgaacagatattcatttCtttttttattattatta...
2,2,HBV,X70185.1,194161891,194161910,GRCh37/hg19,1603,1664,GGTTTGTGGTTGTTTCAACTTAGATTGACAGTTGTATACtgtgtgt...,GTTGCATGGAGACCACCGTGAACGCCCATCAGATCCTGCCCAAGGT...,GGTTTGTGGTTGTTTCAACTTAGATTGACAGTTGTATACtgtgtgt...,GGTTTGTGGTTGTTTCAACTTAGATTGACAGTTGTATACtgtgtgt...,AGTGTTTGCTTTTTAAAATGTTAAGATTAAATCATATTGTTACttt...
3,3,HBV,X70185.1,171450967,171450816,GRCh37/hg19,1623,1717,ctagtaagccagtgtttcctatgttgggtaaacacatttgcagtgg...,AACGCCCATCAGATCCTGCCCAAGGTCTTACATAAGAGGACTCTTG...,cagggtggagtgtagtggcatgatcatagatcactgcaacctcaaa...,ctagtaagccagtgtttcctatgttgggtaaacacatttgcagtgg...,cagggtggagtgtagtggcatgatcatagatcactgcaacctcaaa...
4,4,HBV,X70185.1,171450967,171450816,GRCh37/hg19,1638,1717,ctagtaagccagtgtttcctatgttgggtaaacacatttgcagtgg...,CTGCCCAAGGTCTTACATAAGAGGACTCTTGGACTCTCAGCAATGT...,accataaggttcatgagggcagggcagggatcatgtacattctctt...,ctagtaagccagtgtttcctatgttgggtaaacacatttgcagtgg...,accataaggttcatgagggcagggcagggatcatgtacattctctt...


In [None]:
# prompt: remove first row from hbv_Data
hbv_data = hbv_data.iloc[1:]
hbv_data = hbv_data[hbv_data['spliced_seq'].str.len() == 2000]

In [None]:
# CHECK WORK
def insertion_span(vir_start, vir_end, ref_start, ref_end):
  import math
  diff = abs(vir_start- vir_end)
  dir =  0 if vir_start < vir_end else 1
  return min(ref_start, ref_end), min(ref_start, ref_end) -diff, dir

In [21]:
# @title Default title text
##TROUBLESHOOT####################################################
for index, row in hbv_data.iterrows():
  try:
    bam_file_path = '/content/drive/MyDrive/ACS_Research/VISDB_Data/Aligned_Sequences/'  +str(row['virus'])+str(index)+'.bam' # Changed to .bam
    read1, read2 = create_bam_from_sequence(bam_file_path,row['spliced_seq'], row['human_ref_sequence'], 'hg19', row['begin_breakpoint'])

    start,end,direction = insertion_span(row['begin_ref'], row['stop_ref'], row['begin_breakpoint'], row['stop_breakpoint'])

    #generate matrix 1
    Data1, Label1=matrix_generator_uno(row['viral_seq'], len(str(row['viral_seq'])))

    #save matrix 1 output
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_1/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data1)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_1/'+str(row['virus'])+str(index)+'_Label_data.npy', Label1)

  #generate matrix 2
    Data2,Label2 =matrix_generator_dos (read1, read2, start, end, Matrix_Name ='HBV_Matrix_2_data', Label_Name='HBV_Label_2_data', save_file_path=os.getcwd())

  #save generated data
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_2/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data2)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_2/'+str(row['virus'])+str(index)+'_Label_data.npy', Label2)


  #generate matrix 3
    Data3,Label3 =matrix_generator_tres(read1,read2, start, end, direction, Matrix_Name ='Matrix_data', Label_Name='Label_data', save_file_path=os.getcwd())

    #save matrix
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_3/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data3)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HBV/Experiment_3/'+str(row['virus'])+str(index)+'_Label_data.npy', Label3)
  except:
    print(row['viral_seq'])
    pass

NameError: name 'hbv_data' is not defined

# HPV

In [None]:
#load hbv data
import pandas as pd
hpv_data = pd.read_csv('Spliced_Data_HPV.csv')
hpv_data = hpv_data[hpv_data['spliced_seq'].str.len() == 2000]
hpv_data.head()

Unnamed: 0.1,Unnamed: 0,virus,virus_ref,begin_ref,stop_ref,human_ref,begin_breakpoint,stop_breakpoint,spliced_seq,viral_seq,human_ref_sequence,human_seq_upstream,human_seq_downstream
0,0,HPV,AF125673.1,28595279,28595577,GRCh37/hg19,2827,3062,catctttttAGACCTACGTGACCATATAGACTATTGGAAACACATG...,AGACCTACGTGACCATATAGACTATTGGAAACACATGCGCCTAGAA...,catctttttgtgtcagaaagtaaggaagtgctcaaagaacaacgcg...,catcttttt,gattgattaaggtcagagtcatcaaacgatgctaaggctggtgggc...
1,1,HPV,AF125673.1,28595311,28595730,GRCh37/hg19,2505,3062,gaggatgccaggatttgccactattgtttaacatcattctggaact...,GGATGTAAAGCATAGACCATTGGTACAACTAAAATGCCCTCCATTA...,gaggatgccaggatttgccactattgtttaacatcattctggaact...,gaggatgccaggatttgccactattgtttaacatcattctggaact...,ccaacctgacctggtgtcaaggttaacatcactaatggtgggacag...
2,2,HPV,AF125673.1,28595585,28595729,GRCh37/hg19,2173,2575,agacttggaactaacccaaatgtccatcagtgatagactgggttaa...,GGTGATTGGAAGCAAATTGTTATGTTTTTAAGGTATCAAGGTGTAG...,agacttggaactaacccaaatgtccatcagtgatagactgggttaa...,agacttggaactaacccaaatgtccatcagtgatagactgggttaa...,accaacctgacctggtgtcaaggttaacatcactaatggtgggaca...
3,3,HPV,AF125673.1,28595716,28596131,GRCh37/hg19,3151,3678,aatgccattctccttaaacaaatttataagaaaaaatcaaacaacc...,AAACTGGACACATATATATATTTGTGAAGAAGCATCAGTAACTGTG...,aatgccattctccttaaacaaatttataagaaaaaatcaaacaacc...,aatgccattctccttaaacaaatttataagaaaaaatcaaacaacc...,cctaatttggaggatttaatgatagtaacaaggag
4,4,HPV,AF125673.1,28595741,28595968,GRCh37/hg19,3693,4038,tggcgattcctcagggatctagaactagcaattccatttgacccag...,CTGCAGTGTCGTCTACATGGCATTGGACAGGACATAATGTAAAACA...,tggcgattcctcagggatctagaactagcaattccatttgacccag...,tggcgattcctcagggatctagaactagcaattccatttgacccag...,tgtttcctattcaaggaaactaatgaggcatgacaactcaacgcag...


In [None]:
##TROUBLESHOOT####################################################
for index, row in hpv_data.iterrows():
  try:
    bam_file_path = '/content/drive/MyDrive/ACS_Research/VISDB_Data/Aligned_Sequences/'  +str(row['virus'])+str(index)+'.bam' # Changed to .bam
    read1, read2 = create_bam_from_sequence(bam_file_path,row['spliced_seq'], row['human_ref_sequence'], 'hg19', row['begin_breakpoint'])

    start,end,direction = insertion_span(row['begin_ref'], row['stop_ref'], row['begin_breakpoint'], row['stop_breakpoint'])

    #generate matrix 1
    Data1, Label1=matrix_generator_uno(row['viral_seq'], len(str(row['viral_seq'])))

    #save matrix 1 output
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_1/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data1)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_1/'+str(row['virus'])+str(index)+'_Label_data.npy', Label1)

  #generate matrix 2
    Data2,Label2 =matrix_generator_dos (read1, read2, start, end, Matrix_Name ='HBV_Matrix_2_data', Label_Name='HBV_Label_2_data', save_file_path=os.getcwd())

  #save generated data
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_2/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data2)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_2/'+str(row['virus'])+str(index)+'_Label_data.npy', Label2)


  #generate matrix 3
    Data3,Label3 =matrix_generator_tres(read1,read2, start, end, direction, Matrix_Name ='Matrix_data', Label_Name='Label_data', save_file_path=os.getcwd())

    #save matrix
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_3/'+str(row['virus'])+str(index)+'_Matrix_data.npy', Data3)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/HPV/Experiment_3/'+str(row['virus'])+str(index)+'_Label_data.npy', Label3)
  except:
    print(row['viral_seq'])
    pass

TTGGCCAACCACTCCGCCGCGACCCATACCAAAGCCGTCGCCTTGGGCACCGAAGAAACACAGACGACTATCCAGCGACCAAGATCAGAGCCAGACACCGGAAACCCCTGCCACACCACTAAGTTGTTGCACAGAGACTCAGTGGACAGTGCTCCAATCCTCACTGCATTTAACAGCTCACACAAAGGACGGATTAACTGTAATAGTAACACTACACCCATAGTACATTTAAAAGGTGATGCTAATACTTTAAAATGTTTAAGATATAGATTTAAAAAGCATTGTACATTGTATACTGCAGTGTCGTCTACATGGCATTGGACAGGACAT
TGAGTTTCCATTTGACGAAAACGGAAATCCAGTGTATGAGCTTAATGATAAGAACTGGAAATCCTTTTTCTCAAGGACGTGGTCCAGATTAAGTTTGCACGAGGACGAGGACAAGGAAAACGATGGAGACTCTTTGCCAACGTTTAAATGTGTGTCAGGACAAAATACTAACACATTATG
GCAGTACAAATATGTCATTATGTGCTGCCATATCTACTTCAGAAACTACATATAAAAATACTAACTTTAAGGAGTACCTACGACATGGGGAGGAATATGATTTACAGTTTATTTTTCAACTGTGCAAAATAACCTTAACTGCAGACGTTATGACATACATACATTCTATGAATTCCACTATTTTGGAGGACTGGAATTTTGGTCTACAACCTCCCCCAGGAGGCACACTAGAAGATACTTATAGGTTTGTAACATCCCAGGCAATTGCTTGTCAAAAACATACACCTCCAGCACCTAAAGAAGATCCCCTTAAAAAATACACTTTTTGGGAAGTAAATTTAAAGGAAAAGTTTTCTGCAGACCTAGATCAGTTTCCTTTAGGACGCAAATTTTTACTACAAGCAGGATTGAAGGCCAAACCAAAATTTACATTAGGAAAACGAAAAGCTACACCCACCACCTCATCTACCTCTACAACTGCTAAACGC

# Reference Data


In [22]:
#load hbv data
import pandas as pd
ref_data = pd.read_csv('Spliced_Data_Reference.csv')
ref_data.head()

Unnamed: 0.1,Unnamed: 0,human_ref,chromosome,start,stop,human_ref_sequence
0,0,GRCh37/hg19,chr1,6812,8812,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...
1,1,GRCh37/hg19,chr1,97735381,97737381,ggaggttaggaggttagggcttcaacatatgaactttggagggagg...
2,2,GRCh37/hg19,chr1,194167276,194169276,TAAAATCATTAATTCCAGGGATAAAGAACTCCACTAGAGAGATTAT...
3,3,GRCh37/hg19,chr2,171455372,171457372,gctaagaatctagtacctcgtaagcTATAAAACTATAGTTAACCAT...
4,4,GRCh37/hg19,chr2,171449160,171451160,tcactgaccataaggttcatgagggcagggcagggatcatgtacat...


In [23]:
# prompt: remove first row from hbv_Data
ref_data = ref_data.iloc[1:]


In [14]:
# CHECK WORK
def insertion_span(vir_start, vir_end, ref_start, ref_end):
  import math
  diff = abs(vir_start- vir_end)
  dir =  0 if vir_start < vir_end else 1
  return min(ref_start, ref_end), min(ref_start, ref_end) -diff, dir

In [26]:
##TROUBLESHOOT####################################################
for index, row in ref_data.iterrows():
  try:
    bam_file_path = '/content/drive/MyDrive/ACS_Research/VISDB_Data/Aligned_Sequences/'+str(row['chromosome'])+'_'+str(index)+'.bam' # Changed to .bam
    #print(row['human_ref_sequence'])
    read1, read2 = create_bam_from_sequence(bam_file_path,row['human_ref_sequence'], row['human_ref_sequence'], 'hg19', row['start'])

    #start,end,direction = insertion_span(row['begin_ref'], row['stop_ref'], row['begin_breakpoint'], row['stop_breakpoint'])

    #generate matrix 1
    #Data1, Label1=matrix_generator_uno(row['human_ref_sequence'], len(str(row['human_ref_sequence'])))

    #save matrix 1 output
    #np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_1/'+str(row['chromosome'])+'_'+str(index)+'_Matrix_data.npy', Data1)
    #np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_1/'+str(row['chromosome'])+'_'+str(index)+'_Label_data.npy', Label1)

  #generate matrix 2
    #Data2,Label2 =matrix_generator_dos (read1, read2, row['start'], row['stop'], Matrix_Name ='HBV_Matrix_2_data', Label_Name='HBV_Label_2_data', save_file_path=os.getcwd())

  #save generated data
    #np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_2/'+str(row['chromosome'])+'_'+str(index)+'_Matrix_data.npy', Data2)
    #np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_2/'+str(row['chromosome'])+'_'+str(index)+'_Label_data.npy', Label2)


  #generate matrix 3
    Data3,Label3 =matrix_generator_tres(read1,read2, row['start'], row['stop'], 0, Matrix_Name ='Matrix_data', Label_Name='Label_data', save_file_path=os.getcwd())

    #save matrix
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_3/'+str(row['chromosome'])+'_'+str(index)+'_Matrix_data.npy', Data3)
    np.save('/content/drive/MyDrive/ACS_Research/VISDB_Data/Matrices/References/Experiment_3/'+str(row['chromosome'])+'_'+str(index)+'_Label_data.npy', Label3)
  except:
    print(row['human_ref_sequence'])

nan
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN