# Pre Processing Script
This script uses multiple libraries to transform the RNA strings into the attributes that identify each of them. During the final conversion process, the CPU is parallelized to optimize it.

# Utils / Setup

The following attribute should be set to `False` if running on Google Colab + Drive.

In [62]:
LOCAL = True

In [63]:
UTILS_DIR = './utils'
FASTA_FILE_PATH = f'{UTILS_DIR}/gencode.v49.pc_transcripts.fa'
CPB_MATRIX_PATH = f'{UTILS_DIR}/cpb_matrix.pkl'
CAI_WEIGHTS_PATH = f'{UTILS_DIR}/cai_weights.pkl'

In [64]:
DATA_DIR = './data'
PRE_PROCESSING_PATH = f'{DATA_DIR}/sanofi-1st/not-processed.csv'
RESULT_PATH = f'{DATA_DIR}/sanofi-1st/processed.parquet'

## Not Local (e.g., Google Colab)

The following are the main libraries used to synthetize most of the attributes.

In [65]:
if not LOCAL:
  !pip install biopython
  !pip install ViennaRNA
  !pip install pandarallel
  !pip install ipywidgets

### Drive Management

Firstly, a request to the Drive is made. If succesfull, it might reload the page

In [66]:
if not LOCAL:
  from google.colab import drive
  drive.mount(r"/content/drive", force_remount=True)

In [67]:
if not LOCAL:
  %cd /content/drive/.shortcut-targets-by-id/<your-drive-id>/<your-folder-path>

In [68]:
if not LOCAL:
  %ls

## Data Loading

Before starting, the gencode must be checked and eventually loaded

In [69]:
import os
import urllib.request
import gzip
import shutil

os.makedirs(UTILS_DIR, exist_ok=True)

if not os.path.exists(FASTA_FILE_PATH):
    print(f"Downloading {FASTA_FILE_PATH}...")
    os.makedirs(UTILS_DIR, exist_ok=True)
    url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.pc_transcripts.fa.gz"
    gz_path = FASTA_FILE_PATH + ".gz"
    
    try:
        urllib.request.urlretrieve(url, gz_path)
        
        print(f"Decompressing {gz_path}...")
        with gzip.open(gz_path, 'rb') as f_in:
            with open(FASTA_FILE_PATH, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        
        os.remove(gz_path)
        print("Download and decompression complete.")
    except Exception as e:
        print(f"Error downloading or decompressing: {e}")
        if os.path.exists(gz_path):
            os.remove(gz_path)
else:
    print(f"FASTA file already exists at {FASTA_FILE_PATH}")

FASTA file already exists at ./utils/gencode.v49.pc_transcripts.fa


In [70]:
# Load data from PRE_PROCESSING_PATH (CSV or Parquet).
# - If it ends with .parquet -> read_parquet
# - Otherwise -> read_csv

import pandas as pd

try:
  if isinstance(PRE_PROCESSING_PATH, str) and PRE_PROCESSING_PATH.lower().endswith('.parquet'):
    df = pd.read_parquet(PRE_PROCESSING_PATH)
  else:
    df = pd.read_csv(PRE_PROCESSING_PATH)

  if 'Sequence' in df.columns:
    df['Sequence'] = df['Sequence'].astype(str).str.upper().str.replace('U', 'T')

  display(df)
except Exception as e:
  print('Error: ', e)

Unnamed: 0,Sequence,Value,Dataset,Split
0,ATGGCATCATCAGAAGACGTCATAAAAGAATTTATGCGATTCAAAG...,10.164760,mRFP Expression,train
1,ATGGCGTCTTCAGAGGATGTAATCAAGGAATTCATGCGTTTTAAGG...,10.572869,mRFP Expression,train
2,ATGGCATCATCGGAAGATGTAATAAAGGAATTTATGCGTTTCAAAG...,9.766912,mRFP Expression,train
3,ATGGCGAGTAGTGAAGACGTTATCAAAGAATTTATGCGTTTTAAGG...,9.926981,mRFP Expression,train
4,ATGGCTTCTTCTGAGGACGTAATAAAGGAGTTCATGAGGTTCAAGG...,9.857074,mRFP Expression,train
...,...,...,...,...
1454,ATGGCGTCGTCTGAGGATGTAATTAAAGAGTTCATGCGCTTTAAGG...,10.542758,mRFP Expression,test
1455,ATGGCATCGTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,9.087852,mRFP Expression,test
1456,ATGGCATCTTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,9.147396,mRFP Expression,test
1457,ATGGCATCGTCGGAAGATGTAATTAAGGAGTTCATGCGTTTTAAAG...,10.690711,mRFP Expression,test


# Pre-processing

## Synthetization Procedures
The following section defines the functions and data used to synthetize the attributes of the codon strings.

### GC Content

In [71]:
from Bio.SeqUtils import gc_fraction

def get_basic_stats(sequence):
  gc_score = gc_fraction(sequence)
  length = len(sequence)
  return gc_score, length

### Minimum Free Energy (MFE)


In [72]:
import RNA
from functools import lru_cache

@lru_cache(maxsize=200_000)
def _mfe(seq: str) -> float:
  """Return minimum free energy (MFE) for a sequence, cached for speed."""
  if not seq:
    return 0.0
  # ViennaRNA: fold_compound(...).mfe() is usually faster than RNA.fold for many calls
  fc = RNA.fold_compound(seq)
  _structure, mfe = fc.mfe()
  return float(mfe)

def get_folding_energy(sequence, start_len: int = 50):
  seq = str(sequence)
  global_mfe = _mfe(seq)
  start_mfe = _mfe(seq[:start_len])
  return global_mfe, start_mfe

### Codon Adaptation Index (CAI)


In [73]:
from Bio.SeqUtils import CodonAdaptationIndex
from Bio.Data import CodonTable

In [74]:
from collections import defaultdict
from Bio import SeqIO

def build_human_cai_weights(fasta_file_path: str = FASTA_FILE_PATH):
  codon_counts = defaultdict(int)

  for record in SeqIO.parse(fasta_file_path, "fasta"):
          seq = str(record.seq).upper()
          if len(seq) % 3 != 0: continue

          # Simple loop to count codons
          for i in range(0, len(seq), 3):
              codon = seq[i:i+3]
              # Skip 'N' or weird characters
              if len(codon) == 3 and all(base in 'ATCG' for base in codon):
                  codon_counts[codon] += 1

  # 2. Group by Amino Acid
  # We use Biopython's standard genetic code table (ID 1 = Standard)
  # This maps 'TTT' -> 'F' (Phenylalanine), etc.
  genetic_code = CodonTable.unambiguous_dna_by_id[1]

  # We need a reverse map: Amino Acid -> List of Codons
  # e.g., 'A': ['GCT', 'GCC', 'GCA', 'GCG']
  aa_to_codons = defaultdict(list)

  for codon, aa in genetic_code.forward_table.items():
      aa_to_codons[aa].append(codon)

  # Don't forget Stop codons! (Usually we skip them for CAI, but let's be safe)
  # genetic_code.stop_codons are ['TAA', 'TAG', 'TGA']

  # 3. Calculate Weights (W)
  cai_weights = {}

  for aa, codons in aa_to_codons.items():
      # Find the max count for this amino acid
      counts = [codon_counts[c] for c in codons]
      max_count = max(counts)

      # Assign weights
      for codon, count in zip(codons, counts):
          if max_count > 0:
              weight = count / max_count
          else:
              weight = 0.01 # Fallback for extremely rare/missing codons

          cai_weights[codon] = round(weight, 4) # Round for clean JSON

  # Handle Met (ATG) and Trp (TGG) explicitly if needed (they are always 1.0)
  cai_weights['ATG'] = 1.0
  cai_weights['TGG'] = 1.0

  return cai_weights

#### Build the Weights and Serialize them

In [75]:
import math
import pickle

CAI_DEFAULT_WEIGHT = 0.01
CAI_DEFAULT_LOG = math.log(CAI_DEFAULT_WEIGHT)

def get_cai_weights(weights_path: str = CAI_WEIGHTS_PATH):
  # load the weights
  if (os.path.exists(weights_path)):
    with open(weights_path, 'rb') as f:
      return pickle.load(f)
  # or build it, if absent
  else:
    weights = build_human_cai_weights()

    with open(weights_path, 'wb') as f:
      pickle.dump(weights, f)

    return weights

human_cai_weights = get_cai_weights()
human_cai_log_weights = {
  codon: (math.log(w) if (w is not None and w > 0) else CAI_DEFAULT_LOG)
  for codon, w in human_cai_weights.items()
}
f"CAI Weights built! Score for CGG: {human_cai_weights.get('CGG', None)}"

'CAI Weights built! Score for CGG: 0.4019'

In [76]:
def get_cai_score(codons, log_weights = human_cai_log_weights):
    """Calculates the Codon Adaptation Index (CAI) using precomputed log-weights."""
    if not codons:
        return 0.0

    body = codons[:-1]  # exclude stop codon
    if not body:
        return 0.0

    log_sum = 0.0
    valid = 0
    for codon in body:
        if len(codon) == 3:
            log_sum += log_weights.get(codon, CAI_DEFAULT_LOG)
            valid += 1

    if valid == 0:
        return 0.0
    return math.exp(log_sum / valid)

In [77]:
def get_cai_score_splitted(codons, log_weights = human_cai_log_weights, ramp_length = 5):
    """Calculates CAI for the ramp (start) and tail separately (fast: uses log-weights)."""
    if not codons:
        return 0.0, 0.0
    if len(codons) < ramp_length + 1:
        return 0.0, 0.0

    # Ramp: skip ATG at index 0
    ramp_slice = codons[1 : ramp_length + 1]
    tail_slice = codons[ramp_length + 1 : -1]  # exclude stop codon

    def geo_mean_from_logs(codon_list):
        if not codon_list:
            return 0.0
        log_sum = 0.0
        valid = 0
        for c in codon_list:
            if len(c) == 3:
                log_sum += log_weights.get(c, CAI_DEFAULT_LOG)
                valid += 1
        if valid == 0:
            return 0.0
        return math.exp(log_sum / valid)

    return round(geo_mean_from_logs(ramp_slice), 4), round(geo_mean_from_logs(tail_slice), 4)

### Kozak Strength

In [78]:
def get_kozak_score(full_mrna_seq, start_index):
    if start_index < 6 or start_index + 4 > len(full_mrna_seq):
        return 0.0 # Sequence too short

    window = full_mrna_seq[start_index-6 : start_index+4]

    # Simple scoring (based on consensus GCC(A/G)CC AUG G)
    score = 0

    # The -3 Position (Crucial) - Should be A or G
    if window[3] in ['A', 'G']:
        score += 3

    # The +4 Position (Crucial) - Should be G
    if window[-1] == 'G':
        score += 3

    # The -6 Position (Minor) - Should be G
    if window[0] == 'G':
        score += 1

    return score

### Codon Pair Bias (CPB)
The matrix is build by using the `FASTA_FILE_PATH`, containing the _gold standard of human evolution_: it contains the coding sequence of every single gene that currently exists in the human body - these are the winners of million of years of evolution.

In the translation efficiency context, it's used to give a human metric to the codon sequence.

The Codon Pair Bias matrix is a lookup table that tell us how compatible two codons are when placed next to each other _i.e., how much they fit with each other_.

In [79]:
from Bio import SeqIO
from collections import defaultdict
import math

def build_human_cpb_matrix(fasta_file_path: str = FASTA_FILE_PATH):
    """
    Reads a file of ALL human genes and calculates the CPB Matrix.
    """
    pair_counts = defaultdict(int)
    codon_counts = defaultdict(int)
    total_pairs = 0

    # 1. Count everything in the human genome
    for record in SeqIO.parse(fasta_file_path, "fasta"):
        seq = str(record.seq).upper()
        if len(seq) % 3 != 0: continue # Skip fragments

        for i in range(0, len(seq) - 3, 3):
            codon_a = seq[i:i+3]
            codon_b = seq[i+3:i+6]

            # Skip invalid characters or stop codons if desired
            if "N" in codon_a or "N" in codon_b: continue

            pair_counts[f"{codon_a}-{codon_b}"] += 1
            codon_counts[codon_a] += 1
            # We count codon_b in the 'second position' context conceptually,
            # but for simple CPB, global codon counts are often used.
            total_pairs += 1

    # 2. Calculate Scores: ln( Observed / Expected )
    cpb_scores = {}

    # Expected(AB) = (Count(A) * Count(B)) / Total_Pairs
    # This is a simplified expectation model (independent probability)

    for pair, observed_count in pair_counts.items():
        codon_a, codon_b = pair.split('-')

        count_a = codon_counts[codon_a]
        count_b = codon_counts[codon_b]

        # Calculate Expected Count
        if total_pairs > 0:
            expected_count = (count_a * count_b) / total_pairs
        else:
            expected_count = 0

        # Avoid division by zero or log(0)
        if expected_count > 0 and observed_count > 0:
            score = math.log(observed_count / expected_count)
            cpb_scores[(codon_a, codon_b)] = score
        else:
            cpb_scores[(codon_a, codon_b)] = -5.0 # Penalty for non-existent pairs

    return cpb_scores

#### Build the Matrix and Serialize it

In [80]:
def get_cpb_matrix(matrix_path: str = CPB_MATRIX_PATH):
  # load the matrix
  if (os.path.exists(matrix_path)):
    with open(matrix_path, 'rb') as f:
      return pickle.load(f)
  # or build it, if absent
  else:
    matrix = build_human_cpb_matrix()

    with open(matrix_path, 'wb') as f:
      pickle.dump(matrix, f)

    return matrix

human_cpb_matrix = get_cpb_matrix()
f"Matrix built! Score for GCC-CTG and viceversa: {human_cpb_matrix[('CGG', 'CTG')]} , {human_cpb_matrix[('CTG', 'CGG')]}"

'Matrix built! Score for GCC-CTG and viceversa: 0.25860812619029416 , 0.3011096162811363'

In [81]:
def get_cpb_score(codons, cpb_matrix = human_cpb_matrix):
    """
    Calculates CPB from a pre-split list of codons.
    Excludes the final Stop Codon from the calculation.
    """
    # 1. Isolate the Body (Slice off the last element/Stop Codon)
    #    Input:  ['ATG', 'GCC', 'CTG', 'TGA']
    #    Body:   ['ATG', 'GCC', 'CTG']
    if len(codons) < 2:
        return 0.0

    body = codons[:-1]

    # 2. Safety Check: Do we have enough codons to make a pair?
    if len(body) < 2:
        return 0.0

    total_score = 0.0
    pair_count = 0

    # 3. Iterate directly through the list
    #    We go up to len(body) - 1 so we always have a "next" codon
    for i in range(len(body) - 1):
        c1 = body[i]
        c2 = body[i+1]

        # 4. Tuple Lookup
        #    Assuming your matrix keys are tuples: ('ATG', 'GCC')
        #    If your matrix uses string keys 'ATG-GCC', change this line.
        val = cpb_matrix.get((c1, c2), 0.0)

        total_score += val
        pair_count += 1

    # 5. Return Average
    return total_score / pair_count

## Attribute Synthetization
Here, the attributes are synthetized and serialized, starting from the following codon strings

In [82]:
def extract_longest_orf(sequence):
    """Return (longest_cds, start_index) scanning in O(n).
    
    The previous implementation was O(n^2) because it tried every ATG start position.
    This version scans each reading frame once and tracks candidate starts until a stop codon.
    """
    seq = str(sequence).upper().replace('U', 'T')
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}

    best_start = 0
    best_end = 0
    best_len = 0

    n = len(seq)
    for frame in (0, 1, 2):
        starts_since_stop = []
        for i in range(frame, n - 2, 3):
            codon = seq[i : i + 3]
            if codon == start_codon:
                starts_since_stop.append(i)
            elif codon in stop_codons:
                if starts_since_stop:
                    # Earliest start gives the longest ORF ending at this stop.
                    s = starts_since_stop[0]
                    e = i + 3
                    cand_len = e - s
                    if cand_len > best_len:
                        best_len = cand_len
                        best_start = s
                        best_end = e
                starts_since_stop = []

    if best_len == 0:
        return "", 0
    return seq[best_start:best_end], best_start

In [83]:
def get_codons(cds_string):
  if len(cds_string) % 3 != 0:
    return None
  return [cds_string[i:i+3] for i in range(0, len(cds_string), 3)]

In [84]:
df[['cds', 'cds_start']] = df['Sequence'].apply(lambda x: pd.Series(extract_longest_orf(x)))
df['codons'] = df['cds'].apply(get_codons)
display(df)

Unnamed: 0,Sequence,Value,Dataset,Split,cds,cds_start,codons
0,ATGGCATCATCAGAAGACGTCATAAAAGAATTTATGCGATTCAAAG...,10.164760,mRFP Expression,train,ATGGCATCATCAGAAGACGTCATAAAAGAATTTATGCGATTCAAAG...,0,"[ATG, GCA, TCA, TCA, GAA, GAC, GTC, ATA, AAA, ..."
1,ATGGCGTCTTCAGAGGATGTAATCAAGGAATTCATGCGTTTTAAGG...,10.572869,mRFP Expression,train,ATGGCGTCTTCAGAGGATGTAATCAAGGAATTCATGCGTTTTAAGG...,0,"[ATG, GCG, TCT, TCA, GAG, GAT, GTA, ATC, AAG, ..."
2,ATGGCATCATCGGAAGATGTAATAAAGGAATTTATGCGTTTCAAAG...,9.766912,mRFP Expression,train,ATGGCATCATCGGAAGATGTAATAAAGGAATTTATGCGTTTCAAAG...,0,"[ATG, GCA, TCA, TCG, GAA, GAT, GTA, ATA, AAG, ..."
3,ATGGCGAGTAGTGAAGACGTTATCAAAGAATTTATGCGTTTTAAGG...,9.926981,mRFP Expression,train,ATGGCGAGTAGTGAAGACGTTATCAAAGAATTTATGCGTTTTAAGG...,0,"[ATG, GCG, AGT, AGT, GAA, GAC, GTT, ATC, AAA, ..."
4,ATGGCTTCTTCTGAGGACGTAATAAAGGAGTTCATGAGGTTCAAGG...,9.857074,mRFP Expression,train,ATGGCTTCTTCTGAGGACGTAATAAAGGAGTTCATGAGGTTCAAGG...,0,"[ATG, GCT, TCT, TCT, GAG, GAC, GTA, ATA, AAG, ..."
...,...,...,...,...,...,...,...
1454,ATGGCGTCGTCTGAGGATGTAATTAAAGAGTTCATGCGCTTTAAGG...,10.542758,mRFP Expression,test,ATGGCGTCGTCTGAGGATGTAATTAAAGAGTTCATGCGCTTTAAGG...,0,"[ATG, GCG, TCG, TCT, GAG, GAT, GTA, ATT, AAA, ..."
1455,ATGGCATCGTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,9.087852,mRFP Expression,test,ATGGCATCGTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0,"[ATG, GCA, TCG, TCG, GAG, GAC, GTC, ATA, AAG, ..."
1456,ATGGCATCTTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,9.147396,mRFP Expression,test,ATGGCATCTTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0,"[ATG, GCA, TCT, TCG, GAG, GAC, GTC, ATA, AAG, ..."
1457,ATGGCATCGTCGGAAGATGTAATTAAGGAGTTCATGCGTTTTAAAG...,10.690711,mRFP Expression,test,ATGGCATCGTCGGAAGATGTAATTAAGGAGTTCATGCGTTTTAAAG...,0,"[ATG, GCA, TCG, TCG, GAA, GAT, GTA, ATT, AAG, ..."


In [85]:
f"There are {len(df) - len(df.query('Sequence.str.len() == cds.str.len()'))} strings with the cds as a substring"

'There are 0 strings with the cds as a substring'

In [86]:
def calculate_metrics_dict(sequence, cds, cds_start, codons):
  # GC content
  gc = gc_fraction(sequence)

  # MFE
  mfe_global, mfe_start = get_folding_energy(sequence)

  # CAI
  #cai_score = get_cai_score(codons)
  ramp_cai, tail_cai = get_cai_score_splitted(codons)

  # Kozak
  kozak_score = get_kozak_score(sequence, cds_start)

  # CPB
  cpb_score = get_cpb_score(codons)

  return pd.Series({
        'GC': gc,
        'ramp_cai': ramp_cai,
        'tail_cai': tail_cai,
        'kozak': kozak_score,
        'CPB': cpb_score,
        'mfe_global': mfe_global,
        'mfe_start': mfe_start,
    })


In [87]:
import os
import importlib.util
import pandas as pd
from pandarallel import pandarallel

# Pandarallel's notebook progress bar relies on ipywidgets.
has_widgets = importlib.util.find_spec("ipywidgets") is not None
pandarallel.initialize(nb_workers=os.cpu_count(), progress_bar=has_widgets)
if not has_widgets:
    print("ipywidgets is not installed -> no progress bar.")
    print("Install with: %pip install ipywidgets  (or: pip install ipywidgets)")

def process_row(row):
    return calculate_metrics_dict(
        row['Sequence'],
        row['cds'],
        row['cds_start'],
        row['codons']
    )

# Reduce per-row serialization by only parallelizing the needed columns.
df_base = df[['Sequence', 'cds', 'cds_start', 'codons', 'Value']]
df_f = df_base.parallel_apply(process_row, axis=1, result_type='expand')
df_f['sequence'] = df_base['Sequence']

df_f['value'] = df_base['Value']
# normalize the value
df_f["value"] = df_f["value"] / df_f["value"].max()

df_f = df_f.iloc[:, [7, 8, 0, 1, 2, 3, 4, 5, 6]]
display(df_f)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=92), Label(value='0 / 92'))), HBox…

Unnamed: 0,sequence,value,GC,ramp_cai,tail_cai,kozak,CPB,mfe_global,mfe_start
0,ATGGCATCATCAGAAGACGTCATAAAAGAATTTATGCGATTCAAAG...,0.893305,0.486726,0.9294,0.7854,0.0,-0.069971,-179.399994,-5.9
1,ATGGCGTCTTCAGAGGATGTAATCAAGGAATTCATGCGTTTTAAGG...,0.929171,0.489676,0.7944,0.7701,0.0,-0.069443,-192.699997,-8.2
2,ATGGCATCATCGGAAGATGTAATAAAGGAATTTATGCGTTTCAAAG...,0.858341,0.507375,0.7256,0.7791,0.0,-0.054764,-203.899994,-7.9
3,ATGGCGAGTAGTGAAGACGTTATCAAAGAATTTATGCGTTTTAAGG...,0.872409,0.505900,0.7314,0.7569,0.0,-0.136812,-215.500000,-6.9
4,ATGGCTTCTTCTGAGGACGTAATAAAGGAGTTCATGAGGTTCAAGG...,0.866265,0.510324,0.9451,0.8302,0.0,-0.087608,-187.399994,-5.9
...,...,...,...,...,...,...,...,...,...
1454,ATGGCGTCGTCTGAGGATGTAATTAAAGAGTTCATGCGCTTTAAGG...,0.926525,0.492625,0.6202,0.7711,0.0,-0.069474,-204.699997,-10.2
1455,ATGGCATCGTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0.798664,0.504425,0.5668,0.8270,0.0,-0.051701,-179.699997,-8.1
1456,ATGGCATCTTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0.803897,0.504425,0.7289,0.8355,0.0,-0.062937,-189.000000,-7.6
1457,ATGGCATCGTCGGAAGATGTAATTAAGGAGTTCATGCGTTTTAAAG...,0.939527,0.492625,0.5665,0.7696,0.0,-0.064932,-203.699997,-7.5


In [88]:
# Saving to Parquet
df_f.to_parquet(RESULT_PATH, engine='pyarrow')

In [89]:
display(pd.read_parquet(RESULT_PATH))

Unnamed: 0,sequence,value,GC,ramp_cai,tail_cai,kozak,CPB,mfe_global,mfe_start
0,ATGGCATCATCAGAAGACGTCATAAAAGAATTTATGCGATTCAAAG...,0.893305,0.486726,0.9294,0.7854,0.0,-0.069971,-179.399994,-5.9
1,ATGGCGTCTTCAGAGGATGTAATCAAGGAATTCATGCGTTTTAAGG...,0.929171,0.489676,0.7944,0.7701,0.0,-0.069443,-192.699997,-8.2
2,ATGGCATCATCGGAAGATGTAATAAAGGAATTTATGCGTTTCAAAG...,0.858341,0.507375,0.7256,0.7791,0.0,-0.054764,-203.899994,-7.9
3,ATGGCGAGTAGTGAAGACGTTATCAAAGAATTTATGCGTTTTAAGG...,0.872409,0.505900,0.7314,0.7569,0.0,-0.136812,-215.500000,-6.9
4,ATGGCTTCTTCTGAGGACGTAATAAAGGAGTTCATGAGGTTCAAGG...,0.866265,0.510324,0.9451,0.8302,0.0,-0.087608,-187.399994,-5.9
...,...,...,...,...,...,...,...,...,...
1454,ATGGCGTCGTCTGAGGATGTAATTAAAGAGTTCATGCGCTTTAAGG...,0.926525,0.492625,0.6202,0.7711,0.0,-0.069474,-204.699997,-10.2
1455,ATGGCATCGTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0.798664,0.504425,0.5668,0.8270,0.0,-0.051701,-179.699997,-8.1
1456,ATGGCATCTTCGGAGGACGTCATAAAGGAGTTCATGAGATTCAAGG...,0.803897,0.504425,0.7289,0.8355,0.0,-0.062937,-189.000000,-7.6
1457,ATGGCATCGTCGGAAGATGTAATTAAGGAGTTCATGCGTTTTAAAG...,0.939527,0.492625,0.5665,0.7696,0.0,-0.064932,-203.699997,-7.5
