In [9]:
from collections import defaultdict as DD

# Define the valid nucleotide bases
v = {'A', 'C', 'G', 'T'}

# File path for the input FASTA file
input_fasta_path = '/kaggle/input/multiline-input/multiline_input.fasta'

# Function to read a multi-line FASTA file and convert it to a single-line sequence
def read_fasta_as_single_line(fasta_path):
    """Read a FASTA file and return its sequence data as one string.

    Lines beginning with '>' are treated as headers and skipped. Every
    other line is stripped of surrounding whitespace, upper-cased, and
    concatenated in file order. Sequence lines from all records in the
    file are joined together into the same string.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        The concatenated sequence, or '' if the file could not be read
        (an error message is printed in that case).
    """
    try:
        with open(fasta_path, 'r') as handle:
            return ''.join(
                raw_line.strip().upper()
                for raw_line in handle
                if not raw_line.startswith('>')  # Skip header lines
            )
    except FileNotFoundError:
        print(f"Error: File not found at {fasta_path}")
    except Exception as err:
        print(f"Error reading FASTA file: {err}")
    # Reached only when one of the handlers above ran.
    return ''

# Build the Markov transition matrix
def build_markov_transition_matrix(sequence, valid_bases):
    """Count first-order transitions between adjacent valid symbols.

    Walks ``sequence`` once. Whenever a valid symbol immediately follows
    another valid symbol, the count for that (previous, current) pair is
    incremented. A symbol that is not in ``valid_bases`` breaks the
    chain: no transition is counted into it or out of it.

    Args:
        sequence: Iterable of symbols (e.g. a string of nucleotides).
        valid_bases: Container of the symbols that may take part in a
            transition.

    Returns:
        A nested defaultdict ``counts[previous][current] -> int`` holding
        raw transition counts (not probabilities). Only pairs that were
        actually observed are present as keys.
    """
    counts = DD(lambda: DD(int))
    previous = None  # None means "no valid predecessor"

    for current in sequence:
        if current not in valid_bases:
            # Invalid symbol: the next valid symbol starts a new chain.
            previous = None
            continue

        # Compare against None explicitly so that a valid but falsy
        # symbol (such as 0) still counts as a predecessor.
        if previous is not None:
            counts[previous][current] += 1
        previous = current

    return counts

# Print the Markov transition matrix
def print_markov_transition_matrix(matrix, valid_bases):
    """Print transition counts as a row-normalised probability table.

    Rows and columns are the sorted ``valid_bases``. Each cell is the
    count for (row, column) divided by the total count of that row,
    printed with five decimal places in an 8-character column. A row
    with no observed transitions is printed as all zeros.

    Args:
        matrix: Mapping ``matrix[source][target] -> count``. It is only
            read; missing rows or cells are treated as zero, so both a
            nested defaultdict and a plain nested dict are accepted.
        valid_bases: Iterable of the symbols to show as rows/columns.

    Returns:
        None. The table is written to standard output.
    """
    print("Markov Transition Matrix (Probabilities):\n")
    bases = sorted(valid_bases)

    # Header row. The leading blank cell occupies the row-label column
    # so each base label lines up with the values printed beneath it.
    print(f"{'':>8}" + ''.join(f"{base:>8}" for base in bases))

    for source in bases:
        # .get() avoids inserting empty entries into a defaultdict and
        # avoids KeyError on a plain dict.
        row = matrix.get(source, {})
        total = sum(row.values())
        cells = [
            # Print probabilities up to 5 decimal points
            f"{(row.get(target, 0) / total if total > 0 else 0):8.5f}"
            for target in bases
        ]
        print(f"{source:>8}" + ''.join(cells))

# Main processing
# Load the sequence; an empty string means the file was missing,
# unreadable, or contained no sequence lines.
sequence = read_fasta_as_single_line(input_fasta_path)
if sequence:
    # Count transitions between adjacent valid bases, then print them
    # as a table of probabilities.
    markov_matrix = build_markov_transition_matrix(sequence, v)
    print_markov_transition_matrix(markov_matrix, v)
else:
    print("No valid sequence was loaded. Exiting.")

Markov Transition Matrix (Probabilities):

       A       C       G       T
       A 0.25087 0.24264 0.26479 0.24170
       C 0.24655 0.25444 0.24490 0.25411
       G 0.25183 0.24738 0.25564 0.24516
       T 0.27094 0.23744 0.25025 0.24138
