In [None]:
from hmmlearn.hmm import MultinomialHMM
import numpy as np
from sklearn.metrics import classification_report

# Step 1: Toy data set -- two DNA strings, each paired with a same-position
# annotation string made of the state symbols 'I' and 'G'.
sequences = [
    "ATGCGTACGTAGCTAGCTAGCTA",
    "CGTACGTAGCGTATAGCTAGCTA",
]
labels = [
    "IIIIIGGGGGGGGGGGGIIIIII",
    "IIIGGGGGGGGGGGIIIIIGGGG",
]

# Step 2: Integer codes for the observation symbols and the state symbols.
nucleotide_map = {base: code for code, base in enumerate("ACGT")}
state_map = {tag: code for code, tag in enumerate("IG")}
inverse_state_map = {code: tag for tag, code in state_map.items()}


def _encode(text, table):
    """Return an integer array holding ``table[ch]`` for every character of *text*."""
    return np.array([table[ch] for ch in text])


# Step 3: One integer array per sequence / per label string, plus the length
# of every encoded sequence (in the same order as ``sequences``).
X = [_encode(seq, nucleotide_map) for seq in sequences]
y = [_encode(lbl, state_map) for lbl in labels]
lengths = list(map(len, X))

# Step 4: Stack everything end to end. Observations become a single column
# of shape (total_length, 1); the labels stay a flat 1-D array.
X_concat = np.hstack(X).reshape(-1, 1)
y_concat = np.hstack(y)

# Step 5: Define and train the HMM
# The observations are integer-coded symbols, one per time step, i.e. a
# categorical emission model. Since hmmlearn 0.3 that model is CategoricalHMM
# (MultinomialHMM now expects per-step count vectors). Fall back to
# MultinomialHMM only on older releases that do not ship CategoricalHMM, where
# it still implements the categorical model.
try:
    from hmmlearn.hmm import CategoricalHMM as _DiscreteHMM
except ImportError:
    _DiscreteHMM = MultinomialHMM

model = _DiscreteHMM(n_components=2, n_iter=100, random_state=42)
model.fit(X_concat, lengths)


def _align_states_to_labels(hidden_states, reference_labels, n_states):
    """Return an array mapping each hidden-state index to a label code.

    EM training is unsupervised, so the index the model gives a hidden state
    is arbitrary and need not equal the label code it corresponds to. This
    picks the one-to-one assignment of states to labels that maximises the
    number of positions where the mapped state equals the reference label.
    Ties keep the identity mapping.

    Args:
        hidden_states: 1-D integer array of decoded state indices.
        reference_labels: 1-D integer array of label codes, same length as
            ``hidden_states``; codes must lie in ``range(n_states)``.
        n_states: Number of hidden states (and of distinct label codes).

    Returns:
        Integer array ``m`` of length ``n_states`` where ``m[s]`` is the label
        code assigned to hidden state ``s``.
    """
    from itertools import permutations

    # overlap[s, l] = number of positions decoded as state s and labelled l.
    overlap = np.zeros((n_states, n_states), dtype=int)
    np.add.at(overlap, (hidden_states, reference_labels), 1)

    # Exhaustive search is fine here: n_states is tiny.
    best = max(
        permutations(range(n_states)),
        key=lambda perm: sum(overlap[s, perm[s]] for s in range(n_states)),
    )
    return np.array(best)


# Decode the whole training set once to learn which hidden state plays the
# role of which label. NOTE: this uses the reference labels, so the report
# below is an in-sample sanity check, not a held-out evaluation.
train_states = model.predict(X_concat, lengths)
state_to_label = _align_states_to_labels(train_states, y_concat, model.n_components)

# Step 6: Prediction using Viterbi on one sequence
test_seq = X[0].reshape(-1, 1)
true_labels = y[0]
log_prob, raw_states = model.decode(test_seq, algorithm="viterbi")
# Translate arbitrary hidden-state indices into label codes before comparing.
predicted_states = state_to_label[raw_states]

# Step 7: Evaluation
print("True Labels:     ", true_labels)
print("Predicted States:", predicted_states)
print("\nClassification Report:")
# `labels` is pinned so the two target names always line up with codes 0 and 1,
# even if one class is absent from both the truth and the prediction.
print(
    classification_report(
        true_labels,
        predicted_states,
        labels=[0, 1],
        target_names=["Intergenic", "Gene"],
    )
)
