In [9]:
import numpy as np
from hmmlearn import hmm
from collections import Counter

# Mappings
nucleotide_mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
state_mapping = {'G': 0, 'N': 1}

# Observed sequences (DNA)
train_sequences = ["ACGTAGCT", "CGTAGCTA", "GATCGTAC"]

# Hidden states (G = Gene, N = Non-Gene)
train_states = ["GGNNGGNN", "NNGGNNGG", "GGNNGGNN"]

# Convert sequences to numeric format
observed_sequences = [np.array([nucleotide_mapping[nuc] for nuc in seq]).reshape(-1, 1) for seq in train_sequences]
state_sequences = [np.array([state_mapping[state] for state in states]) for states in train_states]

# Define HMM parameters
n_states = 2
n_observations = 4
model = hmm.MultinomialHMM(n_components=n_states, n_iter=100, tol=1e-4, verbose=True)

# Prepare training data
X_train = np.concatenate(observed_sequences)
lengths = [len(seq) for seq in observed_sequences]

# Train the model
model.fit(X_train, lengths)

# Test on new sequence
test_sequence = "GTACGTA"
test_observed = np.array([nucleotide_mapping[nuc] for nuc in test_sequence]).reshape(-1, 1)
predicted_states = model.predict(test_observed)

# Convert back to G/N labels
predicted_labels = ''.join(['G' if s == 0 else 'N' for s in predicted_states])

# Print results
print("\nTest DNA Sequence: ", test_sequence)
print("Predicted Gene Regions:", predicted_labels)


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340



Test DNA Sequence:  GTACGTA
Predicted Gene Regions: GNGNGNG


         1       0.00000000             +nan
         2       0.00000000      +0.00000000
