In [1]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from collections import Counter
from itertools import product
from itertools import combinations


In [2]:

def TransitionCounts(T, M):
    pairs: list[str] = [str(T[i]) + str(T[i + 1]) for i in range(len(T) - 1)]
    pairs_order = [str(i) + str(j) for i, j in product(range(M), repeat=2)]
    iter = Counter(pairs)
    iter = [iter[pair] for pair in pairs_order]
    return iter

def mle(N):
    p_ij_hat_matrix = N / N.sum(axis=1, keepdims=True)
    return p_ij_hat_matrix

def prob_diff(z):
    pairs= list(combinations(z,2))
    w=[]
    for pair in pairs:
        diff = abs(pair[0]-pair[1])
        w.append(diff)
    return np.array(w)

def penalty_function(numerator,denominator):  
    pelt = np.zeros_like(numerator)
    for i in range(len(numerator)):
        if denominator[i] != 0:
            pelt[i] = numerator[i] / denominator[i]
        else:
            pelt[i] = 0
    return pelt


def eval_g_eq(x):
    row = [x[i*M:(i+1)*M] for i in range(len(x)//M)]
    constraints = []
    for z in row:
        constraints.extend([np.sum(z)-1])
    return constraints


def McALasso(data, lam):
    n =TransitionCounts(data,M)
    n_ij_matrix = np.reshape(n,(M,M))
    p_ij_hat_matrix = mle(n_ij_matrix)
    p_ij_hat = p_ij_hat_matrix.flatten()  
    w_deno= prob_diff(p_ij_hat) 
    def obj_f(x): #### Objective function
        w_num  = prob_diff(x) 
        pelt = penalty_function(w_num,w_deno)
        obj = -np.sum((n * np.log(x))) + lam*np.sum(pelt) 
        return obj
    
    res = minimize(obj_f, p_ij_hat, 
    constraints={'type': 'eq', 'fun': eval_g_eq}, 
    bounds=[(0, 1)] * (M**2), 
    method='SLSQP', 
    options={'maxiter': 160000}
    )
    p_tilde = res.x
    return n, p_tilde






def purity(tilde):
    purity_set = []
    each_class_tilde_index = []
    class_tilde = np.unique(tilde)
    each_class_tilde_index = []
    for class_val in class_tilde:
        indices = np.where(tilde == class_val)[0]
        each_class_tilde_index.append(indices.tolist())
    max_count = []
    for ell in each_class_index:
        count = []
        for i in each_class_tilde_index:
            count.append(len(np.intersect1d(ell,i)))
        max_count.append(np.max(count))
    purity_result = np.sum(max_count)/M**2    
    return purity_result


def FDnorm(b):
    b_matrix = np.reshape(b,(M,M))
    p_ij_tilde_bar = b_matrix- p_ij_true
    norm = np.sqrt(np.sum(np.abs(p_ij_tilde_bar) ** 2))
    return norm



In [5]:
sequence ## Any observed Markov sequences 

M = len(np.unique(sequence))
K = 5
N = len(sequence)
K_len = N//K

DATA = [sequence[i*K_len:(i+1)*K_len] for i in range(K)]


In [6]:
LAMBDA = np.arange(0, 1, 0.1)

cv_each_lambda_McALasso=[]
for j in range(len(LAMBDA)):
    L = LAMBDA[j]
    cv_each_training_adalasso = []
    cv_each_test_adalasso=[]
    cv_each_training_lasso = []
    cv_each_test_lasso=[]
    for i in range(K):
        testing_set = DATA[i]
        n_ij_testing = TransitionCounts(testing_set,M)
        training_set = np.delete(DATA, i, axis=0)
        for z in range(K-1):
            train_subdata = training_set[z]
            n_train, p_tilde_train = McALasso(train_subdata,L)
            cv_adalasso = -np.sum(n_ij_testing*np.log(p_tilde_train))  
            cv_each_training_adalasso.append(np.sum(cv_adalasso))
            
        cv_each_test_adalasso.append(np.sum(cv_each_training_adalasso))
    cv_each_lambda_McALasso.append(np.sum(cv_each_test_adalasso))   

position = list(cv_each_lambda_McALasso).index(min(cv_each_lambda_McALasso))
opt_lambda_adalasso = LAMBDA[position]
a, p_tilde = McALasso(sequence, opt_lambda_adalasso)

p_tilde = np.round(p_tilde,3)
print(np.reshape(p_tilde,(M,M)))

  obj = -np.sum((n * np.log(x))) + lam*np.sum(pelt)


[[0.305 0.168 0.222 0.305]
 [0.33  0.242 0.123 0.305]
 [0.305 0.222 0.227 0.246]
 [0.242 0.182 0.246 0.33 ]]
