## Setup

In [None]:
# Install the third-party packages this notebook needs: dash and dash_bio
# (used below for the alignment chart) and lingpy.
!pip install --quiet dash dash_bio 
!pip install --quiet lingpy

## Import required libraries

In [None]:
import sys
sys.path.append('../')

import json
import os
from dash import Dash, html
import dash_bio as dashbio
import pandas as pd
import pandas as pd
import numpy as np
from collections import defaultdict
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.pairwise2 import align
import lingpy
import lingpy as lp
from lingpy import *
import src.utils.seq as seq
from configs.config import data as data_path, cache as cache_path

## Define the input and output file names and create the Dash app

In [None]:
# Names of the CSV inputs. The functions below look for the activities and
# variants files inside the `filtered_folder_name` sub-folder of their
# `data_path` argument.
activities_in_file_name = "activities.csv"
# NOTE(review): not referenced anywhere in the visible part of this notebook.
features_in_file_name = "features.csv"
variants_in_file_name = "variants.csv"
filtered_folder_name = "filtered"
# Names of the FASTA outputs written by case_task_seq / case_activity_seq
# into the same `filtered` sub-folder.
task_seq_per_case_fasta_file_name = "task_seq_per_case.fasta"
activity_seq_per_case_fasta_file_name = "activity_seq_per_case.fasta"

# Single Dash application shared by the notebook: set_layout() assigns its
# layout and case_stream() starts its server.
app = Dash(__name__)

## Functions for visualizing

In [None]:
def aligned_task_seq(sequences):
    """Align the sequences against each other and return them as strings.

    The alignment is delegated to ``mult_align`` (presumably lingpy's, brought
    in by ``from lingpy import *``), called with ``pprint=False``. Every
    aligned row it returns is concatenated into a single string.

    Args:
        sequences: The sequences to align; handed to ``mult_align`` unchanged.

    Returns:
        A list with one joined string per row returned by ``mult_align``.
    """
    return list(map(''.join, mult_align(sequences, pprint=False)))

def case_task_seq(data_path=cache_path, file_name=activities_in_file_name, aligned=False, excludeConsecutive=False):
    """Build the task sequence of every case and save it as a FASTA file.

    The case is used as the record label and its tasks as the sequence.

    Args:
        data_path: Base directory; the CSV is read from, and the FASTA file
            written to, its ``filtered_folder_name`` sub-folder.
        file_name: Name of the activities CSV. It must provide the ``case``
            and ``task`` columns.
        aligned: Accepted but not used by this function.
        excludeConsecutive: Forwarded unchanged to ``seq.get_fasta``.

    Returns:
        A ``(task_seq_per_case, task_alphabet_mapping)`` tuple: the first item
        returned by ``seq.get_fasta`` and the mapping returned by
        ``seq.get_seq_dict``.

    Side effects:
        Rebinds the module-level ``task_seq_per_case`` and writes
        ``task_seq_per_case_fasta_file_name`` through ``seq.save_fasta``.
    """
    global task_seq_per_case

    filtered_dir = os.path.join(data_path, filtered_folder_name)

    # Load the activity log and drop the rows whose case is the string 'None'.
    log_df = pd.read_csv(os.path.join(filtered_dir, file_name))
    log_df = log_df[log_df['case'] != 'None']

    # One row per case, holding the list of that case's tasks.
    tasks_by_case = log_df.groupby('case')['task'].apply(list).reset_index(name='tasks')

    # The alphabet mapping is derived from the filtered log's "task" column.
    task_alphabet_mapping = seq.get_seq_dict(log_df, "task")
    task_seq_per_case, _ = seq.get_fasta(
        tasks_by_case.apply(list),
        task_alphabet_mapping,
        "tasks",
        "case",
        excludeConsecutive=excludeConsecutive,
    )

    # Persist the records next to the input data.
    fasta_path = os.path.join(filtered_dir, task_seq_per_case_fasta_file_name)
    seq.save_fasta(task_seq_per_case, fasta_path)

    return task_seq_per_case, task_alphabet_mapping

def case_activity_seq(data_path=cache_path, file_name=activities_in_file_name, aligned=False, excludeConsecutive=False):
    """Build the activity sequence of every case and save it as a FASTA file.

    The case is used as the record label and its activities as the sequence.

    Args:
        data_path: Base directory; the CSV is read from, and the FASTA file
            written to, its ``filtered_folder_name`` sub-folder.
        file_name: Name of the activities CSV. It must provide the ``case``
            and ``activity`` columns.
        aligned: Accepted but not used by this function.
        excludeConsecutive: Forwarded unchanged to ``seq.get_fasta``.

    Returns:
        An ``(activity_seq_per_case, activity_alphabet_mapping)`` tuple: the
        first item returned by ``seq.get_fasta`` and the mapping returned by
        ``seq.get_seq_dict``.

    Side effects:
        Rebinds the module-level ``activity_seq_per_case`` and writes
        ``activity_seq_per_case_fasta_file_name`` through ``seq.save_fasta``.
    """
    global activity_seq_per_case

    filtered_dir = os.path.join(data_path, filtered_folder_name)

    # Load the activity log and drop the rows whose case is the string 'None'.
    log_df = pd.read_csv(os.path.join(filtered_dir, file_name))
    log_df = log_df[log_df['case'] != 'None']

    # One row per case, holding the list of that case's activities.
    activities_by_case = log_df.groupby('case')['activity'].apply(list).reset_index(name='activities')

    # The alphabet mapping is derived from the filtered log's "activity" column.
    activity_alphabet_mapping = seq.get_seq_dict(log_df, "activity")
    activity_seq_per_case, _ = seq.get_fasta(
        activities_by_case.apply(list),
        activity_alphabet_mapping,
        "activities",
        "case",
        excludeConsecutive=excludeConsecutive,
    )

    # Persist the records next to the input data.
    fasta_path = os.path.join(filtered_dir, activity_seq_per_case_fasta_file_name)
    seq.save_fasta(activity_seq_per_case, fasta_path)

    return activity_seq_per_case, activity_alphabet_mapping


def variant_task_seq(data_path=cache_path, activities_file_name=activities_in_file_name,
                     variants_file_name=variants_in_file_name, aligned=False, excludeConsecutive=False):
    """Build the task sequence of every variant, optionally aligned.

    The variant id (prefixed with "V") is used as the record label and the
    variant's task list as the sequence.

    Args:
        data_path: Base directory; both CSV files are read from its
            ``filtered_folder_name`` sub-folder.
        activities_file_name: Name of the activities CSV. Its ``task`` column
            is the source of the alphabet mapping.
        variants_file_name: Name of the variants CSV. It must provide the
            ``variant_ID`` and ``task_list`` columns.
        aligned: When true, the sequences are aligned with
            ``aligned_task_seq`` and the records are rebuilt from the aligned
            strings through ``seq.get_aligned_fasta``.
        excludeConsecutive: Forwarded unchanged to ``seq.get_fasta`` and
            ``seq.get_aligned_fasta``.

    Returns:
        A ``(task_seq_per_variant, task_alphabet_mapping)`` tuple.
    """
    filtered_dir = os.path.join(data_path, filtered_folder_name)

    # Read the variants and the activities they were derived from.
    variants_df = pd.read_csv(os.path.join(filtered_dir, variants_file_name))
    activities_df = pd.read_csv(os.path.join(filtered_dir, activities_file_name))

    # `task_list` arrives as the text of a list (e.g. "['a', 'b']"): drop the
    # enclosing brackets, split on commas and strip blanks and quotes.
    # NOTE(review): a task name that itself contains a comma would be split
    # in two by this parsing - confirm such names cannot occur.
    variants_df['task_list'] = variants_df['task_list'].apply(
        lambda x: [task.strip().strip("'\"") for task in x[1:-1].split(',')]
    )

    # The alphabet mapping is derived from the activities' "task" column.
    task_alphabet_mapping = seq.get_seq_dict(activities_df, "task")
    task_seq_per_variant, sequences = seq.get_fasta(
        variants_df.apply(list),
        task_alphabet_mapping,
        "task_list",
        "variant_ID",
        label_prefix="V",
        excludeConsecutive=excludeConsecutive,
    )

    if aligned:
        print(task_alphabet_mapping)
        # Replace the raw records with ones built from the aligned sequences.
        variants_df["aligned_task_list"] = aligned_task_seq(sequences)
        task_seq_per_variant = seq.get_aligned_fasta(
            variants_df.apply(list),
            "aligned_task_list",
            "variant_ID",
            label_prefix="V",
            excludeConsecutive=excludeConsecutive,
        )

    return task_seq_per_variant, task_alphabet_mapping

def set_layout(data):
    """Make an alignment chart of ``data`` the layout of the shared Dash app.

    Args:
        data: Passed as the ``data`` argument of ``dashbio.AlignmentChart``.

    Side effects:
        Rebinds ``app.layout`` to a ``html.Div`` whose only child is the chart.
    """
    # Gap, conservation and consensus displays are all switched off.
    chart = dashbio.AlignmentChart(
        id='alignment-viewer',
        data=data,
        tilewidth=50,
        showgap=False,
        showconservation=False,
        showconsensus=False,
    )
    app.layout = html.Div([chart])

## Defining different types of case streams

In [None]:
def case_stream(excludeConsecutive=False, type="task"):
    """Build the per-case sequences of the requested kind and serve the chart.

    Args:
        excludeConsecutive: Forwarded unchanged to ``case_task_seq`` or
            ``case_activity_seq``.
        type: ``"task"`` or ``"activity"``; selects which of the two sequence
            builders is used. (The name shadows the built-in inside this
            function only; it is kept because callers pass it by keyword.)

    Raises:
        ValueError: If ``type`` is neither ``"task"`` nor ``"activity"``.

    Side effects:
        Prints a banner and the alphabet mapping, replaces the layout of the
        shared Dash app and starts its server in debug mode.
    """
    if type == "task":
        print("TASK STREAM BY CASE")
        data, seq_dict = case_task_seq(excludeConsecutive=excludeConsecutive)
    elif type == "activity":
        print("ACTIVITY STREAM BY CASE")
        data, seq_dict = case_activity_seq(excludeConsecutive=excludeConsecutive)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Type must be either task or activity")
    print(seq_dict)
    set_layout(data)

    # `run_server` was replaced by `run` in newer Dash releases; prefer `run`
    # and only fall back to the old name when `run` is not available.
    start_server = getattr(app, "run", None)
    if start_server is None:
        start_server = app.run_server
    start_server(debug=True)

## Visualize case stream

In [None]:
# Settings for the run below; both are forwarded to case_stream().
excludeConsecutive = True
# Named `stream_type` rather than `type` so that the built-in `type()` is not
# rebound for the rest of the notebook session.
stream_type = "task"

case_stream(excludeConsecutive=excludeConsecutive, type=stream_type)

# Debugging aids: case_task_seq / case_activity_seq store their latest
# result in these module-level names.
# print(task_seq_per_case)
# print(activity_seq_per_case)