## Import libraries

In [None]:
import sys
sys.path.append('../')

import os
import pandas as pd
import numpy as np
import src.utils.seq as seq
from configs.config import data as data_path, cache as cache_path

## Input data paths

In [None]:
# File and folder names used to locate this notebook's input data.
# CSV with the activities table; read by the Cases, Activities and Tasks classes.
activities_in_file_name = "activities.csv"
# Not referenced in the visible part of this notebook.
features_in_file_name = "features.csv"
# CSV with the variants table; read by the Variants class.
variants_in_file_name = "variants.csv"
# Sub-folder that is joined onto the cache path to find the CSV files above.
filtered_folder_name = "filtered"
# FASTA file names; not referenced in the visible part of this notebook.
task_seq_per_case_fasta_file_name = "task_seq_per_case.fasta"
activity_seq_per_case_fasta_file_name = "activity_seq_per_case.fasta"

## Case Summary

In [None]:
class Cases:
    """Print summary counts for the `case` column of the activities table."""

    # `case` values that mean "no case recorded" (NaN is handled separately).
    _MISSING_MARKERS = ["Missing", "None"]
    # `case` value that is reported separately and treated as unreliable.
    # NOTE(review): compared as text, so a purely numeric column would never
    # match -- confirm the column is read as strings.
    _ZERO_MARKER = "0"

    def __init__(self):
        self.cache_path = cache_path
        self.filtered_folder = filtered_folder_name
        self.activities_in_file = activities_in_file_name

    def get_activities(self):
        """Read the activities CSV from the filtered cache folder.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for
            ``<cache_path>/<filtered_folder>/<activities_in_file>``.
        """
        return pd.read_csv(
            os.path.join(self.cache_path, self.filtered_folder, self.activities_in_file)
        )

    def info(self):
        """Print how many cases are zero, missing and reliable.

        A row is *reliable* when its `case` is not NaN, not one of the
        missing markers ("Missing"/"None") and not "0".
        """
        activities = self.get_activities()
        case = activities['case']

        # Build element-wise masks. The previous code AND-ed the mask with
        # len(...), an integer, which never removed NaN rows and made the
        # whole filter depend on whether that count was odd or even.
        is_missing = case.isna() | case.isin(self._MISSING_MARKERS)
        is_zero = case == self._ZERO_MARKER
        reliable_cases = case[~(is_missing | is_zero)]

        print("RELIABLE = Cases not missing/None/NaN and not 0\n")
        print("Cases with value = 0: {}".format(int(is_zero.sum())))
        # Count every missing row; chaining the counts with `or` only
        # reported the first non-zero count.
        print("Missing cases: {}".format(int(is_missing.sum())))
        print("\n")
        print("Total reliable cases: {}".format(len(reliable_cases)))
        print("Unique reliable cases: {}".format(len(reliable_cases.unique())))
             
# Module-level Cases instance; its info() is called in a later cell.
cases = Cases()

## Activity Summary

In [None]:
class Activities:
    """Print summary counts for the `activity` column of reliable rows."""

    # `case` values that mean "no case recorded" (NaN is handled separately).
    _MISSING_MARKERS = ["Missing", "None"]
    # `case` value that is treated as unreliable.
    # NOTE(review): compared as text, so a purely numeric column would never
    # match -- confirm the column is read as strings.
    _ZERO_MARKER = "0"

    def __init__(self):
        self.cache_path = cache_path
        self.filtered_folder = filtered_folder_name
        self.activities_in_file = activities_in_file_name

    def get_activities(self):
        """Read the activities CSV from the filtered cache folder.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for
            ``<cache_path>/<filtered_folder>/<activities_in_file>``.
        """
        return pd.read_csv(
            os.path.join(self.cache_path, self.filtered_folder, self.activities_in_file)
        )

    def info(self):
        """Print the total and unique number of activities in reliable rows.

        A row is *reliable* when its `case` is not NaN, not one of the
        missing markers ("Missing"/"None") and not "0".
        """
        activities = self.get_activities()
        case = activities['case']

        # Element-wise masks. The previous code AND-ed the mask with
        # len(...), an integer, which never removed NaN rows and made the
        # whole filter depend on whether that count was odd or even.
        is_missing = case.isna() | case.isin(self._MISSING_MARKERS)
        is_zero = case == self._ZERO_MARKER
        reliable_activities = activities[~(is_missing | is_zero)]

        print("RELIABLE = Cases not missing/None/NaN and not 0\n")
        print("Total reliable activities: {}".format(len(reliable_activities)))
        # Label typo ("reilable") corrected.
        print("Unique reliable activities: {}".format(len(reliable_activities['activity'].unique())))
             
# Module-level Activities instance; its info() is called in a later cell.
activities = Activities()

## Task Summary

In [None]:
class Tasks:
    """Print summary counts for the `task` column of the activities table."""

    # `case` values that mean "no case recorded" (NaN is handled separately).
    _MISSING_CASE_MARKERS = ["Missing", "None"]
    # `case` value that is treated as unreliable.
    # NOTE(review): compared as text, so a purely numeric column would never
    # match -- confirm the column is read as strings.
    _ZERO_MARKER = "0"
    # `task` values that mean "no task discovered" (NaN is handled separately).
    _MISSING_TASK_MARKERS = ["Missing", "None", "", " "]

    def __init__(self):
        self.cache_path = cache_path
        self.filtered_folder = filtered_folder_name
        self.activities_in_file = activities_in_file_name

    def get_activities(self):
        """Read the activities CSV from the filtered cache folder.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for
            ``<cache_path>/<filtered_folder>/<activities_in_file>``.
        """
        return pd.read_csv(
            os.path.join(self.cache_path, self.filtered_folder, self.activities_in_file)
        )

    def info(self):
        """Print task counts for all rows and for reliable rows.

        A row is *reliable* when its `case` is not NaN, not one of the
        missing markers ("Missing"/"None") and not "0". A task counts as
        *discovered* when it is not NaN and not "Missing", "None", "" or " ".
        """
        activities = self.get_activities()
        case = activities['case']

        # Exclude NaN and "None" cases too, so the filter matches the
        # definition printed below.
        case_missing = case.isna() | case.isin(self._MISSING_CASE_MARKERS)
        case_zero = case == self._ZERO_MARKER
        reliable_activities = activities[~(case_missing | case_zero)]

        reliable_tasks = reliable_activities['task']
        # One mask for "no task", counted once. The previous code chained
        # counts with `or` (first non-zero count only) and combined counts
        # with bitwise `&` (a meaningless number).
        task_missing = reliable_tasks.isna() | reliable_tasks.isin(self._MISSING_TASK_MARKERS)
        undiscovered = int(task_missing.sum())
        discovered = len(reliable_tasks) - undiscovered

        print("RELIABLE = Cases not missing/None/NaN and not 0\n")
        print("Total tasks: {}".format(len(activities['task'])))
        print("Unique tasks: {}".format(len(activities['task'].unique())))
        print("\n")
        print("Total reliable tasks: {}".format(len(reliable_tasks)))
        print("Unique reliable tasks: {}".format(len(reliable_tasks.unique())))
        print("Reliable activities with no task discovered: {}".format(undiscovered))
        print("Reliable activities with discovered tasks: {}".format(discovered))
         
# Module-level Tasks instance; its info() is called in a later cell.
tasks = Tasks()

## Variants Summary

In [None]:
class Variants:
    """Print summary counts for the variants table."""

    def __init__(self):
        self.cache_path = cache_path
        self.filtered_folder = filtered_folder_name
        self.variants_in_file = variants_in_file_name

    def get_variants(self):
        """Read the variants CSV from the filtered cache folder.

        Returns:
            The DataFrame produced by ``pd.read_csv`` for
            ``<cache_path>/<filtered_folder>/<variants_in_file>``.
        """
        variants = pd.read_csv(
            os.path.join(self.cache_path, self.filtered_folder, self.variants_in_file)
        )
        return variants

    def info(self):
        """Print variant/subvariant totals and the share that has count 1.

        Each row of the table is counted as one subvariant; distinct
        `variant_ID` values are counted as variants.
        """
        variants = self.get_variants()
        total_rows = len(variants)
        single_case_rows = int((variants['count'] == 1).sum())
        # An empty table would otherwise raise ZeroDivisionError.
        single_case_pct = (single_case_rows / total_rows) * 100 if total_rows else 0.0

        print("Total variants: {}".format(len(variants['variant_ID'].unique())))
        print("Total subvariants: {}".format(total_rows))
        print("Total variants/subvariants that occur in a single case: {} ({})%".format(single_case_rows, single_case_pct))

# Module-level Variants instance; its info() is called in a later cell.
variants = Variants()

## Check Info

In [None]:
# Print the case summary.
cases.info()

In [None]:
# Print the activity summary.
activities.info()

In [None]:
# Print the task summary.
tasks.info()

In [None]:
# Print the variants summary.
variants.info()