# sktime Interview Demo

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import sktime
from sktime.datasets import load_from_tsfile_to_dataframe
import pandas as pd
import numpy as np

# import to retrieve examples
from sktime.datatypes import get_examples

In [None]:
pd_example = get_examples(mtype="pd-multiindex", as_scitype="Panel")[0]


In [None]:
pd_example

In [None]:
pd_example.index

In [None]:
arrays = [
    np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]),  # This will be a list of file name
    np.array(["one", "two", "one", "two", "one", "two", "one", "two"]),  # This will the the list of all .time step for each file
]

In [None]:
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
index

## Example of constructing MultiIndex Dataframe

In [None]:
df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=arrays)  # You "theoretically can keep the unit column here but it's best to leave out for sktime"
df

In [None]:
pd.DataFrame(np.random.randn(8, 8), index=index, columns=index)

In [None]:
s = pd.Series(np.random.randn(8), index=index)
s

In [None]:
import os
from io import StringIO

In [None]:
path = "H:/Tensorflow Dataset/Example csv for SPS - SPLNG/"

In [None]:
all_files = os.listdir(path)
csv_files = list(filter(lambda f: f.endswith('.csv'), all_files))
csv_files

In [None]:
# Equivalent code in pandas
csv_data_pd = pd.read_csv(path + csv_files[0])
csv_data_pd = csv_data_pd.drop([0]).astype(np.float16).to_numpy()
print(csv_files[0])
print(csv_data_pd)
len(csv_data_pd)


In [None]:
csv_data_pd = pd.read_csv(path + csv_files[0])
csv_data_pd = csv_data_pd.drop([0]).astype(np.float32)
csv_data_pd.to_numpy()


In [None]:
# Equivalent code in numpy
csv_data_np = np.genfromtxt(fname=path + csv_files[0], dtype=np.float32, delimiter=",", skip_header=2)
csv_data_np

In [None]:
from numpy.testing import assert_allclose
assert_allclose(csv_data_pd, csv_data_np)

The using pandas to read csv is 3 times faster than numpy. So we will use pandas in our code.

## The base load data class prep_csv

In [69]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

class prep_csv:
    def __init__(
        self,
        path=None,
        file_format=".csv",
        dtype=np.float32,
        decimal=2,
        force_ending="_F_LBF"
    ):
        self.path=path
        self.file_format=file_format
        self.dtype=dtype
        self.decimal=decimal
        self.force_ending=force_ending
        decimal_point='{:,.'+str(self.decimal)+'f}'
        pd.options.display.float_format = decimal_point.format
        super().__init__()

    def _load_data(self):
        """
        Build a multiIndex Dataframe for all cases
        return: A multiIndex DataFrame of all files 
        Index 0: case name
        Index 1: time series
        To "extract" dataframe for a specific case, use the formula
        ResultMax.filter(items=['case name'], axis=0)
        
        Parameter:
        path: absolute or relative path to the folder where csv files are
        file_format: only suport csv at the moment
        dtype: numpy dtype, np.float16, np.float32 or np.float64
        decimal: number of decimal points to display

        """
        file_list = os.listdir(self.path)
        chosen_files = list(filter(lambda f: f.endswith(self.file_format), file_list))
        num_files = len(chosen_files)
        case = []
        time = []
        columns = []
        #content = []
        for i, file in enumerate(tqdm(chosen_files, bar_format='{desc}{percentage:3.0f}%{r_bar}')):
            case_name = file.replace(self.file_format,"")
            current_file = pd.read_csv(path + file)
            if i==0:
                columns = np.array(current_file.columns[1:], dtype="U150")
                columns = np.char.strip(columns)  # Remove white space
                content = np.empty([1,len(current_file.columns)-1])
                current_file = current_file.drop([0])
                new_index = np.arange(len(current_file))  # Reset index to 0
                current_file = current_file.reindex(new_index, method="backfill")
                current_time = current_file.iloc[:,0].astype(self.dtype).to_numpy()  # Get time array
                current_content = current_file.iloc[:,1:].astype(self.dtype).to_numpy()  # Get content array
                # Get case name array to have the same length as time series in order to create MultiIndex
                current_case = np.empty(len(current_time), dtype='S150')
                current_case.fill(case_name)
                # Append all arays
                case = np.append(case, current_case)
                time = np.append(time, current_time)
                content = np.concatenate((content, current_content), axis=0)
                content = np.delete(content, 0, 0)  # Delete first row because it's an empty array
                print(f"Finished {case_name}")
            else:
                case_name = file.replace(self.file_format,"")
                current_file = pd.read_csv(self.path + file)
                current_file = current_file.drop([0])
                new_index = np.arange(len(current_file))  # Reset index to 0
                current_file = current_file.reindex(new_index, method="backfill")
                current_time = current_file.iloc[:,0].astype(self.dtype).to_numpy()  # Get time array
                current_content = current_file.iloc[:,1:].astype(self.dtype).to_numpy()  # Get content array
                # Get case array to have the same length as time series in order to create MultiIndex
                current_case = np.empty(len(current_time), dtype="U150")
                current_case.fill(case_name)
                # Append all arays
                case = np.append(case, current_case)
                time = np.append(time, current_time)
                content = np.concatenate((content, current_content), axis=0)
                print(f"Finished {case_name}")

        # Create MultiIndex array    
        index_array = [
            np.array(case),  
            np.array(time),  
        ]
        tuples = list(zip(*index_array))
        index = pd.MultiIndex.from_tuples(tuples, names=["CASE", "TIME"])
        all_cases = pd.DataFrame(
        content, 
        index=index, 
        columns=columns)        
        return all_cases.astype(self.dtype)      
    
    def _prep_max_min(self):
        all_data = self._load_data()
        return all_data.groupby(level=0).agg(["max"]).droplevel(1, axis=1), all_data.groupby(level=0).agg(["min"]).droplevel(1, axis=1)

    def _prep_force(self):
        force_max, force_min = self._prep_max_min()
        # Apply masking to extract force
        selected_columns = list(filter(lambda f: f.endswith(self.force_ending), force_max.columns))
        clean_columns = [col.replace(self.force_ending, "") for col in selected_columns]
        # Process max
        selected_force_max = force_max[selected_columns].apply(lambda x: x/1000)
        selected_force_max.columns = clean_columns
        df_force_max = selected_force_max.max().apply(lambda x: 0 if x <0 else x).to_dict()
        idx_force_max = selected_force_max.idxmax().to_dict()
        # Process min
        selected_force_min = force_min[selected_columns].apply(lambda x: x/(-1000))
        selected_force_min.columns = clean_columns
        df_force_min = selected_force_min.min().apply(lambda x: 0 if x <0 else x).to_dict()
        idx_force_min = selected_force_min.idxmin().to_dict()
        # Create MultiIndex Header
        header=[np.array(["With Flow", "Against Flow", "With Flow Controlling Case", "Against Flow Controlling Case"]), np.array(["kips", "kips", "", ""])]
        result=pd.DataFrame([df_force_max, df_force_min, idx_force_max, idx_force_min], index=header).T
        return result

    def force_report(self):
        return self._prep_force()

In [70]:
path = 'H:/Tensorflow Dataset/Example csv for SPS - SPLNG/'

In [71]:
#pd.options.display.float_format = '{:,.2f}'.format
PrepMyCSV = prep_csv(path=path, dtype=np.float32, decimal=2, force_ending="_F_LBF")
Result = PrepMyCSV.force_report()
Result

  0%| 0/11 [00:00<?, ?it/s]

Finished phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase_csrate05
Finished phase1_2_10k_seg_esd1w_trip_4_pumps_each_phase_csrate05
Finished phase1_2_10k_seg_esd1_trip_4_pumps_each_phase_csrate05
Finished phase1_2_11k_seg_esd1e_trip_4_pumps_each_phase_csrate05
Finished phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase_csrate05
Finished phase1_2_11k_seg_esd1_trip_4_pumps_each_phase_csrate05
Finished phase1_2_12k_seg_esd1e_trip_4_pumps_each_phase_csrate05
Finished phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase_csrate05
Finished phase1_2_12k_seg_esd1_trip_4_pumps_each_phase_csrate05
Finished phase1_2_12k_seg_pwrfail_8k_rd_csrate5
Finished phase1_2_12k_seg_pwrfail_csrate2


Unnamed: 0_level_0,With Flow,Against Flow,With Flow Controlling Case,Against Flow Controlling Case
Unnamed: 0_level_1,kips,kips,Unnamed: 3_level_1,Unnamed: 4_level_1
101A_PUMP_RISER,18.02,17.11,phase1_2_10k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_12k_seg_pwrfail_csrate2
16_PL_24210_NO_C8_1S2_1A,7.51,0.58,phase1_2_10k_seg_esd1_trip_4_pumps_each_phase_...,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
16_PL_24210_NO_C8_1S2_1B,6.86,0.01,phase1_2_10k_seg_esd1_trip_4_pumps_each_phase_...,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
16_PL_24210_NO_C8_1S2_2,37.45,0.31,phase1_2_10k_seg_esd1_trip_4_pumps_each_phase_...,phase1_2_12k_seg_pwrfail_csrate2
16_PL_24210_NO_C8_1S2_3A,1.62,0.00,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
...,...,...,...,...
3A2-PL-24740-01,1.24,0.23,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
3A2-PL-24740-02,2.26,0.05,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
3A2-PL-24740-03,1.15,0.00,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
3A2-PL-24740-04,2.01,0.00,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...


In [72]:
Result.columns

MultiIndex([(                    'With Flow', 'kips'),
            (                 'Against Flow', 'kips'),
            (   'With Flow Controlling Case',     ''),
            ('Against Flow Controlling Case',     '')],
           )

Sort from High to low for With Flow

In [75]:
Result.sort_values(by=[('With Flow', 'kips')], ascending=0)

Unnamed: 0_level_0,With Flow,Against Flow,With Flow Controlling Case,Against Flow Controlling Case
Unnamed: 0_level_1,kips,kips,Unnamed: 3_level_1,Unnamed: 4_level_1
3R2_PL_24700_07,48.12,0.00,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...
3R2_PL_24700_11,47.75,0.00,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...
TR123456_TK123_PH2_LL_3,46.76,0.00,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_10k_seg_esd1w_trip_4_pumps_each_phase...
PH1LLT_PH2LLT_4,41.91,0.00,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...,phase1_2_10k_seg_esd1w_trip_4_pumps_each_phase...
PH1LLA_PH1LLT_1,40.00,0.00,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...
...,...,...,...,...
RGHRT_RGTK3_1,0.00,0.08,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_12k_seg_esd1w_trip_4_pumps_each_phase...
1C1_PL_24405_01,0.00,3.85,phase1_2_10k_seg_esd1_trip_4_pumps_each_phase_...,phase1_2_10k_seg_esd1w_trip_4_pumps_each_phase...
TR56SPLT_TR56TK45_1,0.00,9.31,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_11k_seg_esd1e_trip_4_pumps_each_phase...
TR56SPLT_TR56TK123_1,0.00,5.77,phase1_2_12k_seg_pwrfail_csrate2,phase1_2_10k_seg_esd1e_trip_4_pumps_each_phase...


Sort from High to low for Against Flow

In [76]:
Result.sort_values(by=[('Against Flow', 'kips')], ascending=0)

Unnamed: 0_level_0,With Flow,Against Flow,With Flow Controlling Case,Against Flow Controlling Case
Unnamed: 0_level_1,kips,kips,Unnamed: 3_level_1,Unnamed: 4_level_1
1B1_PL_24305_01,0.00,24.14,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
1A1_PL_24205_01,0.00,23.94,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
PH1LLT_PH2LLT_2,6.14,23.64,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_12k_seg_pwrfail_8k_rd_csrate5
TR12MRG_TR12SPLT_1,0.00,21.70,phase1_2_12k_seg_pwrfail_csrate2,phase1_2_12k_seg_pwrfail_csrate2
1R1_PL_25001_01,0.04,21.63,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_12k_seg_pwrfail_csrate2
...,...,...,...,...
TR5AB_TR56MRG_7,0.53,0.00,phase1_2_12k_seg_pwrfail_csrate2,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
TR5AB_TR56MRG_8,0.45,0.00,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
TR5AB_TR56MRG_9,0.45,0.00,phase1_2_12k_seg_pwrfail_8k_rd_csrate5,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
TR6B_TR6AB_3,2.33,0.00,phase1_2_12k_seg_pwrfail_csrate2,phase1_2_11k_seg_esd1w_trip_4_pumps_each_phase...
