# A Script for Loading and Processing .csv Data

## by Tan Tran

Tutorial for sktime interview

## Example scenario: 
* Given time series Case A.csv...CaseH.csv
* For features ending with :PMAX.HVAL, report the max values each feature.
* For features ending with _F_LBF, report the max and min values for each feature/case and case resulting in those value


* Problem: Using Excel for loading and processing .csv is not practical.
    1. Excel requires extreme labor
    2. `VBA`, even worse

* Goal:
    1. Perform load, extract and process
    2. Expected result: User-friendly and Excel-friendly DataFrame or equivalent
    3. Plots

* Solution: Use `pandas`

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import sktime
import pandas as pd
import numpy as np


## Implementation

## Structure


1. Master Class: `prep_csv`
2. Public method: `prep_csv()` and `update()`
3. Internal methods: `_load_data()`, `_prep_max_min()`, `_add_bool_to_max_min()`,
`_prep_force()`, `_prep_mm()`

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

class prep_csv:
    def __init__(
        self,
        path=None,
        file_format=".csv",
        dtype=np.float32,
        decimal=2,
        force_ending="_F_LBF",
        custom_ending="MM_",
        custom_attribute=None
    ):
        self.path=path
        self.file_format=file_format
        self.dtype=dtype
        self.decimal=decimal
        self.force_ending=force_ending
        self.custom_ending=custom_ending
        self.custom_attribute=custom_attribute
        # Set precision display
        decimal_point='{:,.'+str(self.decimal)+'f}'
        pd.options.display.float_format = decimal_point.format
        super().__init__()

        # DataFrame-style report
        self.force_report=None
        self.mm_report=None
        # Internal variables
        self.segment=None
        self.all_data=None
        self.fmax=None
        self.fmin=None
        self.all_max=None
        self.all_min=None
        self.custom_attribute_report=None
        self.selected_case=None

    def prep_csv(self):
        self._load_data()
        self._prep_max_min()
        self._add_bool_to_max_min()
        self._prep_force()
        self._prep_mm()
        if isinstance(self.custom_attribute, type(None)) is not True:
            self._get_custom_attribute()
        return self

    def update(self):
        self._filter_case()
        self._prep_force()
        self._prep_mm()
        if isinstance(self.custom_attribute, type(None)) is not True:
            self._get_custom_attribute()
        print("Update finished!")

    def _load_data(self):
        """
        Load all data into one Super MultiIndex DataFrame
        return: A multiIndex DataFrame of all files 
        Index level 0: case name
        Index level 1: time series
        Return: A MultiIndex DataFrame containing all data

        To "extract" dataframe for a specific case, use the formula
        ResultMax.filter(items=['case name'], axis=0)
        
        Parameter:
        path: absolute or relative path to the folder where csv files are
        file_format: only suport csv at the moment
        dtype: numpy dtype, np.float16, np.float32 or np.float64
        decimal: number of decimal points to display
        force_ending: the suffix or prefix used to identify force
        custom_ending: the suffix or prefix used to identify custom feature (pressure)
        custom_attribute: the suffix or prefix used to identify other feature, without processing max-min

        How to use:
        1/ Instantiate the class with parameters
        2/ Call class.prep_csv()
        3/ If needed, remove some cases by assessing the INCLUDE boolean with class.selected_case.
        Set the non-needed case to be zero
        4/ Update the inner state with class.update() 

        Attribute:
        fmax, fmin: Maximum or minimum force in the segment. If minimum, the values will be absolute value.
        force_report: Maximum force and associated case.
        mm_report: Maximum pressure ans associated case.

        """
        # Get a list of files
        file_list = os.listdir(self.path)
        chosen_files = list(filter(lambda f: f.endswith(self.file_format), file_list))
        num_files = len(chosen_files)
        case = []
        time = []
        columns = []
        #content = []
        for i, file in enumerate(tqdm(chosen_files, bar_format='{desc}{percentage:3.0f}%{r_bar}')):
            case_name = file.replace(self.file_format,"")
            # Read each file, first loop is different because we do not know column labels yet.
            current_file = pd.read_csv(path + file)
            if i==0:
                print(f"Now working on {case_name}...")
                columns = np.array(current_file.columns[1:], dtype="U200")
                columns = np.char.strip(columns)
                content = np.empty([1,len(current_file.columns)-1])
                current_file = current_file.drop([0])
                new_index = np.arange(len(current_file))  # Reset index to 0
                current_file = current_file.reindex(new_index, method="backfill")
                # For each file, we get, case name, time array, content array, column labels
                current_time = current_file.iloc[:,0].astype(self.dtype).to_numpy()  # Get time array
                current_content = current_file.iloc[:,1:].astype(self.dtype).to_numpy()  # Get content array
                # Get case name array to have the same length as time series in order to create MultiIndex
                current_case = np.empty(len(current_time), dtype='U200')
                current_case.fill(case_name)
                # Append all arays
                case = np.append(case, current_case)
                time = np.append(time, current_time)
                content = np.concatenate((content, current_content), axis=0)
                content = np.delete(content, 0, 0)  # Delete first row because it's an empty array
                print(f"Finished {case_name}")
            else:
                case_name = file.replace(self.file_format,"")
                print(f"Now working on {case_name}...")
                current_file = pd.read_csv(self.path + file)
                current_file = current_file.drop([0])
                new_index = np.arange(len(current_file))  # Reset index to 0
                current_file = current_file.reindex(new_index, method="backfill")
                current_time = current_file.iloc[:,0].astype(self.dtype).to_numpy()  # Get time array
                current_content = current_file.iloc[:,1:].astype(self.dtype).to_numpy()  # Get content array
                # Get case array to have the same length as time series in order to create MultiIndex
                current_case = np.empty(len(current_time), dtype="U200")
                current_case.fill(case_name)
                # Append all arays
                case = np.append(case, current_case)
                time = np.append(time, current_time)
                content = np.concatenate((content, current_content), axis=0)
                print(f"Finished {case_name}")

        # Create MultiIndex array    
        index_array = [
            np.array(case),  
            np.array(time),  
        ]
        tuples = list(zip(*index_array))
        index = pd.MultiIndex.from_tuples(tuples, names=["CASE", "TIME"])
        # Create final DataFrame
        all_cases = pd.DataFrame(
        content, 
        index=index, 
        columns=columns)        
        self.all_data=all_cases.astype(self.dtype) 
        return      
    
    def _prep_max_min(self):
        # Create 2 DataFrame: Max and Min 
        self.all_max=self.all_data.groupby(level=0).agg(["max"]).droplevel(1, axis=1)  # Max values of each case (level 0)
        self.all_min=self.all_data.groupby(level=0).agg(["min"]).droplevel(1, axis=1)  # Min values of each case (level 0)
        return self

    def _prep_force(self):
        # Find maximum/minimum of force/feature out of all cases and the case associated with it  
        all_data_max=self.all_max
        all_data_max=all_data_max[all_data_max["INCLUDE"]>0]  # Only keep cases whose "INCLUDE" value is 1
        all_data_min=self.all_min
        all_data_min=all_data_min[all_data_min["INCLUDE"]>0]
        # Apply masking to extract force
        selected_columns = list(filter(lambda f: f.endswith(self.force_ending) or f.startswith(self.force_ending), all_data_max.columns))
        clean_columns = [col.replace(self.force_ending, "") for col in selected_columns]
        # Process max
        selected_force_max = all_data_max[selected_columns].apply(lambda x: x/1000)
        selected_force_max.columns = clean_columns
        self.segment = clean_columns
        df_force_max = selected_force_max.max().apply(lambda x: 0 if x<0 else x)
        # Apply mask to set negative value to 0
        x = selected_force_max > 0
        self.fmax = selected_force_max.where(x,0)
        df_force_max=df_force_max.to_dict()
        idx_force_max = selected_force_max.idxmax().to_dict()
        # Process min
        selected_force_min = all_data_min[selected_columns].apply(lambda x: x/(-1000))
        selected_force_min.columns = clean_columns
        df_force_min = selected_force_min.min().apply(lambda x: 0 if x<0 else x)
        # Apply mask to set negative value to 0
        x = selected_force_min > 0
        self.fmin = selected_force_min.where(x,0)
        df_force_min=df_force_min.to_dict()
        idx_force_min = selected_force_min.idxmin().to_dict()
        # Save result into a MultiIndex DataFrame
        header=[np.array(["With Flow", "Against Flow", "With Flow", "Against Flow"]), np.array(["kips", "kips", "Controlling Case", "Controlling Case"])]
        self.force_report=pd.DataFrame([df_force_max, df_force_min, idx_force_max, idx_force_min], index=header).T
        # Get worst force column and move it next to 'With Flow' column
        self.force_report[('Worst Force', 'kips')]=self.force_report[[('With Flow', 'kips'), ('Against Flow', 'kips')]].max(axis=1)
        move_col = self.force_report.pop(('Worst Force', 'kips'))
        self.force_report.insert(2, ('Worst Force', 'kips'), move_col)
        print("Processing Finished!")
        print("Min Flow is reported as absolute value. Actual Min Flow values are negative. If positive, it is set to be 0.")
        print("Likewise, if With Flow is negative, it is set to be 0.")
        return self
    
    def _prep_mm(self):
        "Process maximum pressure"
        all_data_max=self.all_max
        all_data_max=all_data_max[all_data_max["INCLUDE"]>0]  # Only keep cases whose "INCLUDE" value is 1
        # Apply masking to extract attribute
        selected_columns = list(filter(lambda f: f.endswith(self.custom_ending) or f.startswith(self.custom_ending), all_data_max.columns))
        clean_columns = [col.replace(self.custom_ending, "") for col in selected_columns]
        # Process max
        selected_max = all_data_max[selected_columns]
        selected_max.columns = clean_columns
        self.mm_report = selected_max
        return self

    def _get_custom_attribute(self):
        all_data=self.all_data
        # Apply masking to extract attribute
        selected_columns = list(filter(lambda f: f.endswith(self.custom_attribute) or f.startswith(self.custom_attribute), all_data.columns))
        clean_columns = [col.replace(self.custom_ending, "") for col in selected_columns]
        # Process max
        selected_data = all_data[selected_columns]
        selected_data.columns = clean_columns
        self.custom_attribute_report = selected_data
        return self

    # Method to be called by self.update(), allowing case selection
    def _add_bool_to_max_min(self):
        # Only call this once after load_data to add an "Include" column to all_max and all_min
        include_bool=np.empty(len(self.all_max), dtype=np.int32)
        include_bool.fill(1)
        self.all_max["INCLUDE"] = include_bool
        self.all_min["INCLUDE"] = include_bool
        self.selected_case = self.all_max[["INCLUDE"]]
        self.selected_case=self.selected_case.reset_index()
        return self
    
    def _filter_case(self):
        # called by update() when updating which case to include in the analysis
        self.all_max["INCLUDE"]=self.selected_case["INCLUDE"].values
        self.all_min["INCLUDE"]=self.selected_case["INCLUDE"].values
        return self

Instantiate the class

In [3]:
path = 'E:/sktime demo/'
PrepMyCSV = prep_csv(path=path, dtype=np.float32, decimal=2, force_ending="_F_LBF", custom_ending=":PMAX.HVAL")


## Call `prep_csv()` - Same as `fit()`

In [4]:
PrepMyCSV.prep_csv()

  0%| 0/6 [00:00<?, ?it/s]

Now working on CaseA...
Finished CaseA
Now working on CaseB...
Finished CaseB
Now working on CaseC...
Finished CaseC
Now working on CaseD...
Finished CaseD
Now working on CaseE...
Finished CaseE
Now working on CaseF...
Finished CaseF
Processing Finished!
Min Flow is reported as absolute value. Actual Min Flow values are negative. If positive, it is set to be 0.
Likewise, if With Flow is negative, it is set to be 0.


<__main__.prep_csv at 0x1d13f078908>

All files are now in panel-type data

In [5]:
PrepMyCSV.all_data

Unnamed: 0_level_0,Unnamed: 1_level_0,XV24541:P-,XV24521:P-,XV24501:P-,XV24641:P-,XV24621:P-,XV24601:P-,XV24581:P-,XV24681:P-,XV24571:P-,XV24671:P-,...,LOADINGARMS:PMAX.HVAL,LOADINGARMS:MASP.HDIF,EXTRA:PMAX.HVAL,EXTRA:MASP.HDIF,INTERTANK:PMAX.HVAL,INTERTANK:MASP.HDIF,SHIPPIPING:PMAX.HVAL,SHIPPIPING:MASP.HDIF,MM_SYSTEM:PMAX.HVAL,MM_SYSTEM:MASP.HDIF
CASE,TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
CaseA,0.00,49.98,49.92,49.87,53.34,53.34,53.34,55.37,51.29,55.30,51.25,...,49.28,253.28,53.88,386.19,228.20,211.89,14.86,287.72,233.66,68.89
CaseA,0.00,49.98,49.92,49.87,53.34,53.34,53.34,55.37,51.29,55.30,51.25,...,49.28,253.28,53.88,386.19,228.20,211.89,14.86,287.72,233.66,68.89
CaseA,0.00,49.98,49.92,49.87,53.34,53.34,53.34,55.37,51.29,55.30,51.25,...,49.28,253.28,53.88,386.19,228.20,211.89,14.86,287.72,233.66,68.89
CaseA,0.00,49.98,49.92,49.87,53.34,53.34,53.34,55.37,51.29,55.30,51.25,...,49.28,253.28,53.88,386.19,228.20,211.89,14.86,287.72,233.66,68.89
CaseA,0.00,49.98,49.92,49.87,53.34,53.34,53.34,55.37,51.29,55.30,51.25,...,49.28,253.28,53.88,386.19,228.20,211.89,14.86,287.72,233.66,68.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CaseF,7.34,2.57,2.57,2.57,54.33,54.33,54.33,63.89,59.48,42.13,37.70,...,72.06,230.50,195.85,244.23,226.38,213.70,47.48,255.10,231.02,71.54
CaseF,7.38,2.57,2.57,2.57,54.33,54.33,54.33,63.89,59.48,42.14,37.71,...,72.06,230.50,195.85,244.23,226.38,213.70,47.48,255.10,231.02,71.54
CaseF,7.42,2.57,2.57,2.57,54.33,54.33,54.33,63.89,59.48,42.15,37.71,...,72.06,230.50,195.85,244.23,226.38,213.70,47.48,255.10,231.02,71.54
CaseF,7.48,2.57,2.57,2.57,54.33,54.33,54.33,63.89,59.48,42.15,37.72,...,72.06,230.50,195.85,244.23,226.38,213.70,47.48,255.10,231.02,71.54


Result: DataFrames showing Max and Min values for each feature per case

In [6]:
PrepMyCSV.all_min

Unnamed: 0_level_0,XV24541:P-,XV24521:P-,XV24501:P-,XV24641:P-,XV24621:P-,XV24601:P-,XV24581:P-,XV24681:P-,XV24571:P-,XV24671:P-,...,LOADINGARMS:MASP.HDIF,EXTRA:PMAX.HVAL,EXTRA:MASP.HDIF,INTERTANK:PMAX.HVAL,INTERTANK:MASP.HDIF,SHIPPIPING:PMAX.HVAL,SHIPPIPING:MASP.HDIF,MM_SYSTEM:PMAX.HVAL,MM_SYSTEM:MASP.HDIF,INCLUDE
CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CaseA,49.98,49.92,49.87,53.33,53.33,53.33,55.37,51.29,55.3,51.25,...,212.65,53.88,275.62,228.2,211.89,14.86,233.45,233.66,68.89,1
CaseB,10.96,11.01,11.15,66.94,66.94,66.94,41.67,37.94,41.73,38.35,...,198.14,70.09,254.77,226.37,213.71,15.59,223.01,230.59,71.97,1
CaseC,40.48,40.92,40.9,54.32,54.32,54.32,48.77,44.15,2.05,-8.7,...,234.93,77.83,239.91,226.38,213.7,14.86,261.13,231.02,71.54,1
CaseD,1.85,1.43,1.34,66.94,66.94,66.94,69.26,65.3,69.14,65.24,...,204.93,70.09,172.54,226.37,173.48,15.59,235.39,230.58,71.97,1
CaseE,0.96,1.08,1.43,66.94,66.94,66.94,19.76,16.0,18.68,15.36,...,206.8,70.09,250.82,226.37,213.71,15.59,235.91,230.59,71.97,1
CaseF,-8.79,-1.4,-2.53,54.32,54.32,54.32,47.46,43.21,0.94,-5.7,...,230.5,77.83,244.23,226.38,213.7,14.86,255.1,231.02,71.54,1


In [7]:
PrepMyCSV.all_max

Unnamed: 0_level_0,XV24541:P-,XV24521:P-,XV24501:P-,XV24641:P-,XV24621:P-,XV24601:P-,XV24581:P-,XV24681:P-,XV24571:P-,XV24671:P-,...,LOADINGARMS:MASP.HDIF,EXTRA:PMAX.HVAL,EXTRA:MASP.HDIF,INTERTANK:PMAX.HVAL,INTERTANK:MASP.HDIF,SHIPPIPING:PMAX.HVAL,SHIPPIPING:MASP.HDIF,MM_SYSTEM:PMAX.HVAL,MM_SYSTEM:MASP.HDIF,INCLUDE
CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CaseA,176.14,176.3,176.98,53.34,53.34,53.34,189.99,181.77,189.41,179.7,...,253.28,164.46,386.19,228.2,211.89,69.14,287.72,233.66,68.89,1
CaseB,122.31,122.33,122.51,66.98,66.98,66.98,195.48,186.36,193.93,183.45,...,241.0,185.31,369.98,226.37,213.71,79.58,286.99,230.59,71.97,1
CaseC,72.18,72.19,72.24,54.33,54.33,54.33,76.63,72.41,112.64,105.1,...,253.21,200.16,362.25,226.38,213.7,41.45,287.72,328.17,71.54,1
CaseD,106.99,106.98,107.08,66.98,66.98,66.98,317.16,312.64,317.99,313.82,...,241.0,267.54,369.98,266.6,213.71,67.19,286.99,317.67,71.97,1
CaseE,105.95,105.94,106.06,66.98,66.98,66.98,199.31,189.85,197.99,186.67,...,241.0,189.25,369.98,226.37,213.71,66.67,286.99,230.59,71.97,1
CaseF,78.52,78.65,78.75,54.33,54.33,54.33,89.22,84.75,201.32,187.87,...,253.21,195.85,362.25,226.38,213.7,47.48,287.72,231.02,71.54,1


### Report showing max pressure for each case

Recap: 
* For features ending with :PMAX.HVAL, report the max values and the case associated for each files.

In [8]:
PrepMyCSV.mm_report

Unnamed: 0_level_0,TANKA_RD,TANKB_RD,TANKC_RD,RUNDOWNLINES,T1_MAIN,T2_MAIN,T3_MAIN,COOLDOWNLINES,LOADINGLINES_RD,T1_COOLDOWN,...,TANKB,TANKC,LOADINGLINES,WESTJETTY,EASTJETTY,LOADINGARMS,EXTRA,INTERTANK,SHIPPIPING,MM_SYSTEM
CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CaseA,227.36,231.39,229.41,231.57,233.57,233.66,233.61,232.21,-9.999999778196306e+21,232.2,...,59.04,152.62,188.26,53.34,176.59,89.92,164.46,228.2,69.14,233.66
CaseB,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,169.98,192.8,66.98,122.51,104.42,185.31,226.37,79.58,230.59
CaseC,227.4,228.86,228.86,231.02,231.02,231.02,231.02,223.57,-9.999999778196306e+21,231.02,...,60.06,179.17,200.14,54.32,71.47,67.63,200.16,226.38,41.45,328.17
CaseD,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,247.76,316.83,66.98,107.08,97.66,267.54,266.6,67.19,317.67
CaseE,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,174.01,195.96,66.98,106.06,95.79,189.25,226.37,66.67,230.59
CaseF,227.4,228.86,228.86,231.02,231.02,231.02,231.02,223.57,-9.999999778196306e+21,231.02,...,60.06,177.45,198.14,54.32,78.33,72.06,195.85,226.38,47.48,231.02


### Report showing max and min force for each segment for each case

* For features ending with _F_LBF, report the max and min values and the case associated with it out of all the files.

In [9]:
PrepMyCSV.force_report

Unnamed: 0_level_0,With Flow,Against Flow,Worst Force,With Flow,Against Flow
Unnamed: 0_level_1,kips,kips,kips,Controlling Case,Controlling Case
CDML_LDJN_6,0.00,0.00,0.00,CaseB,CaseC
CDML_LDJN_7,0.00,0.09,0.09,CaseB,CaseB
CDML_LDJN_8,0.00,0.00,0.00,CaseB,CaseE
CDML_LDJN_9,1.93,0.00,1.93,CaseA,CaseA
CDML_LDJN_10,0.00,0.04,0.04,CaseD,CaseD
...,...,...,...,...,...
JHB6_JHB8_154_155A,0.00,1.11,1.11,CaseA,CaseA
JHB6_JHB8_154_155B,2.71,0.00,2.71,CaseE,CaseE
JHC6_JHC8_163_164,2.84,0.65,2.84,CaseC,CaseB
JHC6_JHC8_163_164A,0.04,8.75,8.75,CaseF,CaseA


Filter by With Flow, from high to low

In [10]:
PrepMyCSV.force_report.sort_values(by=[('With Flow', 'kips')], ascending=0).head()

Unnamed: 0_level_0,With Flow,Against Flow,Worst Force,With Flow,Against Flow
Unnamed: 0_level_1,kips,kips,kips,Controlling Case,Controlling Case
XVL1_JHL6_123,55.84,0.0,55.84,CaseD,CaseD
XVR1_JHR6_79,55.63,0.0,55.63,CaseD,CaseD
XVR1_JHR6_85,54.77,0.0,54.77,CaseD,CaseB
XVL1_JHL6_129,54.68,0.0,54.68,CaseD,CaseB
JHA6_JHA8_63,51.05,0.0,51.05,CaseD,CaseC


In [11]:
PrepMyCSV.force_report.sort_values(by=[('Against Flow', 'kips')], ascending=0).head()

Unnamed: 0_level_0,With Flow,Against Flow,Worst Force,With Flow,Against Flow
Unnamed: 0_level_1,kips,kips,kips,Controlling Case,Controlling Case
ML12_TKMF_186,0.0,45.24,45.24,CaseE,CaseB
P1C1_J1C1_41,0.0,41.22,41.22,CaseE,CaseB
P1A2_J1A2_6,0.0,40.91,40.91,CaseA,CaseB
P1C2_J1C2_46,0.0,40.55,40.55,CaseB,CaseC
P1C3_J1C3_51,0.0,40.45,40.45,CaseB,CaseC


## Case Selection - Use of update()

All cases being evaluated

In [12]:
PrepMyCSV.selected_case

Unnamed: 0,CASE,INCLUDE
0,CaseA,1
1,CaseB,1
2,CaseC,1
3,CaseD,1
4,CaseE,1
5,CaseF,1


Case selection

In [13]:
PrepMyCSV.selected_case["INCLUDE"].iloc[5]=0
PrepMyCSV.selected_case

Unnamed: 0,CASE,INCLUDE
0,CaseA,1
1,CaseB,1
2,CaseC,1
3,CaseD,1
4,CaseE,1
5,CaseF,0


In [14]:
PrepMyCSV.update()

Processing Finished!
Min Flow is reported as absolute value. Actual Min Flow values are negative. If positive, it is set to be 0.
Likewise, if With Flow is negative, it is set to be 0.
Update finished!


In [15]:
PrepMyCSV.mm_report

Unnamed: 0_level_0,TANKA_RD,TANKB_RD,TANKC_RD,RUNDOWNLINES,T1_MAIN,T2_MAIN,T3_MAIN,COOLDOWNLINES,LOADINGLINES_RD,T1_COOLDOWN,...,TANKB,TANKC,LOADINGLINES,WESTJETTY,EASTJETTY,LOADINGARMS,EXTRA,INTERTANK,SHIPPIPING,MM_SYSTEM
CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CaseA,227.36,231.39,229.41,231.57,233.57,233.66,233.61,232.21,-9.999999778196306e+21,232.2,...,59.04,152.62,188.26,53.34,176.59,89.92,164.46,228.2,69.14,233.66
CaseB,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,169.98,192.8,66.98,122.51,104.42,185.31,226.37,79.58,230.59
CaseC,227.4,228.86,228.86,231.02,231.02,231.02,231.02,223.57,-9.999999778196306e+21,231.02,...,60.06,179.17,200.14,54.32,71.47,67.63,200.16,226.38,41.45,328.17
CaseD,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,247.76,316.83,66.98,107.08,97.66,267.54,266.6,67.19,317.67
CaseE,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,174.01,195.96,66.98,106.06,95.79,189.25,226.37,66.67,230.59


Let's try sorting With Flow again

In [16]:
PrepMyCSV.force_report.sort_values(by=[('With Flow', 'kips')], ascending=0).head()

Unnamed: 0_level_0,With Flow,Against Flow,Worst Force,With Flow,Against Flow
Unnamed: 0_level_1,kips,kips,kips,Controlling Case,Controlling Case
XVL1_JHL6_123,55.84,0.0,55.84,CaseD,CaseD
XVR1_JHR6_79,55.63,0.0,55.63,CaseD,CaseD
XVR1_JHR6_85,54.77,0.0,54.77,CaseD,CaseB
XVL1_JHL6_129,54.68,0.0,54.68,CaseD,CaseB
JHA6_JHA8_63,51.05,0.0,51.05,CaseD,CaseC


Return all cases in the evaluation

In [17]:
PrepMyCSV.selected_case["INCLUDE"]=1
PrepMyCSV.update()

Processing Finished!
Min Flow is reported as absolute value. Actual Min Flow values are negative. If positive, it is set to be 0.
Likewise, if With Flow is negative, it is set to be 0.
Update finished!


In [18]:
PrepMyCSV.mm_report

Unnamed: 0_level_0,TANKA_RD,TANKB_RD,TANKC_RD,RUNDOWNLINES,T1_MAIN,T2_MAIN,T3_MAIN,COOLDOWNLINES,LOADINGLINES_RD,T1_COOLDOWN,...,TANKB,TANKC,LOADINGLINES,WESTJETTY,EASTJETTY,LOADINGARMS,EXTRA,INTERTANK,SHIPPIPING,MM_SYSTEM
CASE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CaseA,227.36,231.39,229.41,231.57,233.57,233.66,233.61,232.21,-9.999999778196306e+21,232.2,...,59.04,152.62,188.26,53.34,176.59,89.92,164.46,228.2,69.14,233.66
CaseB,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,169.98,192.8,66.98,122.51,104.42,185.31,226.37,79.58,230.59
CaseC,227.4,228.86,228.86,231.02,231.02,231.02,231.02,223.57,-9.999999778196306e+21,231.02,...,60.06,179.17,200.14,54.32,71.47,67.63,200.16,226.38,41.45,328.17
CaseD,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,247.76,316.83,66.98,107.08,97.66,267.54,266.6,67.19,317.67
CaseE,225.44,228.43,228.43,230.59,230.59,230.59,230.59,219.11,-9.999999778196306e+21,230.59,...,65.55,174.01,195.96,66.98,106.06,95.79,189.25,226.37,66.67,230.59
CaseF,227.4,228.86,228.86,231.02,231.02,231.02,231.02,223.57,-9.999999778196306e+21,231.02,...,60.06,177.45,198.14,54.32,78.33,72.06,195.85,226.38,47.48,231.02


## Plotting Results

Get segment length

In [19]:
segment_len_df = pd.read_csv('E:/sktime demo/Imported_Data/Segment_Length.csv', header=[0,1], index_col=0)
segment_len_df=segment_len_df.dropna()
segment_len_df.pop(("Include", "Segment"))
segment_len_df.head()

Segment,Length
Name,ft
J1A1_JHA1_2,40.53
CNA1_JHA1_3,8.42
CNA1_JHA1_4,36.3
CNA1_JHA1_5,4.17
J1A2_JHA2_7,41.02


Merge length and force into 1 df

In [20]:
segment_force_df = PrepMyCSV.force_report[[('Worst Force', 'kips')]]
MyResult = pd.concat([segment_len_df, segment_force_df], axis=1, join='inner')
MyResult.columns=MyResult.columns.droplevel(-1)
MyResult.rename(columns={"Length":"Length (ft)", "Worst Force": "Worst Force (kips)"}, inplace=True)
# Remove cases where worst forces is 0
new_col = np.empty(len(MyResult["Length (ft)"]), dtype="U100")
new_col.fill("evoleap")
MyResult["Type"] = new_col
MyResult.sort_values(by=[('Length (ft)')], ascending=0).head()

Segment,Length (ft),Worst Force (kips),Type
XVE1_JHE8_175,791.82,26.74,evoleap
XVE2_JHE2_187,772.66,39.56,evoleap
XVL1_JHL6_124,707.0,34.35,evoleap
XVR1_JHR6_80,704.0,49.37,evoleap
JHL2_JHL6_119_121,597.23,27.71,evoleap


Get Someone Result

In [21]:
other_result = pd.read_csv('E:/sktime demo/Imported_Data/Someone_Result.csv', index_col='Segment')
other_result=other_result.dropna()
new_col = np.empty(len(other_result["Length (ft)"]), dtype="U100")
new_col.fill("Bechtel")
other_result["Type"] = new_col
other_result.head()

Unnamed: 0_level_0,Length (ft),Worst Force (kips),Type
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
J1A1 _JHA1 _2,43.46,46.1,Bechtel
CNA1 _JHA1 _3,8.41,46.1,Bechtel
CNA1 _JHA1 _4,36.31,46.6,Bechtel
CNA1 _JHA1 _5,4.16,45.7,Bechtel
J1A2 _JHA2 _7,41.35,46.1,Bechtel


Combine My Method and Other Method into 1 DataFrame for PLotly

In [22]:
# Combine Myresult and Other result and reset index
combined_df=pd.concat([other_result, MyResult], join="outer", axis=0)
combined_df

Unnamed: 0,Length (ft),Worst Force (kips),Type
J1A1 _JHA1 _2,43.46,46.10,Bechtel
CNA1 _JHA1 _3,8.41,46.10,Bechtel
CNA1 _JHA1 _4,36.31,46.60,Bechtel
CNA1 _JHA1 _5,4.16,45.70,Bechtel
J1A2 _JHA2 _7,41.35,46.10,Bechtel
...,...,...,...
RE5C_RECC_779,7.73,0.02,evoleap
RE5C_RECC_780,10.07,0.01,evoleap
RE5B_RECB_781,8.00,0.04,evoleap
RE5B_RECB_782,4.50,0.01,evoleap


## Plot in plotly

In [23]:
import plotly.express as px

df = combined_df
fig = px.scatter(df, x="Length (ft)", y="Worst Force (kips)", facet_col="Type", color="Type", trendline="ols")
fig.update_layout(title_text='Worst Force as a Function of Segment Length', title_x=0.5)
fig.show()

results = px.get_trendline_results(fig)
print(results)

print(results.px_fit_results.iloc[0].summary())
print(results.px_fit_results.iloc[1].summary())


      Type                                     px_fit_results
0  Bechtel  <statsmodels.regression.linear_model.Regressio...
1  evoleap  <statsmodels.regression.linear_model.Regressio...
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     5.177
Date:                Fri, 06 May 2022   Prob (F-statistic):             0.0243
Time:                        12:01:06   Log-Likelihood:                -616.18
No. Observations:                 156   AIC:                             1236.
Df Residuals:                     154   BIC:                             1242.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef   

## Alternatively, we can plot these 2 plots together

In [24]:
import plotly.express as px

df = combined_df
fig = px.scatter(df, x="Length (ft)", y="Worst Force (kips)", color="Type", trendline="ols")
fig.update_layout(title_text='Worst Force as a Function of Segment Length', title_x=0.5)
fig.show()

results = px.get_trendline_results(fig)
print(results)

print(results.px_fit_results.iloc[0].summary())
print(results.px_fit_results.iloc[1].summary())


      Type                                     px_fit_results
0  Bechtel  <statsmodels.regression.linear_model.Regressio...
1  evoleap  <statsmodels.regression.linear_model.Regressio...
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.033
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     5.177
Date:                Fri, 06 May 2022   Prob (F-statistic):             0.0243
Time:                        12:01:07   Log-Likelihood:                -616.18
No. Observations:                 156   AIC:                             1236.
Df Residuals:                     154   BIC:                             1242.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef   