This notebook runs Python 3.11.7 on anaconda3. It is a pipeline that takes Stanford source data and delivers CSV data for the analytics team.

In [1]:
'''Data pipeline to pull data from local csv and load clean data into another csv.'''
import pandas as pd # type: ignore


class DataPipeline:
    '''Read a source CSV, reshape its non-dataset rows, and write a target CSV.

    The stages are meant to be called in order: ``extract()``, ``transform()``,
    ``load()``. The working frame is kept in ``self.df`` and is replaced by
    ``extract`` and ``transform``.
    '''

    # Columns that transform() reads after constant columns have been pruned.
    # They must survive pruning even when every remaining row shares one value
    # (for example a single-row input), otherwise transform() raises KeyError.
    _REQUIRED_COLUMNS = ('created_date', 'modality', 'dependencies')

    def __init__(self, source: str, target: str, primary_key: str) -> None:
        '''Store the file locations and start with an empty frame.

        Args:
            source: Path handed to ``pd.read_csv`` by ``extract``.
            target: Path handed to ``DataFrame.to_csv`` by ``load``.
            primary_key: Name of the primary-key column, stored as ``self.pk``.
        '''
        self.df = pd.DataFrame()
        self.source = source
        self.target = target
        # NOTE(review): no method of this class reads self.pk; transform()
        # always writes its row key to the fixed column name 'table_pk'.
        self.pk = primary_key

    def extract(self) -> None:
        '''Load the CSV at ``self.source`` into ``self.df``.'''
        self.df = pd.read_csv(self.source)

    def transform(self) -> None:
        '''Filter, clean and flatten ``self.df``, storing the result back.

        Steps, in order:
          1. Keep only rows whose ``type`` is not ``'dataset'``.
          2. Drop columns with at most one distinct value (missing counts as
             a value), except the columns the later steps need.
          3. Add ``model_id`` numbering the remaining rows from 1.
          4. Parse ``created_date``; unparseable values become ``NaT``.
          5. Replace ``modality`` with list-valued ``input_modality`` and
             ``output_modality`` columns.
          6. Parse ``dependencies`` into a list of names.
          7. Explode the three list columns one after another, so each output
             row holds one (input modality, output modality, dependency)
             combination.
          8. Add ``table_pk`` numbering the exploded rows from 1.

        Raises:
            KeyError: If ``type``, ``created_date``, ``modality`` or
                ``dependencies`` is absent from the extracted data.
            ValueError or AssertionError: Whatever ``repackage_modality``
                raises for a malformed modality value.
        '''
        # .copy() yields an independent frame: the assignments below then
        # neither write through to the extracted data nor trigger pandas'
        # SettingWithCopyWarning.
        model_data = self.df[self.df['type'] != 'dataset'].copy()
        model_data = self._drop_constant_columns(
            model_data, keep=self._REQUIRED_COLUMNS
        )

        model_data['model_id'] = range(1, 1 + len(model_data))
        model_data['created_date'] = pd.to_datetime(
            model_data['created_date'], errors='coerce'
        )

        # Plain list assignment stores one list per cell and also works when
        # the frame has no rows.
        modality_pairs = [
            repackage_modality(raw) for raw in model_data['modality']
        ]
        model_data['input_modality'] = [pair[0] for pair in modality_pairs]
        model_data['output_modality'] = [pair[1] for pair in modality_pairs]
        model_data = model_data.drop(columns='modality')

        model_data['dependencies'] = model_data['dependencies'].map(
            self._parse_dependencies
        )

        # Sequential explodes produce the cross product of the three lists
        # for every source row; an empty list becomes a single missing value.
        for list_column in ('input_modality', 'output_modality', 'dependencies'):
            model_data = model_data.explode(list_column)

        model_data['table_pk'] = range(1, 1 + len(model_data))
        self.df = model_data

    def load(self) -> None:
        '''Write ``self.df`` to ``self.target`` as CSV.'''
        # NOTE(review): to_csv writes the frame index as a leading unnamed
        # column by default, and explode() repeats index values - confirm the
        # consumers of the file expect that column.
        self.df.to_csv(self.target)

    def call_sos(self, error_message: str) -> None:
        '''Report a failure by printing ``error_message`` to stdout.'''
        print(error_message)

    @staticmethod
    def _drop_constant_columns(
        frame: pd.DataFrame, keep: tuple[str, ...] = ()
    ) -> pd.DataFrame:
        '''Return ``frame`` without columns holding at most one distinct value.

        A missing value counts as a distinct value, so an all-missing column
        is dropped too. Columns named in ``keep`` are never dropped.
        '''
        constant = [
            name
            for name in frame.columns
            if name not in keep and frame[name].nunique(dropna=False) <= 1
        ]
        return frame.drop(columns=constant)

    @staticmethod
    def _parse_dependencies(raw: object) -> list[str]:
        '''Turn a stringified list such as ``"['a', 'b']"`` into ``['a', 'b']``.

        Missing values and an empty list (``'[]'``) both yield ``[]`` so that
        "no dependencies" ends up as a missing value after ``explode`` instead
        of a fabricated name.
        '''
        if not isinstance(raw, str):
            if pd.api.types.is_scalar(raw) and pd.isna(raw):
                return []
            raw = str(raw)
        text = raw.strip()
        # Only strip the brackets when they are really there; slicing blindly
        # would eat the first and last character of an unbracketed value.
        if text.startswith('[') and text.endswith(']'):
            text = text[1:-1]
        if not text.strip():
            return []
        return [item.strip(' ').strip('\'') for item in text.split(',')]


def repackage_modality(raw:str) -> tuple[list[str], list[str]]:
    '''Split a modality spec into its input and output modality lists.

    The spec has the form ``'<inputs>;<outputs>'`` where each side is a
    comma-separated list. When there is no semicolon the whole spec is used
    for both sides.

    Args:
        raw: The modality spec. Non-string values are coerced with ``str()``,
            so a missing value (float NaN) is treated as the text ``'nan'``.

    Returns:
        A pair ``(inputs, outputs)`` of whitespace-stripped names.

    Raises:
        ValueError: If the spec contains more than one semicolon.
    '''
    raw = str(raw)
    # Raise explicitly instead of using assert so the check still runs under
    # `python -O`, which strips assert statements.
    if raw.count(';') > 1:
        raise ValueError('LLM modality invalid.')
    modal_input_str, separator, modal_output_str = raw.partition(';')
    if not separator:
        # No semicolon: the same list describes both inputs and outputs.
        modal_output_str = modal_input_str
    modal_inputs = [s.strip() for s in modal_input_str.split(',')]
    modal_outputs = [s.strip() for s in modal_output_str.split(',')]
    return modal_inputs, modal_outputs


def main() -> None:
    '''Run extract -> transform -> load with the notebook's fixed paths.

    Reads ``../data/assets.csv`` and writes ``../data/processed.csv``. Prints
    ``'Script is good!'`` when all three stages finish; if any stage raises,
    the error text is handed to ``DataPipeline.call_sos`` and the function
    returns normally.
    '''
    source_path = '../data/assets.csv'
    target_path = '../data/processed.csv'
    primary_key = 'table_pk'
    # Build the pipeline before entering the try block so that `pipeline` is
    # always bound when the handler below calls call_sos on it.
    pipeline = DataPipeline(
        source_path
        , target_path
        , primary_key
    )
    try:
        pipeline.extract()
        pipeline.transform()
        pipeline.load()
    except Exception as e:
        # call_sos is annotated to take a str, so pass the message text.
        pipeline.call_sos(str(e))
    else:
        print('Script is good!')

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()


Script is good!
