In [25]:
'''Data pipeline that reads model data from a local CSV file and prepares a JSON file for front-end display.'''
import pandas as pd # type: ignore
import json


class DataPipeline:
    '''Extract model records from a CSV file, clean them and export them.

    The pipeline runs in three steps:

    * ``extract`` reads the source CSV into ``self.df``.
    * ``transform`` drops dataset rows, normalises a few columns, removes
      columns that carry no information, adds a running primary key, and
      left-joins a ``specialisation`` column read from an Excel sheet.
    * ``load`` writes ``self.df`` as JSON lines and as CSV.

    Args:
        source: Path of the CSV file to read.
        target: Path of the JSON-lines file to write.
        primary_key: Name of the running-number column added by ``transform``.
        specialisation_source: Path of the Excel workbook that holds the
            model/specialisation table.
        csv_target: Path of the CSV copy written by ``load``.
    '''

    # Preferred left-to-right order of the exported columns.
    _COLUMN_ORDER = (
        'name',
        'url',
        'organization',
        'created_date',
        'size',
        'intended_uses',
        'prohibited_uses',
        'input_modality',
        'output_modality',
        'model_card',
        'adaptation',
        'output_space',
        'monthly_active_users',
        'user_distribution',
        'terms_of_service',
        'license',
        'quality_control',
        'monitoring',
        'feedback',
        'access',
        'description',
        'analysis',
        'type',
        'dependencies',
        'training_emissions',
        'training_time',
        'training_hardware',
    )

    # Position of the specialisation table inside the Excel sheet:
    # one row of column labels followed by the data rows, first three columns.
    _SHEET_HEADER_ROW = 1
    _SHEET_BODY_ROWS = slice(2, 16)
    _SHEET_COLUMNS = slice(0, 3)

    def __init__(
        self,
        source: str,
        target: str,
        primary_key: str,
        *,
        specialisation_source: str = '../data/LLM Matrix.xlsx',
        csv_target: str = '../data/processed.csv',
    ) -> None:
        self.df = pd.DataFrame()
        self.specialisation = pd.DataFrame()
        self.source = source
        self.target = target
        self.pk = primary_key
        self.specialisation_source = specialisation_source
        self.csv_target = csv_target

    def extract(self) -> None:
        '''Read the source CSV into ``self.df``.'''
        self.df = pd.read_csv(self.source)

    def transform(self) -> None:
        '''Clean ``self.df`` and attach the ``specialisation`` column.

        The result replaces ``self.df``; the table read from the Excel
        workbook is kept in ``self.specialisation``.
        '''
        model_data = self._clean_models(self.df)
        self.specialisation = self._read_specialisation()
        self.df = model_data.merge(
            self.specialisation[['name', 'specialisation']],
            on='name',
            how='left',
        )

    def load(self) -> None:
        '''Write ``self.df`` to the JSON-lines target and to the CSV target.'''
        self.df.to_json(self.target, orient='records', lines=True)
        self.df.to_csv(self.csv_target)

    def call_sos(self, error_message: object) -> None:
        '''Report a failure by printing ``error_message``.'''
        print(error_message)

    @staticmethod
    def _format_dependencies(raw: object) -> 'str | None':
        '''Turn a stringified list such as "['a', 'b']" into "a, b".

        Missing values are returned as ``None``. Without this guard the
        text ``'nan'`` would lose its first and last character and be
        exported as the bogus dependency ``'a'``.
        '''
        if pd.api.types.is_scalar(raw) and pd.isna(raw):
            return None
        # Drop the enclosing brackets, then the blanks and quotes around each item.
        items = str(raw)[1:-1].split(',')
        return ', '.join(item.strip(' ').strip('\'') for item in items)

    def _clean_models(self, raw: pd.DataFrame) -> pd.DataFrame:
        '''Return the cleaned, reordered model table built from ``raw``.'''
        # Explicit copy: the filtered frame is modified below, and writing to
        # a bare selection triggers pandas' chained-assignment warning.
        model_data = raw[raw['type'] != 'dataset'].copy()
        model_data.loc[:, 'created_date'] = pd.to_datetime(
            model_data['created_date'], errors='coerce'
        )

        # Split the combined modality field into separate input/output columns.
        modalities = [repackage_modality(value) for value in model_data['modality']]
        model_data = model_data.assign(
            input_modality=[inputs for inputs, _ in modalities],
            output_modality=[outputs for _, outputs in modalities],
        ).drop(columns='modality')

        model_data['dependencies'] = model_data['dependencies'].map(
            self._format_dependencies
        )

        # NOTE(review): this nulls every cell that merely *contains* 'unknown'.
        # Kept as in the original; confirm whether an exact match is intended.
        model_data = model_data.replace('unknown', None, regex=True)
        # Anchored so that only a stringified NaN is nulled; an unanchored
        # pattern also wipes cells containing words such as "finance".
        model_data = model_data.replace(r'^nan$', None, regex=True)

        # Columns with at most one distinct value carry no information.
        constant_columns = [
            column_name
            for column_name in model_data.columns
            if len(model_data[column_name].unique()) <= 1
        ]
        model_data = model_data.drop(columns=constant_columns)

        model_data[self.pk] = range(1, 1 + len(model_data))

        # Keep only columns that survived the pruning above (a plain selection
        # would raise KeyError), and keep the primary key, which a selection
        # limited to the ordered names would silently discard.
        selected = [
            column_name
            for column_name in self._COLUMN_ORDER
            if column_name in model_data.columns
        ]
        if self.pk not in selected:
            selected.insert(0, self.pk)
        return model_data[selected]

    def _read_specialisation(self) -> pd.DataFrame:
        '''Read the model/specialisation table from the Excel workbook.'''
        sheet = pd.read_excel(self.specialisation_source)
        # Copy the slice so relabelling it does not touch the sheet frame.
        table = sheet.iloc[self._SHEET_BODY_ROWS, self._SHEET_COLUMNS].copy()
        table.columns = sheet.iloc[
            self._SHEET_HEADER_ROW, self._SHEET_COLUMNS
        ].to_list()
        table = table.rename(columns={'model': 'name'})
        return table.dropna(subset='specialisation')


def repackage_modality(raw: str) -> tuple[str, str]:
    '''Split a modality description into its input and output parts.

    ``raw`` has the form ``"<inputs>; <outputs>"``, where each side is a
    comma-separated list. When no semicolon is present the same list is used
    for both sides. Items are stripped of surrounding whitespace and re-joined
    with ``', '``.

    Args:
        raw: The modality description; non-string values are converted
            with ``str`` first.

    Returns:
        A ``(inputs, outputs)`` pair of comma-separated strings.

    Raises:
        AssertionError: If ``raw`` contains more than one semicolon.
    '''
    raw = str(raw)
    # Raised explicitly: an ``assert`` statement is removed under ``python -O``,
    # which would turn invalid input into an opaque unpacking error instead.
    if raw.count(';') > 1:
        raise AssertionError('LLM modality invalid.')

    inputs_part, separator, outputs_part = raw.partition(';')
    if not separator:
        # A single list describes both the inputs and the outputs.
        outputs_part = inputs_part

    modal_inputs, modal_outputs = (
        ', '.join(item.strip() for item in part.split(','))
        for part in (inputs_part, outputs_part)
    )
    return modal_inputs, modal_outputs


def main() -> None:
    '''Run extract, transform and load on the default data files.

    Any exception raised by a pipeline step is reported through
    ``DataPipeline.call_sos`` instead of propagating; on success a
    confirmation line is printed.
    '''
    source_path = '../data/assets.csv'
    target_path = '../data/processed.json'
    primary_key = 'primary_key'
    # Built outside the try block so the except handler below can never hit
    # an unbound ``pipeline`` name.
    pipeline = DataPipeline(
        source_path
        , target_path
        , primary_key
    )
    try:
        pipeline.extract()
        pipeline.transform()
        pipeline.load()
    except Exception as e:
        # call_sos is declared to take a string, so pass the message text.
        pipeline.call_sos(str(e))
    else:
        print('Script is good!')

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


Script is good!
