## 0. Description
- 우선 기능 단위로 만들고 하나의 모듈(클래스)로 만들어서 작동시켜보자

## 1. Load Library

In [13]:
import os
import sys
import pickle
from tqdm import tqdm

from safetensors.torch import load_file

## 2. Convert safetensors -> pkl

In [26]:
# Root directory of the model files. convert() reads from
# <BASE_PATH>/<ext_in>/<MODEL_NAME>/ and writes to
# <BASE_PATH>/<ext_out>/<MODEL_NAME>/.
BASE_PATH = "/Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model"

# Name of the model sub-directory found under each extension directory.
MODEL_NAME = "Mistral-Nemo-Instruct-2407"

# Source and target weight-file extensions for the conversion.
ext_in, ext_out = 'safetensors', 'pkl'

In [None]:
def load_model(ext_in:str,path:str):
    """Load model weights from ``path`` according to the ``ext_in`` format.

    Args:
        ext_in: Extension/format of the source file. Only ``'safetensors'``
            is handled.
        path: Path of the weight file to load.

    Returns:
        Whatever ``load_file`` returns for a ``'safetensors'`` input, or
        ``None`` for any other ``ext_in`` value.
    """
    # Guard clause: every format other than safetensors yields None.
    if ext_in != 'safetensors':
        return None

    print("execute load_file")
    return load_file(path)

def store_model(ext_out:str,path:str,tensors=None):
    """Serialize loaded model weights to ``path`` in the ``ext_out`` format.

    The concrete type of ``tensors`` depends on the library that loaded
    them, so a dedicated module will be needed later to support more types.
    Type noted so far by the original author:

    1. torch : collections.OrderedDict

    Args:
        ext_out: Target format. Only ``'pkl'`` is handled; any other value
            returns without touching the filesystem.
        path: Destination file path. Missing parent directories are created.
        tensors: Object to serialize. It is pickled as-is (``None`` included).
    """
    if ext_out != 'pkl':
        return

    # The <base>/<ext_out>/<model> directory may not exist on a first run;
    # create it so open() does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Store the state dict with pickle.
    with open(path, 'wb') as f:
        pickle.dump(tensors, f)

def convert(base_path:str,model_name:str,file_name:str,ext_in:str, ext_out:str):
    """Convert one weight file from the ``ext_in`` format to ``ext_out``.

    The source is read from ``<base_path>/<ext_in>/<model_name>/<file_name>``
    and the result is written to ``<base_path>/<ext_out>/<model_name>/`` under
    the same file name with its extension replaced by ``ext_out``.

    Args:
        base_path: Root directory holding one sub-directory per extension.
        model_name: Model sub-directory name.
        file_name: Name of the source weight file.
        ext_in: Source format, passed to ``load_model``.
        ext_out: Target format, passed to ``store_model``.
    """
    # 1. Load the model weights from the ext_in file.
    load_model_file_path = os.path.join(base_path, ext_in, model_name, file_name)
    print(f"Source Model File Path : {load_model_file_path}")

    # The loader differs per extension; load_model dispatches on ext_in.
    tensors = load_model(ext_in=ext_in, path=load_model_file_path)

    # 2. Store the model weights as an ext_out file.
    # Strip only the final extension so names containing extra dots
    # (e.g. "model.v1.safetensors") are not truncated to "model".
    stem, _ = os.path.splitext(file_name)
    store_model_file_name = f"{stem}.{ext_out}"
    store_model_file_path = os.path.join(base_path, ext_out, model_name, store_model_file_name)
    print(f"Store File Model File Path : {store_model_file_path}")
    store_model(ext_out=ext_out, path=store_model_file_path, tensors=tensors)

def model_scan(base_path:str,model_name:str,file_name:str,ext_in:str, ext_out:str):
    """Placeholder for scanning a converted model file.

    The scanning step has not been written yet. The function raises instead
    of silently doing nothing so an accidental call is noticed.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("model_scan is not implemented yet")

In [None]:
class ModelScan():
    """Convert one model weight file between formats ahead of a model scan.

    The source file is expected at
    ``<base_path>/<ext_in>/<model_name>/<file_name>`` and the converted file
    is written under ``<base_path>/<ext_out>/<model_name>/``.
    """

    def __init__(self,base_path:str,model_name:str,file_name:str,ext_in:str,ext_out:str):
        self.base_path = base_path
        self.model_name = model_name
        self.file_name = file_name
        self.ext_in = ext_in
        self.ext_out = ext_out
        # One entry per load_model() call, in call order.
        self.tensors = []

    def load_model(self,path:str):
        """Load weights from ``path`` and record them on the instance.

        Only ``'safetensors'`` input is handled so far; more input
        extensions still need to be supported.

        Args:
            path: Path of the weight file to load.

        Returns:
            The loaded object, or ``None`` when ``self.ext_in`` is not
            supported. The value is also appended to ``self.tensors``
            (``None`` included).
        """
        tensor = None

        if self.ext_in == 'safetensors':
            print("execute load_file")
            tensor = load_file(path)

        self.tensors.append(tensor)
        return tensor

    def store_model(self,path:str,tensors=None):
        """Serialize ``tensors`` to ``path`` in the ``self.ext_out`` format.

        The concrete type of ``tensors`` depends on the loading library, so
        more types will need to be supported later. Type noted so far by the
        original author:

        1. torch : collections.OrderedDict

        Args:
            path: Destination file path. Missing parent directories are
                created.
            tensors: Object to serialize. It is pickled as-is.
        """
        if self.ext_out != 'pkl':
            return

        # The output directory may not exist on a first run.
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Store the state dict with pickle.
        with open(path, 'wb') as f:
            pickle.dump(tensors, f)

    def convert(self):
        """Load ``self.file_name`` as ``ext_in`` and store it as ``ext_out``."""
        # 1. Load the model weights from the ext_in file.
        load_model_file_path = os.path.join(self.base_path, self.ext_in, self.model_name, self.file_name)
        print(f"Source Model File Path : {load_model_file_path}")

        # Use the instance's own loader and state, not module-level globals.
        tensors = self.load_model(load_model_file_path)

        # 2. Store the model weights as an ext_out file.
        # Strip only the final extension so names with extra dots survive.
        stem, _ = os.path.splitext(self.file_name)
        store_model_file_name = f"{stem}.{self.ext_out}"
        store_model_file_path = os.path.join(self.base_path, self.ext_out, self.model_name, store_model_file_name)
        print(f"Store File Model File Path : {store_model_file_path}")
        self.store_model(store_model_file_path, tensors)

    def scan(self):
        """Placeholder for the scanning step, which is not written yet.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("ModelScan.scan is not implemented yet")


In [28]:
# List the model directory and keep the '.safetensors' files whose name does
# not contain 'consolidated'.
model_file_list = os.listdir(
    os.path.join(BASE_PATH, ext_in, MODEL_NAME)
)
model_weight_file_list = [
    name for name in model_file_list
    if name.endswith('.safetensors') and 'consolidated' not in name
]
model_weight_file_list

['model-00003-of-00005.safetensors',
 'model-00002-of-00005.safetensors',
 'model-00001-of-00005.safetensors',
 'model-00004-of-00005.safetensors',
 'model-00005-of-00005.safetensors']

In [44]:
# Collect the '.safetensors' files (excluding names containing
# 'consolidated') and convert each one from ext_in to ext_out.
model_file_list = os.listdir(
    os.path.join(BASE_PATH, ext_in, MODEL_NAME)
)
model_weight_file_list = [
    name for name in model_file_list
    if name.endswith('.safetensors') and 'consolidated' not in name
]

for weight in model_weight_file_list:
    convert(
        base_path=BASE_PATH,
        model_name=MODEL_NAME,
        file_name=weight,
        ext_in=ext_in,
        ext_out=ext_out,
    )

Source Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/safetensors/Mistral-Nemo-Instruct-2407/model-00003-of-00005.safetensors
execute load_file
Store File Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/pkl/Mistral-Nemo-Instruct-2407/model-00003-of-00005.pkl
Source Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/safetensors/Mistral-Nemo-Instruct-2407/model-00002-of-00005.safetensors
execute load_file
Store File Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/pkl/Mistral-Nemo-Instruct-2407/model-00002-of-00005.pkl
Source Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/safetensors/Mistral-Nemo-Instruct-2407/model-00001-of-00005.safetensors
execute load_file
Store File Model File Path : /Users/dataeng/modelscan-test/modelscan/unscaned/3901296/request-01/model/pkl/Mistral-

In [None]:
# 각 파일 별 modelscan 수행.
