In [1]:
# Request the cluster toolchain modules (Anaconda, CUDA, GCC) through the shell.
# NOTE(review): every `!` line is executed in its own subshell, so these loads
# presumably do not alter the environment of the already-running kernel --
# confirm, and load the modules before launching Jupyter if they are required.
!module load anaconda/2020.11-py38
!module load cuda/11.7.0
!module load gcc/6.3.0

In [13]:
import re
import os
from tqdm import tqdm
import openai
import json

from loguru import logger

In [3]:

def remove_code_snippets(text):
    """Return ``text`` with every triple-backtick fenced span removed.

    Matching is non-greedy and crosses newlines, so each opening fence is
    paired with the nearest following fence. The fences are removed together
    with their content; an opening fence with no closing fence is left as is.
    """
    fenced_span = re.compile(r'```.*?```', re.DOTALL)
    return fenced_span.sub('', text)


def remove_tables(text):
    """Flatten Markdown pipe tables in ``text`` into one cell per line.

    A table is a header row, a dash separator row, and one or more body rows,
    each row starting and ending with ``|``. The table is replaced by its
    header cells followed by its body cells, one per line, with surrounding
    whitespace stripped and empty cells dropped. The separator row is
    discarded. A newline that terminated the table is preserved so the text
    after it stays on its own line.

    Args:
        text: Markdown source.

    Returns:
        ``text`` with every recognised table flattened; text without tables
        is returned unchanged.
    """
    def split_row(row):
        # Drop the outer pipes first so splitting on the inner pipes yields
        # every cell (the previous per-cell regex consumed the closing pipe
        # and therefore skipped every other cell).
        cells = (cell.strip() for cell in row.strip().strip('|').split('|'))
        return [cell for cell in cells if cell]

    def replace_table(match):
        # group(1) is "header row + separator row"; only its first line holds cells.
        header_row = match.group(1).split('\n', 1)[0]
        cells = split_row(header_row)
        # group(2) holds the body rows. Using the groups directly avoids
        # slicing group(0) with match.end(1), which is an offset into the
        # whole text rather than into the matched table.
        for row in match.group(2).splitlines():
            cells.extend(split_row(row))
        flattened = "\n".join(cells)
        if match.group(0).endswith("\n"):
            flattened += "\n"
        return flattened

    # The separator cell tolerates padding ("| --- |") as well as "|---|".
    table_pattern = r'(\|.*\|\n\|[ \t]*:?-+:?[ \t]*\|.*\n)((\|.*\|(\n)?)+)'
    return re.sub(table_pattern, replace_table, text)


def remove_urls(text):
    """Strip ``http://`` and ``https://`` links from ``text``.

    A link is the scheme followed by one or more characters accepted by the
    pattern below (letters, digits, the listed punctuation, or ``%XX``
    escapes). The link is deleted outright; surrounding whitespace is kept.
    """
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return url_pattern.sub('', text)


def parse_readme(readme):
    """Reduce a README to plain prose for prompting.

    Steps, in order: drop the leading ``---`` ... ``---`` front-matter block,
    strip surrounding whitespace, remove fenced code snippets, flatten
    Markdown tables, and remove URLs.

    Args:
        readme: Raw README text.

    Returns:
        The cleaned text.
    """
    # Anchor to the start of the document and replace at most once: an
    # unanchored pattern also deletes body text that merely sits between two
    # `---` horizontal rules.
    main_text = re.sub(
        r"\A\s*---\n.+?\n---", "", readme, count=1, flags=re.DOTALL
    ).strip()
    main_text = remove_code_snippets(main_text)
    main_text = remove_tables(main_text)
    main_text = remove_urls(main_text)
    return main_text

In [4]:

def read_readme_files(base_dir, max_models=1):
    """Collect README.md contents from a ``base_dir/<author>/<model>/`` tree.

    Args:
        base_dir: Directory whose sub-directories are author directories.
        max_models: Maximum number of model directories to inspect before
            returning. The default of 1 keeps the previous behaviour of
            stopping after the first model; pass ``None`` to walk the whole
            tree.

    Returns:
        Dict mapping model directory name to README text. Keys are model
        names only, so same-named models from different authors overwrite
        each other.
    """
    readmes = {}
    inspected = 0
    for author_name in tqdm(os.listdir(base_dir)):
        author_dir = os.path.join(base_dir, author_name)
        # Stray files next to the author directories would make listdir raise.
        if not os.path.isdir(author_dir):
            continue
        for model_name in os.listdir(author_dir):
            if max_models is not None and inspected >= max_models:
                return readmes
            readme_path = os.path.join(base_dir, author_name, model_name, "README.md")
            if os.path.isfile(readme_path):
                # Decode explicitly instead of relying on the locale default;
                # undecodable bytes are replaced rather than aborting the scan.
                with open(readme_path, "r", encoding="utf-8", errors="replace") as f:
                    readmes[model_name] = f.read()
            logger.info(f"author_name, model_name, readme_path are {author_name}, {model_name}, {readme_path}")
            inspected += 1
    return readmes

In [5]:

def extract_metadata_gpt35(prompt):
    """Send a classification prompt to gpt-3.5-turbo and return its reply.

    Args:
        prompt: Text appended to the fixed user instruction.

    Returns:
        The assistant's reply, stripped, lower-cased, and with newlines
        replaced by spaces.
    """
    MODEL = "gpt-3.5-turbo"
    response = openai.ChatCompletion.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": "You are classifying the category and task of pre-trained model based on the given README files."},
            {"role": "user", "content": "Can you clasify the following readme file into one of the element in the dict?\n \
        Use this format, dont't generate any other texts 'task: category'" + prompt},
        ],
        temperature=0.1,
        max_tokens=200,
    )
    logger.info(response)
    # Chat completions carry the reply under choices[i].message.content; the
    # `.text` field belongs to the legacy completion endpoint and is absent here.
    message = response["choices"][0]["message"]["content"].strip().lower().replace("\n", " ")

    return message

def truncate_text(text, max_tokens=200):
    """Keep at most the first ``max_tokens`` whitespace-separated words.

    The kept words are re-joined with single spaces, so the original spacing
    and line breaks are not preserved.
    """
    return ' '.join(text.split()[:max_tokens])

# 'task:category' entries listed verbatim in the classification prompt.
_PROMPT_TASK_ENTRIES = (
    'feature-extraction: multimodal',
    'text-to-image:multimodal',
    'image-to-text:multimodal',
    'text-to-video:multimodal',
    'visual-question-answering:multimodal',
    'graph-machine-learning:multimodal',
    'graph-ml:multimodal',
    'depth-estimation:computer-vision',
    'image-classification:computer-vision',
    'object-detection:computer-vision',
    'image-segmentation:computer-vision',
    'image-to-image:computer-vision',
    'unconditional-image-generation:computer-vision',
    'video-classification:computer-vision',
    'zero-shot-image-classification:computer-vision',
    'text-classification:natural-language-processing',
    'token-classification:natural-language-processing',
    'table-question-answering:natural-language-processing',
    'question-answering:natural-language-processing',
    'zero-shot-classification:natural-language-processing',
    'translation:natural-language-processing',
    'summarization:natural-language-processing',
    'conversational:natural-language-processing',
    'text-generation:natural-language-processing',
    'text2text-generation:natural-language-processing',
    'fill-mask:natural-language-processing',
    'sentence-similarity:natural-language-processing',
    'table-to-text:natural-language-processing',  # dataset task
    'multiple-choice:natural-language-processing',
    'text-retrieval:natural-language-processing',  # dataset task
    'document-question-answering:natural-language-processing',  # model task
    'text-to-speech:audio',
    'automatic-speech-recognition:audio',
    'audio-to-audio:audio',
    'audio-classification:audio',
    'voice-activity-detection:audio',
    'tabular-classification:tabular',
    'tabular-regression:tabular',
    'tabular-to-text:tabular',  # dataset task
    'time-series-forecasting:tabular',  # dataset task
    'reinforcement-learning:reinforcement-learning',  # model task
    'robotics:reinforcement-learning',  # model task
    'null: None',
    'other: None',
)


def _build_classification_prompt(documentation):
    """Return the task/category listing followed by ``documentation``."""
    header = f"Here is a mapping of {{task: category}}: mapping = {{\n"
    listing = header + "".join(f" '{task}'\n" for task in _PROMPT_TASK_ENTRIES)
    logger.debug(listing)
    return f"{listing}\n{documentation}"


def extract_metadata(metadata, readme, model_name=None, output_path="metadata.txt"):
    """Classify one README with the chat model and record the answer.

    The README is cleaned with ``parse_readme``, shortened with
    ``truncate_text``, embedded in the task/category prompt, and sent to
    ``extract_metadata_gpt35``. The reply is stored in ``metadata`` under
    ``model_name`` and the whole mapping is rewritten to ``output_path``.

    Args:
        metadata: Dict of model name -> classification text; updated in place.
        readme: Raw README text of the model.
        model_name: Key under which to store the result. When omitted, the
            module-level ``model_name`` variable is used, which is what the
            previous implementation read implicitly.
        output_path: Text file that receives the ``key: value`` dump.

    Returns:
        The updated ``metadata`` dict.

    Raises:
        ValueError: If no model name is given and no module-level
            ``model_name`` exists.
    """
    if model_name is None:
        # Backward compatibility with callers relying on the notebook global.
        model_name = globals().get("model_name")
    if model_name is None:
        raise ValueError("model_name must be passed or defined at module level")

    truncated_documentation = truncate_text(parse_readme(readme))
    # The assembled prompt is sent as built; it used to be replaced by an
    # empty string right before the request, so the model never saw the README.
    prompt = _build_classification_prompt(truncated_documentation)

    metadata_text = extract_metadata_gpt35(prompt)
    logger.debug(metadata_text)
    metadata[model_name] = metadata_text

    # `metadata` is a flat dict, so iterate its items directly; calling
    # `.items()` on each key (a string) raised AttributeError.
    with open(output_path, "w", encoding="utf-8") as file:
        for key, value in metadata.items():
            file.write(f"{key}: {value}\n")
            file.write("\n")
    return metadata

In [11]:
# Location of the cloned Hugging Face repositories (used to build repo paths).
base_dir = "/scratch/gilbreth/jiang784/PTMTorrent/huggingface/PTM-Torrent/ptm_torrent/huggingface/data/huggingface/repos"
# Directory holding the JSON metadata, and the metadata file inside it.
metadata_dir = "/scratch/gilbreth/jiang784/PTMTorrent/PTMTorrent/ptm_torrent/huggingface/data/huggingface/json"
metadata_file = f"{metadata_dir}/huggingface.json"

In [24]:
# Load the metadata records. JSON is decoded as UTF-8 explicitly so the
# result does not depend on the locale's default encoding.
with open(metadata_file, 'r', encoding='utf-8') as f:
    HF_metadata = json.load(f)

In [25]:
# Preview the first two records only; echoing the whole list floods the
# notebook output.
HF_metadata[:2]

[{'id': 0,
  'LatestGitCommitSHA': 'aeffd769076a5c4f83b2546aea99ca45a15a5da4',
  'ModelHub': {'MetadataFilePath': 'data/huggingface/json/metadata/hf_metadata.json',
   'MetadataObjectID': 'albert-base-v1',
   'ModelHubName': 'Hugging Face',
   'ModelHubURL': 'https://huggingface.co/'},
  'ModelName': 'albert-base-v1',
  'ModelOwner': 'http://huggingface.co/',
  'ModelOwnerURL': 'http://huggingface.co/',
  'ModelURL': 'https://huggingface.co/albert-base-v1',
  'ModelArchitecture': 'albert',
  'ModelTask': 'fill-mask'},
 {'id': 1,
  'LatestGitCommitSHA': '51dbd9db43a0c6eba97f74b91ce26fface509e0b',
  'ModelHub': {'MetadataFilePath': 'data/huggingface/json/metadata/hf_metadata.json',
   'MetadataObjectID': 'albert-base-v2',
   'ModelHubName': 'Hugging Face',
   'ModelHubURL': 'https://huggingface.co/'},
  'ModelName': 'albert-base-v2',
  'ModelOwner': 'http://huggingface.co/',
  'ModelOwnerURL': 'http://huggingface.co/',
  'ModelURL': 'https://huggingface.co/albert-base-v2',
  'ModelArchit

In [27]:
# Pull the fields of the first record into notebook-level variables; the
# loop stops after a single iteration.
for idx, metadata in enumerate(HF_metadata):
    logger.debug(metadata)
    model_name, model_path, model_url, model_arch, model_task = (
        metadata['ModelName'],
        metadata['ModelHub']['MetadataObjectID'],
        metadata['ModelURL'],
        metadata['ModelArchitecture'],
        metadata['ModelTask'],
    )
    break

2023-04-23 15:30:04.214 | DEBUG    | __main__:<module>:2 - {'id': 0, 'LatestGitCommitSHA': 'aeffd769076a5c4f83b2546aea99ca45a15a5da4', 'ModelHub': {'MetadataFilePath': 'data/huggingface/json/metadata/hf_metadata.json', 'MetadataObjectID': 'albert-base-v1', 'ModelHubName': 'Hugging Face', 'ModelHubURL': 'https://huggingface.co/'}, 'ModelName': 'albert-base-v1', 'ModelOwner': 'http://huggingface.co/', 'ModelOwnerURL': 'http://huggingface.co/', 'ModelURL': 'https://huggingface.co/albert-base-v1', 'ModelArchitecture': 'albert', 'ModelTask': 'fill-mask'}


In [32]:
# Repository directory of the model selected above, under the repos root.
repo_path = f"{base_dir}/{model_path}"
repo_path

'/scratch/gilbreth/jiang784/PTMTorrent/huggingface/PTM-Torrent/ptm_torrent/huggingface/data/huggingface/repos/albert-base-v1'

In [33]:
!ls repo_path

ls: cannot access repo_path: No such file or directory


In [34]:
import pandas as pd

In [36]:
# Bucket every record by which optional fields it carries.
metadata_list = []           # records with both architecture and task
metadata_list_noTask = []    # architecture present, task missing
metadata_list_noArch = []    # task present, architecture missing
metadata_list_nothing = []   # neither present, or required fields missing

for idx, metadata in enumerate(HF_metadata):
    # Name, hub object id and URL are required for every bucket.
    try:
        metadata_dict = {
            'model_name': metadata['ModelName'],
            'model_path': metadata['ModelHub']['MetadataObjectID'],
            'model_url': metadata['ModelURL'],
        }
    except (KeyError, TypeError):
        # Catch only "field missing / wrong shape"; a bare `except` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        metadata_list_nothing.append((idx, metadata))
        continue

    has_arch = 'ModelArchitecture' in metadata
    has_task = 'ModelTask' in metadata
    # Insert in architecture-then-task order to keep the column order stable.
    if has_arch:
        metadata_dict['model_architecture'] = metadata['ModelArchitecture']
    if has_task:
        metadata_dict['model_task'] = metadata['ModelTask']

    if has_arch and has_task:
        metadata_list.append(metadata_dict)
    elif has_arch:
        metadata_list_noTask.append(metadata_dict)
    elif has_task:
        metadata_list_noArch.append(metadata_dict)
    else:
        metadata_list_nothing.append((idx, metadata))

# Create a pandas DataFrame from each list of dictionaries
metadata_df = pd.DataFrame(metadata_list)
metadata_noTask_df = pd.DataFrame(metadata_list_noTask)
metadata_noArch_df = pd.DataFrame(metadata_list_noArch)


# Save the DataFrames to CSV files
metadata_df.to_csv('/depot/davisjam/data/wenxin/CS577-NLP/project/metadata.csv', index=False)
metadata_noArch_df.to_csv('/depot/davisjam/data/wenxin/CS577-NLP/project/metadata_noArch.csv', index=False)
metadata_noTask_df.to_csv('/depot/davisjam/data/wenxin/CS577-NLP/project/metadata_noTask.csv', index=False)

# Save the records that fit no bucket, one "(index, record)" tuple per line
with open('/depot/davisjam/data/wenxin/CS577-NLP/project/metadata_nothing.txt', 'w', encoding='utf-8') as file:
    for item in metadata_list_nothing:
        file.write(f"{item}\n")
        

In [38]:
# Row counts of the complete, task-less and architecture-less frames.
tuple(map(len, (metadata_df, metadata_noTask_df, metadata_noArch_df)))

(61677, 3947, 12773)

In [40]:
# Task -> category lookup, written grouped by category. Iteration order of
# the resulting dict follows the order of the groups and of the tasks in them.
mapping = {
    task: category
    for category, tasks in (
        ('multimodal', (
            'feature-extraction',
            'text-to-image',
            'image-to-text',
            'text-to-video',
            'visual-question-answering',
            'graph-machine-learning',
        )),
        ('computer-vision', (
            'depth-estimation',
            'image-classification',
            'object-detection',
            'image-segmentation',
            'image-to-image',
            'unconditional-image-generation',
            'video-classification',
            'zero-shot-image-classification',
        )),
        ('natural-language-processing', (
            'text-classification',
            'token-classification',
            'table-question-answering',
            'question-answering',
            'zero-shot-classification',
            'translation',
            'summarization',
            'conversational',
            'text-generation',
            'text2text-generation',
            'fill-mask',
            'sentence-similarity',
            'multiple-choice',
            'document-question-answering',
        )),
        ('audio', (
            'text-to-speech',
            'automatic-speech-recognition',
            'audio-to-audio',
            'audio-classification',
            'voice-activity-detection',
        )),
        ('tabular', (
            'tabular-classification',
            'tabular-regression',
        )),
        ('reinforcement-learning', (
            'reinforcement-learning',
            'robotics',
        )),
        # Placeholder tasks carry no category.
        (None, (
            'null',
            'other',
        )),
    )
    for task in tasks
}

In [41]:
def get_category(task):
    """Look up the category of ``task`` in ``mapping``.

    Returns the mapped value for a known task (which is ``None`` for the
    'null' and 'other' entries) and the string ``"None"`` for any task that
    is not a key of ``mapping``.
    NOTE(review): the fallback is the string "None" while mapped placeholders
    are the ``None`` object -- confirm this distinction is intended.
    """
    if task in mapping:
        return mapping[task]
    return "None"


# Annotate the frames that have a task column with the derived category.
for df in [metadata_df, metadata_noArch_df]:
    df["model_category"] = df["model_task"].apply(get_category)
    

In [44]:
# Display the architecture-less frame to inspect the added model_category column.
metadata_noArch_df

Unnamed: 0,model_name,model_path,model_url,model_task,model_category
0,AIDA-UPM/MSTSb_paraphrase-multilingual-MiniLM-...,AIDA-UPM/MSTSb_paraphrase-multilingual-MiniLM-...,https://huggingface.co/AIDA-UPM/MSTSb_paraphra...,sentence-similarity,natural-language-processing
1,AJ/rick-sanchez-bot,AJ/rick-sanchez-bot,https://huggingface.co/AJ/rick-sanchez-bot,conversational,natural-language-processing
2,AetherIT/DialoGPT-small-Hal,AetherIT/DialoGPT-small-Hal,https://huggingface.co/AetherIT/DialoGPT-small...,conversational,natural-language-processing
3,AkshaySg/langid,AkshaySg/langid,https://huggingface.co/AkshaySg/langid,audio-classification,audio
4,AriakimTaiyo/DialoGPT-medium-Kumiko,AriakimTaiyo/DialoGPT-medium-Kumiko,https://huggingface.co/AriakimTaiyo/DialoGPT-m...,conversational,natural-language-processing
...,...,...,...,...,...
12768,Kenemo/q-learning-frozenlake-v1-no-slippery-4x4,Kenemo/q-learning-frozenlake-v1-no-slippery-4x4,https://huggingface.co/Kenemo/q-learning-froze...,reinforcement-learning,reinforcement-learning
12769,Kenemo/q-learning-frozenlake-v1-no-slippery-8x8,Kenemo/q-learning-frozenlake-v1-no-slippery-8x8,https://huggingface.co/Kenemo/q-learning-froze...,reinforcement-learning,reinforcement-learning
12770,Kenemo/q-learning-frozenlake-v1-4x4,Kenemo/q-learning-frozenlake-v1-4x4,https://huggingface.co/Kenemo/q-learning-froze...,reinforcement-learning,reinforcement-learning
12771,iriscope/oscavatar,iriscope/oscavatar,https://huggingface.co/iriscope/oscavatar,text-to-image,multimodal


In [51]:
# Remove the model_path column from all three frames. errors="ignore" keeps
# the cell re-runnable: the frames are reassigned in place, so a second run
# would otherwise raise KeyError because the column is already gone.
metadata_df = metadata_df.drop(columns="model_path", errors="ignore")
metadata_noArch_df = metadata_noArch_df.drop(columns="model_path", errors="ignore")
metadata_noTask_df = metadata_noTask_df.drop(columns="model_path", errors="ignore")
     

In [53]:
# Write each frame to its CSV file without the index column, overwriting
# any existing file at that path.
for _frame, _csv_path in (
    (metadata_df, '/depot/davisjam/data/wenxin/CS577-NLP/project/metadata.csv'),
    (metadata_noArch_df, '/depot/davisjam/data/wenxin/CS577-NLP/project/metadata_noArch.csv'),
    (metadata_noTask_df, '/depot/davisjam/data/wenxin/CS577-NLP/project/metadata_noTask.csv'),
):
    _frame.to_csv(_csv_path, index=False)