In [1]:
import requests
import json
import time

In [2]:
# Configuration
API_KEY = "80d2590161de42a9b0f6d1582e17a4d6"
headers = {
    "Content-Type": "application/json",
    "api-key": API_KEY,
}

ENDPOINT = "https://yuwa-m2oi18l3-swedencentral.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview"

with open('extracted_data.json', 'r', encoding='utf-8') as file:
    datasets = json.load(file)

#print(datasets[0]['name'])


In [3]:
# List to store results
results = []

In [4]:
def send_request(name, description):
    # Payload for the request
    payload = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": f"""You are a tagging system that identifies and extracts modalities and data types from dataset information (name and description) to categorize datasets and form a comprehensive data repository.
                        
                        This is about a multimodal data task. Please identify and provide tags for modalities and data types that may be included in this dataset. Use detailed tags that capture the essence of the dataset. 
                                Modalities refer to the broad categories of data (e.g., text, image, video). Data types refer to the specific forms or instances of data within each modality (e.g., bird images, captions, sentences).  

                                Requirements: 
                                1. Use consistent tags across all datasets. For example, if a tag for "text" is used, do not switch to other similar tags like "textual." Ensure that modality and data type tags are reused consistently across datasets. Avoid inventing new tags unless necessary.  
                                2. The response should follow this JSON structure: 
                                {{
                                    "modalities": [
                                        {{
                                            "title": "modality_title",
                                            "explanation": "modality_explanation",
                                            "data_types": [
                                                {{
                                                "title": "data_type_title",
                                                "explanation": "data_type_explanation"
                                                }},
                                            ]
                                        }}
                                    ]
                                }}
                                3. Only extract information from the provided dataset(dataset_name and dataset_description), and avoid using overly specific terminology or overly general terms unless the dataset description explicitly calls for it. Keep the tags general but informative. 
                                4. Please provide your response in English.
                                5. Don't use _ in your answer, e.g., use bird images, not bird_images."""
                    }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Below is a dataset's information:  
                                [begin]  
                                {{dataset_name：{name}}}  
                                {{dataset_description: {description}}} 
                                [end]  """
                    }
                ]
            }
        ],
        "temperature": 0.1,
        "top_p": 0.95,
        "max_tokens": 8000
    }

    # Send request
    retries = 3  # Number of retries for 429 errors
    for attempt in range(retries):
        try:
            response = requests.post(ENDPOINT, headers=headers, json=payload)
            response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
            answer = response.json().get('choices', [{}])[0].get('message', {}).get('content', 'No content received.')
            # Parse the JSON content from the answer string
            if answer.startswith("```json") and answer.endswith("```"):
                answer = answer[8:-3].strip()  # Remove the "```json" and trailing "```"
                
            tags_json = json.loads(answer)

            results.append({
                "name": name,
                "tags": tags_json  # Store the response as tags
            })

            break  # Break the loop if the request is successful
        except requests.RequestException as e:
            if response.status_code == 429:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Too many requests for {name}. Waiting for {wait_time} seconds before retrying...")
                time.sleep(wait_time)  # Wait before retrying
            else:
                print(f"Failed to make the request for {name}. Error: {e}")
                break  # Exit on other errors
        except Exception as e:
            print(f"An unexpected error occurred for {name}. Error: {e}")
            break  # Exit on unexpected errors
            

In [5]:
# Iterate over datasets and send requests
for dataset in datasets[:1000]:
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        send_request(name, description)
    else:
        print(f"Dataset entry is missing name or description: {dataset}")

Dataset entry is missing name or description: {'name': 'IPC-grounded', 'description': ''}
Dataset entry is missing name or description: {'name': 'SunYs (Lungvesselct)', 'description': ''}
Dataset entry is missing name or description: {'name': 'Multi-dSprites', 'description': ''}
Dataset entry is missing name or description: {'name': 'CL-SciSumm', 'description': ''}
Dataset entry is missing name or description: {'name': 'RecipeNLG', 'description': ''}


In [6]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [7]:
for dataset in datasets[1000:2000]:
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        send_request(name, description)
    else:
        print(f"Dataset entry is missing name or description: {dataset}")

Dataset entry is missing name or description: {'name': 'BSD100', 'description': ''}
Dataset entry is missing name or description: {'name': 'Cornell', 'description': ''}
Dataset entry is missing name or description: {'name': 'Google', 'description': ''}
Dataset entry is missing name or description: {'name': 'ICVL', 'description': ''}


In [10]:
len(results)

1991

In [9]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [11]:
for dataset in datasets[2000:3000]:
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        send_request(name, description)
    else:
        print(f"Dataset entry is missing name or description: {dataset}")

In [12]:
len(results)

2991

In [13]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [14]:
for dataset in datasets[3000:4000]:
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        send_request(name, description)
    else:
        print(f"Dataset entry is missing name or description: {dataset}")

Dataset entry is missing name or description: {'name': 'The Annotated Gumar Corpus', 'description': ''}
Dataset entry is missing name or description: {'name': 'B-T4SA (B-T4SA)', 'description': ''}
Dataset entry is missing name or description: {'name': 'VESSEL12 (VESsel SEgmentation in the Lung 2012)', 'description': ''}
Dataset entry is missing name or description: {'name': 'trek05-1 (trek05-1)', 'description': ''}
Dataset entry is missing name or description: {'name': 'Drebin (Drebin)', 'description': ''}
Dataset entry is missing name or description: {'name': 'TREC-05 (TREC 2005 Spam Public Corpora)', 'description': ''}
Dataset entry is missing name or description: {'name': 'Falling Objects', 'description': ''}
Dataset entry is missing name or description: {'name': 'TbD-3D', 'description': ''}
Dataset entry is missing name or description: {'name': 'TbD', 'description': ''}
Dataset entry is missing name or description: {'name': 'MULTIMODAL HUMOR', 'description': ''}
Dataset entry is mi

In [15]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [16]:
len(results)

3963

In [18]:
for dataset in datasets[4000:5000]:
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        send_request(name, description)
    else:
        print(f"Dataset entry is missing name or description: {dataset}")

Dataset entry is missing name or description: {'name': 'FB122 (Freebase-122)', 'description': ''}
Dataset entry is missing name or description: {'name': 'Onmiglot', 'description': ''}
Dataset entry is missing name or description: {'name': 'KNNIST', 'description': ''}
Dataset entry is missing name or description: {'name': 'KMNIST', 'description': ''}
Dataset entry is missing name or description: {'name': 'notMNIST', 'description': ''}
Dataset entry is missing name or description: {'name': 'HErlev (HErlev Pap Smear Dataset)', 'description': ''}
Dataset entry is missing name or description: {'name': 'World War III - International signed relations network', 'description': ''}
Dataset entry is missing name or description: {'name': 'WNUT 2016 NER (WNUT 2016 Twitter Named Entity Recognition)', 'description': ''}


In [19]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [20]:
len(results)

4955

In [4]:
with open('results.json', 'r', encoding='utf-8') as result_file:
        results = json.load(result_file)

In [5]:
len(results)

4955

In [6]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[5000:6000], start=5001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")


Processing dataset #5001
Processing dataset #5002
Processing dataset #5003
Processing dataset #5004
Processing dataset #5005
Processing dataset #5006
Processing dataset #5007
Processing dataset #5008
Processing dataset #5009
Processing dataset #5010
Processing dataset #5011
Processing dataset #5012
Processing dataset #5013
Processing dataset #5014
Processing dataset #5015
Processing dataset #5016
Processing dataset #5017
Processing dataset #5018
Processing dataset #5019
Processing dataset #5020
Processing dataset #5021
Processing dataset #5022
Processing dataset #5023
Processing dataset #5024
Processing dataset #5025
Processing dataset #5026
Processing dataset #5027
Processing dataset #5028
Processing dataset #5029
Processing dataset #5030
Processing dataset #5031
Processing dataset #5032
Processing dataset #5033
Processing dataset #5034
Processing dataset #5035
Processing dataset #5036
Processing dataset #5037
Processing dataset #5038
Processing dataset #5039
Processing dataset #5040


In [7]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [9]:
len(results)

5955

In [10]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[6000:7000], start=6001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")


Processing dataset #6001
Processing dataset #6002
Processing dataset #6003
Processing dataset #6004
Processing dataset #6005
Processing dataset #6006
Processing dataset #6007
Processing dataset #6008
Processing dataset #6009
Processing dataset #6010
Processing dataset #6011
Processing dataset #6012
Processing dataset #6013
Processing dataset #6014
Processing dataset #6015
Processing dataset #6016
Processing dataset #6017
Processing dataset #6018
Processing dataset #6019
Processing dataset #6020
Processing dataset #6021
Processing dataset #6022
Processing dataset #6023
Processing dataset #6024
Processing dataset #6025
Processing dataset #6026
Processing dataset #6027
Processing dataset #6028
Processing dataset #6029
Processing dataset #6030
Processing dataset #6031
Processing dataset #6032
Processing dataset #6033
Processing dataset #6034
Processing dataset #6035
Processing dataset #6036
Processing dataset #6037
Processing dataset #6038
Processing dataset #6039
Processing dataset #6040


In [11]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [12]:
len(results)

6955

In [13]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[7000:8000], start=7001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #7001
Processing dataset #7002
Processing dataset #7003
Processing dataset #7004
Processing dataset #7005
Processing dataset #7006
Processing dataset #7007
Processing dataset #7008
Processing dataset #7009
Processing dataset #7010
Processing dataset #7011
Processing dataset #7012
Processing dataset #7013
Processing dataset #7014
Processing dataset #7015
Processing dataset #7016
Processing dataset #7017
Processing dataset #7018
Processing dataset #7019
Processing dataset #7020
Processing dataset #7021
Processing dataset #7022
Processing dataset #7023
Processing dataset #7024
Processing dataset #7025
Processing dataset #7026
Processing dataset #7027
Processing dataset #7028
Processing dataset #7029
Processing dataset #7030
Processing dataset #7031
Processing dataset #7032
Processing dataset #7033
Processing dataset #7034
Processing dataset #7035
Processing dataset #7036
Processing dataset #7037
Processing dataset #7038
Processing dataset #7039
Processing dataset #7040


In [15]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [16]:
len(results)

7955

In [17]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[8000:9000], start=8001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #8001
Processing dataset #8002
Processing dataset #8003
Processing dataset #8004
Processing dataset #8005
Processing dataset #8006
Processing dataset #8007
Processing dataset #8008
Processing dataset #8009
Processing dataset #8010
Processing dataset #8011
Processing dataset #8012
Processing dataset #8013
Processing dataset #8014
Processing dataset #8015
Processing dataset #8016
Processing dataset #8017
Processing dataset #8018
Processing dataset #8019
Processing dataset #8020
Processing dataset #8021
Processing dataset #8022
Processing dataset #8023
Processing dataset #8024
Processing dataset #8025
Processing dataset #8026
Processing dataset #8027
Processing dataset #8028
Processing dataset #8029
Processing dataset #8030
Processing dataset #8031
Processing dataset #8032
Processing dataset #8033
Processing dataset #8034
Processing dataset #8035
Processing dataset #8036
Processing dataset #8037
Processing dataset #8038
Processing dataset #8039
Processing dataset #8040


UnboundLocalError: local variable 'response' referenced before assignment

In [18]:
len(results)

8437

In [19]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[8483:9000], start=8484):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #8484
Processing dataset #8485
Processing dataset #8486
Processing dataset #8487
Processing dataset #8488
Processing dataset #8489
Processing dataset #8490
Processing dataset #8491
Processing dataset #8492
Processing dataset #8493
Processing dataset #8494
Processing dataset #8495
Processing dataset #8496
Processing dataset #8497
Processing dataset #8498
Processing dataset #8499
Processing dataset #8500
Processing dataset #8501
Processing dataset #8502
Processing dataset #8503
Processing dataset #8504
Processing dataset #8505
Processing dataset #8506
Processing dataset #8507
Processing dataset #8508
Processing dataset #8509
Processing dataset #8510
Processing dataset #8511
Processing dataset #8512
Processing dataset #8513
Processing dataset #8514
Processing dataset #8515
Processing dataset #8516
Processing dataset #8517
Processing dataset #8518
Processing dataset #8519
Processing dataset #8520
Processing dataset #8521
Processing dataset #8522
Processing dataset #8523


In [22]:
len(results)

8955

In [21]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[8482:8483], start=8483):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #8483


In [23]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [24]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[9000:10000], start=9001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #9001
Processing dataset #9002
Processing dataset #9003
Processing dataset #9004
Processing dataset #9005
Processing dataset #9006
Processing dataset #9007
Processing dataset #9008
Processing dataset #9009
Processing dataset #9010
Processing dataset #9011
Processing dataset #9012
Processing dataset #9013
Processing dataset #9014
Processing dataset #9015
Processing dataset #9016
Processing dataset #9017
Processing dataset #9018
Processing dataset #9019
Processing dataset #9020
Processing dataset #9021
Processing dataset #9022
Processing dataset #9023
Processing dataset #9024
Processing dataset #9025
Processing dataset #9026
Processing dataset #9027
Processing dataset #9028
Processing dataset #9029
Processing dataset #9030
Processing dataset #9031
Processing dataset #9032
Processing dataset #9033
Processing dataset #9034
Processing dataset #9035
Processing dataset #9036
Processing dataset #9037
Processing dataset #9038
Processing dataset #9039
Processing dataset #9040


In [25]:
len(results)

9893

In [26]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [30]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[10000:10562], start=10001):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #10001
Processing dataset #10002
Processing dataset #10003
Processing dataset #10004
Processing dataset #10005
Processing dataset #10006
Processing dataset #10007
Processing dataset #10008
Processing dataset #10009
Processing dataset #10010
Processing dataset #10011
Processing dataset #10012
Processing dataset #10013
Processing dataset #10014
Processing dataset #10015
Processing dataset #10016
Processing dataset #10017
Processing dataset #10018
Processing dataset #10019
Processing dataset #10020
Processing dataset #10021
Processing dataset #10022
Processing dataset #10023
Processing dataset #10024
Processing dataset #10025
Processing dataset #10026
Processing dataset #10027
Processing dataset #10028
Processing dataset #10029
Processing dataset #10030
Processing dataset #10031
Processing dataset #10032
Processing dataset #10033
Processing dataset #10034
Processing dataset #10035
Processing dataset #10036
Processing dataset #10037
Processing dataset #10038
Processing d

UnboundLocalError: local variable 'response' referenced before assignment

In [31]:
# Process datasets and track progress
for idx, dataset in enumerate(datasets[10474:10562], start=10475):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #10475
Processing dataset #10476
Processing dataset #10477
Processing dataset #10478
Processing dataset #10479
Processing dataset #10480
Processing dataset #10481
Processing dataset #10482
Processing dataset #10483
Processing dataset #10484
Processing dataset #10485
Processing dataset #10486
Processing dataset #10487
Processing dataset #10488
Processing dataset #10489
Processing dataset #10490
Processing dataset #10491
Processing dataset #10492
Processing dataset #10493
Processing dataset #10494
Processing dataset #10495
Processing dataset #10496
Processing dataset #10497
Processing dataset #10498
Processing dataset #10499
Processing dataset #10500
Processing dataset #10501
Processing dataset #10502
Processing dataset #10503
Processing dataset #10504
Processing dataset #10505
Processing dataset #10506
Processing dataset #10507
Processing dataset #10508
Processing dataset #10509
Processing dataset #10510
Processing dataset #10511
Processing dataset #10512
Processing d

In [34]:
# Save results to a JSON file
with open('results.json', 'w', encoding='utf-8') as result_file:
    json.dump(results, result_file, ensure_ascii=False, indent=4)

In [35]:
len(results)

10454

In [37]:
for idx, dataset in enumerate(datasets[10466:10467], start=10467):  # Start from 5001 for clarity
    name = dataset.get('name')
    description = dataset.get('description')
    
    if name and description:
        print(f"Processing dataset #{idx}")  # Print current progress
        send_request(name, description)
    else:
        print(f"Dataset entry #{idx} is missing name or description: {dataset}")

Processing dataset #10467
Failed to make the request for Poser models. Error: 400 Client Error: model_error for url: https://yuwa-m2oi18l3-swedencentral.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-08-01-preview
