# 1. Retrieval of Datasets

In [1]:
import gzip
import shutil
import ijson
import json
import pandas as pd


In [2]:
# Decompress the .gz file to a regular .json file
input_file_path = 'datasets.json.gz'
output_file_path = 'datasets.json'

with gzip.open(input_file_path, 'rb') as f_in:
    with open(output_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [3]:
# Now open the decompressed JSON file
with open('datasets.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
len(data)

10562

In [5]:
def get_json_structure(data, indent=0):
    indent_str = ' ' * indent
    if isinstance(data, dict):
        print(f"{indent_str}{{")
        for key, value in data.items():
            print(f"{indent_str}  \"{key}\": ", end="")
            get_json_structure(value, indent + 4)
        print(f"{indent_str}}}")
    
    elif isinstance(data, list):
        print(f"{indent_str}[")
        if data:
            # Only analyze the first element in the list to determine structure
            get_json_structure(data[0], indent + 4)
        print(f"{indent_str}]")
    
    else:
        print(f"{indent_str}{type(data).__name__}")

# Get the structure of JSON
get_json_structure(data)

[
    {
      "url":         str
      "name":         str
      "full_name":         str
      "homepage":         str
      "description":         str
      "paper":         {
          "title":             str
          "url":             str
        }
      "introduced_date":         str
      "modalities":         [
            str
        ]
      "tasks":         [
            {
              "task":                 str
              "url":                 str
            }
        ]
      "languages":         [
        ]
      "variants":         [
            str
        ]
      "num_papers":         int
      "data_loaders":         [
            {
              "url":                 str
              "repo":                 str
              "frameworks":                 [
                    str
                ]
            }
        ]
    }
]


In [6]:
def print_first_entry(file_path):
    with open(file_path, 'r') as file:
        # Initialize the parser to read items from the array
        parser = ijson.items(file, 'item')
        
        # Get the first item from the parser
        first_item = next(parser, None)
        
        if first_item:
            # Pretty-print the first item
            print(json.dumps(first_item, indent=4))
        else:
            print("No items found in the JSON file.")

# print the first entry
print_first_entry(output_file_path)

{
    "url": "https://paperswithcode.com/dataset/mnist",
    "name": "MNIST",
    "full_name": "",
    "homepage": "http://yann.lecun.com/exdb/mnist/",
    "description": "The **MNIST** database (**Modified National Institute of Standards and Technology** database) is a large collection of handwritten digits. It has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger NIST Special Database 3 (digits written by employees of the United States Census Bureau) and Special Database 1 (digits written by high school students) which contain monochrome images of handwritten digits. The digits have been size-normalized and centered in a fixed-size image. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image

In [7]:
def load_json_to_dataframe(file_path):
    entries = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        # Initialize the parser to read items from the array
        parser = ijson.items(file, 'item')
        
        # Loop through all the items and append to the list
        for item in parser:
            entries.append(item)
    
    # Convert the list of entries into a DataFrame
    df = pd.DataFrame(entries)
    
    return df

# Load the data into a DataFrame
df = load_json_to_dataframe('datasets.json')

In [8]:
df.head(3)

Unnamed: 0,url,name,full_name,homepage,description,paper,introduced_date,warning,modalities,tasks,languages,variants,num_papers,data_loaders
0,https://paperswithcode.com/dataset/mnist,MNIST,,http://yann.lecun.com/exdb/mnist/,The **MNIST** database (**Modified National In...,{'title': 'Gradient-based learning applied to ...,1998-11-01,,[Images],"[{'task': 'Image Classification', 'url': 'http...",[],"[USPS-to-MNIST, MNIST-to-USPS, Rotating MNIST,...",7196,[{'url': 'https://huggingface.co/datasets/ylec...
1,https://paperswithcode.com/dataset/celeba,CelebA,CelebFaces Attributes Dataset,http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html,"CelebFaces Attributes dataset contains 202,599...",{'title': 'Deep Learning Face Attributes in th...,2015-01-01,,[Images],"[{'task': 'Image Classification', 'url': 'http...",[],"[CelebA-wild, CelebA Unaligned, CelebA-Test, C...",3232,[{'url': 'https://pytorch.org/vision/stable/ge...
2,https://paperswithcode.com/dataset/jft-300m,JFT-300M,JFT-300M,,**JFT-300M** is an internal Google dataset use...,{'title': 'Revisiting Unreasonable Effectivene...,2017-07-10,,[Images],"[{'task': 'Image Classification', 'url': 'http...",[Chinese],[JFT-300M],122,[{'url': 'https://github.com/tensorflow/models...


In [9]:
df.to_csv('paperwithcode.csv', index=0, encoding='utf-8')

# 2.Extract Relevant Information

In [11]:
if isinstance(data, list):
    extracted_data = []
    for item in data:
        # retrieve the name and full name
        name = item.get("name", "") 
        full_name = item.get("full_name", "") 
        # combine the name and full name
        combined_name = f"{name} ({full_name})" if full_name else name
        
        extracted_item = {
            "name": combined_name,  
            "description": item.get("description", ""),  # extract the description
        }
        # append the extracted item to the list
        extracted_data.append(extracted_item)

else:
    name = data.get("name", "")  
    full_name = data.get("full_name", "") 

    combined_name = f"{name} ({full_name})" if full_name else name
    
    extracted_data = {
        "name": combined_name,  
        "description": data.get("description", ""),  
    }

# Convert the extracted data into a JSON string
new_json = json.dumps(extracted_data, indent=4)

# Print the new JSON string
print(new_json)

[
    {
        "name": "MNIST",
        "description": "The **MNIST** database (**Modified National Institute of Standards and Technology** database) is a large collection of handwritten digits. It has a training set of 60,000 examples, and a test set of 10,000 examples. It is a subset of a larger NIST Special Database 3 (digits written by employees of the United States Census Bureau) and Special Database 1 (digits written by high school students) which contain monochrome images of handwritten digits. The digits have been size-normalized and centered in a fixed-size image. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image by computing the center of mass of the pixels, and translating the image so as to position this point at the cente

In [12]:
# Save the extracted data to a new JSON file
with open("extracted_data.json", "w", encoding='utf-8') as json_file:
    json.dump(extracted_data, json_file, indent=4)

## 3. Manually Extract Datatypes

```
{
    "name": "MNIST",
    "modalities": [
        "Images"
    ],
    "data_type": [
        "Handwritten digits", 
        "Grayscale images"
    ],
    "data_format": "28x28 Pixel Images",
    "data_size": {
        "training_samples": 60000,
        "test_samples": 10000,
        "number_of_classes":10
    },
}
```

```
{
    "name": "CelebA",
    "modalities": [
        "Images"
    ],
    "data_type": [
        "Face images",
        "Color images"
    ],
    "data_format": "178×218 Pixel Images",
    "data_size": {
        "total_samples": 202599,
        "number_of_classes": 10177,
        "attributes_per_image": 40
    },
}
```

```
{
    "name": "JFT-300M",
    "modalities": ["Images"],
    "data_type": ["Web Images", "Color Images"],
    "data_format": "Varied Resolutions",
    "data_size": {
        "total_samples": 300000000,
        "total_labels": 1000000000,
        "selected_labels": 375000000,
        "labels_per_image": "Multiple"
    },
}
```

```
{
    "name": "GLUE",
    "modalities": ["Text"],
    "data_type": ["Natural Language Text", "Sentences", "Sentence Pairs"],
    "data_format": "Plain text with labels specific to each task",
    "data_size": {
        "number_of_tasks": 9,
        "tasks": {
            "CoLA",
            "SST-2",
            "MRPC",
            "STS-B",
            "QQP",
            "MNLI",
            "QNLI",
            "RTE",
            "WNLI"
        }
    }
}
```

```
{
    "name": "MultiNLI",
    "modalities": ["Text"],
    "data_type": ["Natural Language Text", "Sentence Pairs"],
    "data_format": "Plain text with labels for entailment relations",
    "data_size": {
        "total_samples": 433000,
        "number_of_genres": 10,
        "genres": [
            "Face-to-face",
            "Telephone",
            "9/11",
            "Travel",
            "Letters",
            "Oxford University Press",
            "Slate",
            "Verbatim",
            "Government",
            "Fiction"
        ],
    },
}
```

```
{
    "name": "ImageNet",
    "modalities": ["Images"],
    "data_type": ["Annotated Images", "Color Images"],
    "data_format": "Varied Resolutions",
    "data_size": {
        "total_samples": 14197122,
        "number_of_classes": 21841,
        "bounding_box_annotations": 1034908,
        "images_with_SIFT_features": 1200000,
        "synsets_with_SIFT_features": 1000
    },
}
```

In [53]:
#extract name, description, modalities
df2 = df[['description']]
df2.to_csv('paperwithcode_description.csv', index=0, encoding='utf-8')

df2.head(5)

Unnamed: 0,description
0,The **MNIST** database (**Modified National In...
1,"CelebFaces Attributes dataset contains 202,599..."
2,**JFT-300M** is an internal Google dataset use...
3,General Language Understanding Evaluation (**G...
4,The **Multi-Genre Natural Language Inference**...


In [12]:
# Explode the modalities into separate rows to allow grouping
df2['modalities'] = df2['modalities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)  # Convert string lists into actual lists
df_exploded = df2.explode('modalities')

# Group by 'modalities' and collect dataset names under each modality
grouped_datasets = df_exploded.groupby('modalities')['name'].apply(list).reset_index()

# Save the grouped datasets to a new CSV file
grouped_datasets.to_csv('grouped_by_modalities.csv', index=False, encoding='utf-8')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['modalities'] = df2['modalities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)  # Convert string lists into actual lists


# 4. Generate Key Phrases

In [13]:
!pip3 install nltk



In [14]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willa\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\willa\AppData\Roaming\nltk_data...


True

In [16]:
from nltk.corpus import wordnet as wn

def get_synonyms(word):
    synonyms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

image_synonyms = get_synonyms('images')
print("Synonyms for 'images':", image_synonyms)


Synonyms for 'images': ['fancy', 'persona', 'picture', 'epitome', 'image', 'look-alike', 'range', 'see', 'trope', 'ikon', 'icon', 'prototype', 'mental_image', 'visualise', 'double', 'visualize', 'simulacrum', 'project', 'figure_of_speech', 'paradigm', 'envision', 'figure', 'range_of_a_function', 'effigy']


## 5. INSTAG

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pipe = pipeline("text-generation", model="OFA-Sys/InsTagger")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]