In [1]:
from google.cloud import bigquery
from google.cloud.bigquery import LoadJobConfig
from google.cloud.bigquery import SchemaField
from google.cloud.exceptions import NotFound

import pandas as pd
import numpy as np
from datasets import load_dataset
from tqdm import tqdm, trange

from dotenv import load_dotenv
load_dotenv()

def upload_to_bigquery(df, table_id, schema=None, table_description=None):
    """
    Upload a DataFrame to BigQuery, appending when the table already exists
    and creating the table first when it does not.

    Args:
        df: pandas DataFrame containing the data to upload
        table_id: str, full path to BigQuery table (project.dataset.table)
        schema: list of SchemaField objects defining the table schema.
               Required only when creating a new table; ignored when the
               table already exists (the existing table's schema is used).
        table_description: str, description of the table contents and usage.
                         Used only when creating a new table.

    Returns:
        None

    Raises:
        ValueError: the table does not exist and no schema was provided.
        Other errors raised by the BigQuery client while looking up the
        table, creating it or running the load job are not caught here.
    """
    # Initialize BigQuery client
    client = bigquery.Client()

    # Check if table exists. Only NotFound means "missing"; any other failure
    # of the lookup must surface instead of being mistaken for a missing table.
    try:
        table = client.get_table(table_id)
        table_exists = True
    except NotFound:
        table_exists = False

    # Configure load job
    job_config = LoadJobConfig()

    if table_exists:
        # Append mode for existing table, reusing the schema it already has
        job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
        job_config.schema = table.schema
    else:
        # Create new table
        if schema is None:
            raise ValueError("Schema must be provided when creating a new table")

        job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
        job_config.schema = schema

        # Create table with schema and description
        table = bigquery.Table(table_id, schema=schema)
        if table_description:
            table.description = table_description
        table = client.create_table(table, exists_ok=True)

    # Load data from DataFrame
    job = client.load_table_from_dataframe(
        df,
        table,
        job_config=job_config
    )

    # Wait for job completion
    job.result()

    action = "appended to" if table_exists else "loaded into new"
    print(f"Successfully {action} {job.output_rows} rows to table {table.path}")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Source dataset on Hugging Face, pinned to one commit.
huggingface_path = 'ErzhuoShao/SciSciGPT-SciSciNet'
revision = 'ef5553f34410575c8cab8ad209a7b11a4253b2b6'

# The last two '/'-separated components of GOOGLE_BIGQUERY_URI are taken as
# the project id and the dataset id. Fail early with a readable message when
# the variable is missing or malformed instead of an AttributeError on None.
bigquery_uri = os.getenv("GOOGLE_BIGQUERY_URI")
if not bigquery_uri:
    raise ValueError(
        "Environment variable GOOGLE_BIGQUERY_URI is not set; "
        "expected a value ending in '<project_id>/<dataset_id>'"
    )
uri_parts = bigquery_uri.rstrip('/').split('/')[-2:]
if len(uri_parts) != 2 or not all(uri_parts):
    raise ValueError(
        f"Cannot parse project and dataset from GOOGLE_BIGQUERY_URI={bigquery_uri!r}; "
        "expected a value ending in '<project_id>/<dataset_id>'"
    )
project_id, dataset_id = uri_parts
bigquery_dataset = f'{project_id}.{dataset_id}'

max_shard_num = 3 # Should be changed to np.inf to upload the whole dataset

client = bigquery.Client(project=project_id)

try:
    # Try to get the dataset
    dataset = client.get_dataset(dataset_id)
    print(f"Dataset {dataset_id} already exists")
except NotFound:
    # Create the dataset if it doesn't exist
    dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
    dataset.location = "US"  # Set location as needed
    dataset.description = "SciSciNet dataset replication containing papers, authors, citations and other scientific metadata"

    dataset = client.create_dataset(dataset, timeout=30)
    print(f"Created dataset {dataset.dataset_id}")

# Clear all existing tables in the dataset before uploading.
# WARNING: destructive - every table in the dataset is deleted.
tables = client.list_tables(dataset_id)
table_list = list(tables)
print(f"Found {len(table_list)} existing tables. Deleting...")
for table in table_list:
    client.delete_table(table.reference)
    print(f"Deleted table: {table.table_id}")
print("All tables cleared successfully.")

Dataset SciSciNet already exists
Found 19 existing tables. Deleting...
Deleted table: authors
Deleted table: fields
Deleted table: institutions
Deleted table: nct
Deleted table: newsfeed
Deleted table: nih
Deleted table: nsf
Deleted table: paper_author_affiliations
Deleted table: paper_citations
Deleted table: paper_fields
Deleted table: paper_nct
Deleted table: paper_newsfeed
Deleted table: paper_nih
Deleted table: paper_nsf
Deleted table: paper_patents
Deleted table: paper_twitter
Deleted table: papers
Deleted table: patents
Deleted table: twitter
All tables cleared successfully.


### Papers

In [3]:
# Column definitions for the papers table.
schema_fields = [
    SchemaField("paper_id", "INTEGER", mode="REQUIRED", description="(Primary Key) Paper Unique Identifier"),
    SchemaField("doi", "STRING", mode="NULLABLE", description="Digital Object Identifier"),
    # Publication Info
    SchemaField("doc_type", "STRING", mode="NULLABLE", description="Document type. Options include Conference, Journal, Thesis, Book, BookChapter, Repository, Dataset"),
    SchemaField("year", "INTEGER", mode="NULLABLE", description="Publication year"),
    SchemaField("date", "STRING", mode="NULLABLE", description="Publication date"),
    SchemaField("author_count", "INTEGER", mode="NULLABLE", description="Number of authors"),
    SchemaField("institution_count", "INTEGER", mode="NULLABLE", description="Number of institutions the authors are affiliated with"),
    # Journal & Conference
    SchemaField("journal_id", "INTEGER", mode="NULLABLE", description="Journal Unique Identifier in which the paper is published, if applicable."),
    SchemaField("journal_name", "STRING", mode="NULLABLE", description="Journal name"),
    SchemaField("journal_issn", "STRING", mode="NULLABLE", description="Journal ISSN code"),
    SchemaField("journal_publisher", "STRING", mode="NULLABLE", description="Journal publisher"),
    SchemaField("journal_url", "STRING", mode="NULLABLE", description="Journal web URL"),
    SchemaField("conference_id", "INTEGER", mode="NULLABLE", description="Conference Unique Identifier, if applicable."),
    SchemaField("conference_name", "STRING", mode="NULLABLE", description="Conference name"),
    SchemaField("conference_abbr_name", "STRING", mode="NULLABLE", description="Conference Abbreviated name"),
    # Paper Metrics
    SchemaField("citation_count", "INTEGER", mode="NULLABLE", description="Total number of citations received by the paper."),
    SchemaField("citation_count_10y", "INTEGER", mode="NULLABLE", description="Number of citations received within 10 years of publication."),
    SchemaField("citation_count_5y", "INTEGER", mode="NULLABLE", description="Number of citations received within 5 years of publication."),
    SchemaField("reference_count", "INTEGER", mode="NULLABLE", description="Number of references cited by the paper."),
    SchemaField("disruption_score", "FLOAT", mode="NULLABLE", description="Disruption score indicating the paper's impact in displacing prior work in its field. Its value spans from -1.0 to 1.0, with higher values indicating more disruption"),
    SchemaField("novelty_score", "FLOAT", mode="NULLABLE", description="Novelty score, based on the top 10 percentile of Z-score of reference pairs, representing the paper's atypicality in terms of knowledge combination. Lower values indicate higher novelty"),
    SchemaField("conventionality_score", "FLOAT", mode="NULLABLE", description="Conventionality score, based on the median percentile of Z-score of reference pairs, representing the paper's conventionality in terms of knowledge combination. Higher values indicate higher conventionality"),
    SchemaField("title", "STRING", mode="NULLABLE", description="Paper title"),
    SchemaField("abstract", "STRING", mode="NULLABLE", description="Paper abstract"),
    SchemaField("abstract_embedding", "FLOAT64", mode="REPEATED", description="Paper abstract embedding. A 768-dimensional dense vector, generated by the TEXT_EMBEDDING function, which captures the semantic meaning of the text."),
]

# Destination table and the description stored on it at creation time.
table_id = f"{bigquery_dataset}.papers"
table_description = """Each paper's id, publication time, authorship, venue, title, impact metrics, abstract, embeddings, and many other details"""

# Upload shard by shard. The loop is capped at 100 shards
# (presumably the number of paper shards in the repository - TODO confirm)
# and further limited by max_shard_num.
for shard_id in trange(min(max_shard_num, 100)):
    print(f"Downloading shard {shard_id} from Hugging Face")
    df = load_dataset(
        huggingface_path, revision=revision, split="train",
        data_files=f"papers/shard_{shard_id:02d}.parquet"
    ).to_pandas()
    print(f"Uploading shard {shard_id} to Google BigQuery")
    upload_to_bigquery(
        df=df,
        table_id=table_id,
        schema=schema_fields if shard_id == 0 else None,  # Only provide schema for first partition
        table_description=table_description if shard_id == 0 else None  # Only provide description for first partition
    )

  0%|          | 0/3 [00:00<?, ?it/s]

Downloading shard 0 from Hugging Face


Generating train split: 111927 examples [00:01, 94607.16 examples/s]


Uploading shard 0 to Google BigQuery


 33%|███▎      | 1/3 [01:04<02:09, 65.00s/it]

Successfully loaded into new 111927 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/papers
Downloading shard 1 from Hugging Face


Generating train split: 111927 examples [00:00, 133383.30 examples/s]


Uploading shard 1 to Google BigQuery


 67%|██████▋   | 2/3 [02:01<01:00, 60.11s/it]

Successfully appended to 111927 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/papers
Downloading shard 2 from Hugging Face


Generating train split: 111927 examples [00:00, 128421.16 examples/s]


Uploading shard 2 to Google BigQuery


100%|██████████| 3/3 [03:14<00:00, 64.99s/it]

Successfully appended to 111927 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/papers





### Institutions

In [4]:
# Column definitions for the institutions table.
schema_fields = [
    SchemaField(
        "institution_id", "INTEGER", mode="REQUIRED",
        description="(Primary Key) Institution Unique Identifier",
    ),
    SchemaField(
        "institution_name", "STRING", mode="NULLABLE",
        description="Institution's official name",
    ),
    SchemaField(
        "grid_id", "STRING", mode="NULLABLE",
        description="Institution's Global Research Identifier Database (GRID) ID",
    ),
    SchemaField(
        "url", "STRING", mode="NULLABLE",
        description="Institution's official webpage URL",
    ),
    SchemaField(
        "latitude", "FLOAT", mode="NULLABLE",
        description="Institution's geographical latitude",
    ),
    SchemaField(
        "longitude", "FLOAT", mode="NULLABLE",
        description="Institution's geographical longitude",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.institutions"
table_description = "Each institution's id, name, webpage url, and geographical coordinate."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="institutions.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Successfully loaded into new 6969 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/institutions


### Authors

In [5]:
# Column definitions for the authors table.
schema_fields = [
    SchemaField(
        "author_id", "INTEGER", mode="REQUIRED",
        description="(Primary Key) Author Unique Identifier",
    ),
    SchemaField("author_name", "STRING", description="Author's name"),
    SchemaField(
        "author_gender", "STRING",
        description="Author's gender. Options include 'male', 'female', and 'unknown'.",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.authors"
table_description = "Each author's id, name and gender."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="authors.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 8102535 examples [00:00, 25018400.57 examples/s]


Successfully loaded into new 8102535 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/authors


### Fields

In [6]:
# Column definitions for the fields table.
schema_fields = [
    SchemaField(
        "field_id", "INTEGER", mode="REQUIRED",
        description="(Primary Key) A unique identifier for each field",
    ),
    SchemaField(
        "field_name", "STRING",
        description="The name of the research field",
    ),
    SchemaField(
        "field_level", "STRING",
        description="The level of the research field, categorizing it as either 'top' or 'sub'",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.fields"
table_description = "Each research field's id, name and field level."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="fields.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Successfully loaded into new 311 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/fields


### NIH

In [7]:
# Column definitions for the nih table (a single key column).
schema_fields = [
    SchemaField(
        "nih_project_id", "STRING", mode="REQUIRED",
        description="(Primary Key) A unique identifier for each NIH project",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.nih"
table_description = "Each national institutes of health (NIH) project's id."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="nih.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 308741 examples [00:00, 18547295.31 examples/s]


Successfully loaded into new 308741 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/nih


### NSF

In [8]:
# Column definitions for the nsf table.
schema_fields = [
    SchemaField(
        "nsf_award_id", "STRING", mode="REQUIRED",
        description="(Primary Key) A unique identifier for each NSF funding",
    ),
    SchemaField("date", "STRING", description="The date of the NSF award"),
    SchemaField("title", "STRING", description="The title of the NSF award"),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.nsf"
table_description = "Each national science foundation (NSF) funding's id, date and title."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="nsf.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 120555 examples [00:00, 4925666.74 examples/s]


Successfully loaded into new 120555 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/nsf


### Clinical Trial

In [9]:
# Column definitions for the nct table (a single key column).
schema_fields = [
    SchemaField(
        "nct_id", "STRING", mode="REQUIRED",
        description="(Primary Key) A unique identifier for each clinical trial",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.nct"
table_description = "Each clinical trial's id."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="nct.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 29811 examples [00:00, 6961937.45 examples/s]


Successfully loaded into new 29811 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/nct


### Newsfeeds

In [10]:
# Column definitions for the newsfeed table.
# The column descriptions are stored in BigQuery, so the "indentifier"
# misspelling is corrected here.
schema_fields = [
    SchemaField("newsfeed_id", "STRING", mode="REQUIRED", description="(Primary Key) A unique identifier for each newsfeed, which is also its URL"),
    SchemaField("date", "STRING", description="The date of the newsfeed"),
    SchemaField("title", "STRING", description="The title of the newsfeed")
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.newsfeed"
table_description = "Each newsfeed's id, date and title."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path, revision=revision, split="train",
    data_files="newsfeed.parquet"
).to_pandas()

upload_to_bigquery(df=df, table_id=table_id, schema=schema_fields, table_description=table_description)

Generating train split: 103543 examples [00:00, 4757474.52 examples/s]


Successfully loaded into new 103543 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/newsfeed


### Tweets

In [11]:
# Column definitions for the twitter table.
# The column descriptions are stored in BigQuery, so the "indentifier"
# misspelling is corrected here.
schema_fields = [
    SchemaField("tweet_id", "INTEGER", mode="REQUIRED", description="(Primary Key) A unique identifier for each tweet"),
    SchemaField("date", "STRING", description="The date of the tweet"),
    SchemaField("url", "STRING", description="The URL of the tweet")
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.twitter"
table_description = "Each tweet's id, date and URL."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path, revision=revision, split="train",
    data_files="twitter.parquet"
).to_pandas()

upload_to_bigquery(df=df, table_id=table_id, schema=schema_fields, table_description=table_description)

Generating train split: 9576426 examples [00:00, 21273516.75 examples/s]


Successfully loaded into new 9576426 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/twitter


### Patents

In [12]:
# Column definitions for the patents table.
schema_fields = [
    SchemaField("patent_id", "STRING", mode="REQUIRED", description="(Primary Key) patent Unique Identifier"),
    SchemaField("type", "STRING", description="The type of patent (e.g. utility)"),
    SchemaField("date", "STRING", description="The date the patent was granted"),
    SchemaField("year", "INTEGER", description="The year the patent was granted"),
    SchemaField("title", "STRING", mode="NULLABLE", description="patent title"),
    SchemaField("abstract", "STRING", mode="NULLABLE", description="patent abstract"),
    SchemaField("abstract_embedding", "FLOAT64", mode="REPEATED", description="patent abstract embedding"),
]

# Destination table. The description now names every column group in the
# schema above (it previously stopped at "year").
table_id = f"{bigquery_dataset}.patents"
table_description = "Each patent's id, type, date, year, title, abstract, and abstract embedding."

# Upload shard by shard. The loop is capped at 50 shards
# (presumably the number of patent shards in the repository - TODO confirm)
# and further limited by max_shard_num.
for shard_id in trange(min(max_shard_num, 50)):
    print(f"Downloading shard {shard_id} from Hugging Face")
    df = load_dataset(
        huggingface_path, revision=revision, split="train",
        data_files=f"patents/shard_{shard_id:02d}.parquet"
    ).to_pandas()
    print(f"Uploading shard {shard_id} to Google BigQuery")
    upload_to_bigquery(
        df=df,
        table_id=table_id,
        schema=schema_fields if shard_id == 0 else None,  # Only provide schema for first partition
        table_description=table_description if shard_id == 0 else None  # Only provide description for first partition
    )

  0%|          | 0/3 [00:00<?, ?it/s]

Downloading shard 0 from Hugging Face


Generating train split: 179602 examples [00:01, 121799.14 examples/s]


Uploading shard 0 to Google BigQuery


 33%|███▎      | 1/3 [01:38<03:16, 98.08s/it]

Successfully loaded into new 179602 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/patents
Downloading shard 1 from Hugging Face


Generating train split: 179602 examples [00:01, 125327.63 examples/s]


Uploading shard 1 to Google BigQuery


 67%|██████▋   | 2/3 [03:28<01:45, 105.48s/it]

Successfully appended to 179602 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/patents
Downloading shard 2 from Hugging Face


Generating train split: 179602 examples [00:01, 173307.64 examples/s]


Uploading shard 2 to Google BigQuery


100%|██████████| 3/3 [04:06<00:00, 82.05s/it] 

Successfully appended to 179602 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/patents





### Links

##### Paper Author Affiliations

In [13]:
# Column definitions for the paper_author_affiliations link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "author_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to authors",
    ),
    SchemaField(
        "institution_id", "INTEGER",
        description="(Foreign Key) Links to institutions",
    ),
    SchemaField(
        "author_order", "INTEGER", mode="REQUIRED",
        description="Numeric order representing the author's position in the list of authors for the paper",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_author_affiliations"
table_description = "Many-to-many-to-many relationships between papers, authors, and their affiliated institutions."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_author_affiliations.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 33038221 examples [00:00, 37390007.99 examples/s]


Successfully loaded into new 33038221 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_author_affiliations


##### Paper Citations

In [14]:
# Column definitions for the paper_citations link table.
schema_fields = [
    SchemaField(
        "citing_paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to citing paper",
    ),
    SchemaField(
        "cited_paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to cited paper",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_citations"
table_description = "Many-to-many citation relationships between papers."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_citations.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 78417834 examples [00:01, 62694984.78 examples/s]


Successfully loaded into new 78417834 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_citations


##### Paper Fields

In [15]:
# Column definitions for the paper_fields link table.
schema_fields = [
    SchemaField("paper_id", "INTEGER", mode="REQUIRED", description="(Foreign Key) Links to papers"),
    SchemaField("field_id", "INTEGER", mode="REQUIRED", description="(Foreign Key) Links to fields"),
    SchemaField("is_hit_1pct", "BOOLEAN", mode="REQUIRED", description="If the paper is in top 1% cited papers within its field and publication year"),
    SchemaField("is_hit_5pct", "BOOLEAN", mode="REQUIRED", description="If the paper is in top 5% cited papers within its field and publication year"),
    SchemaField("is_hit_10pct", "BOOLEAN", mode="REQUIRED", description="If the paper is in top 10% cited papers within its field and publication year"),
    SchemaField("normalized_citations", "FLOAT", description="Number of citations normalized by field and year")
]

# Destination table. The description is stored on the BigQuery table, so the
# missing space in "their research" is corrected here.
table_id = f"{bigquery_dataset}.paper_fields"
table_description = "Many-to-many relationships between papers and their research fields."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path, revision=revision, split="train",
    data_files="paper_fields.parquet"
).to_pandas()

upload_to_bigquery(df=df, table_id=table_id, schema=schema_fields, table_description=table_description)

Generating train split: 27108203 examples [00:00, 36524912.67 examples/s]


Successfully loaded into new 27108203 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_fields


##### Paper Patents

In [16]:
# Column definitions for the paper_patents link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to cited papers",
    ),
    SchemaField(
        "patent_id", "STRING", mode="REQUIRED",
        description="(Foreign Key) Links to citing patents",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_patents"
table_description = "Many-to-many relationships between papers and their patent citations."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_patents.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 6908589 examples [00:00, 38635166.35 examples/s]


Successfully loaded into new 6908589 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_patents


##### Paper NCT

In [17]:
# Column definitions for the paper_nct link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "nct_id", "STRING", mode="REQUIRED",
        description="(Foreign Key) Links to clinical trials",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_nct"
table_description = "Many-to-many relationships between papers and clinical trials."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_nct.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 85676 examples [00:00, 12544550.36 examples/s]


Successfully loaded into new 85676 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_nct


##### Paper NIH

In [18]:
# Column definitions for the paper_nih link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "nih_project_id", "STRING", mode="REQUIRED",
        description="(Foreign Key) Links to NIH projects",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_nih"
table_description = "Many-to-many relationships between papers and National Institutes of Health (NIH) projects."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_nih.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 2967239 examples [00:00, 28460003.03 examples/s]


Successfully loaded into new 2967239 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_nih


##### Paper NSF

In [19]:
# Column definitions for the paper_nsf link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "nsf_award_id", "STRING", mode="REQUIRED",
        description="(Foreign Key) Links to NSF awards",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_nsf"
table_description = "Many-to-many relationships between papers and National Science Foundation (NSF) awards."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_nsf.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 635033 examples [00:00, 24289792.19 examples/s]


Successfully loaded into new 635033 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_nsf


##### Paper Newsfeed

In [20]:
# Column definitions for the paper_newsfeed link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "newsfeed_id", "STRING", mode="REQUIRED",
        description="(Foreign Key) Links to newsfeeds",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_newsfeed"
table_description = "Many-to-many relationships between papers and newsfeeds."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_newsfeed.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 133322 examples [00:00, 6136818.05 examples/s]


Successfully loaded into new 133322 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_newsfeed


##### Paper Twitter

In [21]:
# Column definitions for the paper_twitter link table.
schema_fields = [
    SchemaField(
        "paper_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to papers",
    ),
    SchemaField(
        "tweet_id", "INTEGER", mode="REQUIRED",
        description="(Foreign Key) Links to tweets",
    ),
]

# Destination table and its description.
table_id = f"{bigquery_dataset}.paper_twitter"
table_description = "Many-to-many relationships between papers and tweets."

# Read the parquet file from the Hugging Face repository into a DataFrame.
df = load_dataset(
    huggingface_path,
    revision=revision,
    split="train",
    data_files="paper_twitter.parquet",
).to_pandas()

upload_to_bigquery(
    df=df,
    table_id=table_id,
    schema=schema_fields,
    table_description=table_description,
)

Generating train split: 9267893 examples [00:00, 54694339.09 examples/s]


Successfully loaded into new 9267893 rows to table /projects/ksm-rch-sciscigpt/datasets/SciSciNet/tables/paper_twitter
