In [2]:
from database import ResearchPaperDB
import json

# Names passed to ResearchPaperDB below (presumably the database and the
# collection inside it; confirm against the ResearchPaperDB constructor).
DB_NAME = "twinning_papers"
COLLECTION_NAME = "papers"
# Module-level handle that insert_papers() uses to store each record.
db = ResearchPaperDB(DB_NAME, COLLECTION_NAME)

def read_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A list holding one parsed JSON value per non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which differs between platforms and can corrupt non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank or whitespace-only lines (e.g. an extra newline at the end of
        # the file) carry no record and would make json.loads raise.
        return [json.loads(line) for line in file if line.strip()]

def insert_papers(papers):
    """Insert each distinct paper record into the database, printing progress.

    For every record the fields 'title', 'abstract', 'year', 'url', 'authors',
    'externalIds' and 'openAccessPdf' are read and passed to
    ``db.insert_paper``; authors are reduced to a list of their 'name' values.
    A record whose 'paperId' was already seen earlier in this call is skipped.

    Args:
        papers: Sized sequence of dict-like paper records (``len()`` and
            ``.get()`` are used on it and on its items respectively).

    Returns:
        None. Results are written through the module-level ``db`` handle and
        reported on stdout.
    """
    processed_paper_ids = set()  # IDs already inserted during this call
    paper_count = len(papers)    # Loop-invariant; computed once
    total_counter = 0            # Records examined, including skipped ones
    inserted_counter = 0         # Records actually inserted

    # Counting via enumerate makes total_counter cover skipped duplicates too,
    # so the final "processed" figure and the progress fraction reflect the
    # real position in the input rather than mirroring inserted_counter.
    for total_counter, paper in enumerate(papers, start=1):
        paper_id = paper.get('paperId')
        # Deduplicate only on a real ID: records lacking 'paperId' all share
        # None and would otherwise be dropped after the first one.
        if paper_id is not None and paper_id in processed_paper_ids:
            print(f"Paper ID {paper_id} already processed. Skipping.")
            continue

        title = paper.get('title', '') or ''
        abstract = paper.get('abstract', '') or ''
        year = paper.get('year', '')
        url = paper.get('url', '')
        # 'authors' may be present but null; treat that like a missing key so
        # the comprehension below does not fail on None.
        authors = paper.get('authors') or []
        external_id = paper.get('externalIds', {})
        open_access_pdf = paper.get('openAccessPdf', '')

        # Extracting author names
        author_names = [author.get('name', '') for author in authors]

        db.insert_paper(title=title, abstract=abstract, year=year, url=url, authors=author_names, external_id=external_id, open_access_pdf=open_access_pdf)
        if paper_id is not None:
            processed_paper_ids.add(paper_id)  # Mark this paper as processed
        inserted_counter += 1
        print(f"Inserted paper {inserted_counter}: {title}")

        progress_percentage = (total_counter / paper_count) * 100
        print(f"Progress: {total_counter}/{paper_count} ({progress_percentage:.2f}%)")

    print(f"Total papers processed: {total_counter}")
    print(f"Total papers inserted: {inserted_counter}")

# Driver: parse every record from the JSONL file in the working directory,
# then insert them through insert_papers(), which prints per-paper progress.
extended_papers = read_jsonl('papers_extended.jsonl')
print("Inserting extended papers...")
insert_papers(extended_papers)


Inserting extended papers...
Inserted paper 1: Twins: a study of heredity and environment
Progress: 1/991 (0.10%)
Inserted paper 2: Twins: A Study of Heredity and Environment.
Progress: 2/991 (0.20%)
Inserted paper 3: Characterising the Digital Twin: A systematic literature review
Progress: 3/991 (0.30%)
Inserted paper 4: De novo and inherited CNVs in MZ twin pairs selected for discordance and concordance on Attention Problems
Progress: 4/991 (0.40%)
Inserted paper 5: Five years of GWAS discovery.
Progress: 5/991 (0.50%)
Inserted paper 6: OpenMx: An Open Source Extended Structural Equation Modeling Framework
Progress: 6/991 (0.61%)
Inserted paper 7: GCTA: a tool for genome-wide complex trait analysis.
Progress: 7/991 (0.71%)
Inserted paper 8: PLINK: a tool set for whole-genome association and population-based linkage analyses.
Progress: 8/991 (0.81%)
Inserted paper 9: Theory and practice in quantitative genetics.
Progress: 9/991 (0.91%)
Inserted paper 10: Classical twin studies and bey