# Extracting the data out of the zip

In [None]:
import zipfile
import json
import sqlite3
import pandas as pd
from pathlib import Path

# Create the SQLite database and the staging table that holds raw Scopus JSON.
# NOTE: sqlite3.connect() creates scopus.db on disk when it is missing, so once
# this has run the file exists even though papers_raw may still be empty.
conn = sqlite3.connect("scopus.db")
try:
    cur = conn.cursor()

    # One row per archive member: its numeric file id, the year it was filed
    # under, and the document serialised as JSON text.
    cur.execute("""
CREATE TABLE IF NOT EXISTS papers_raw (
    file_id INTEGER,
    year INTEGER,
    raw_json TEXT
)
""")

    conn.commit()
finally:
    # Release the database handle even when the DDL above fails.
    conn.close()

# Looks like the archive path of the first 2018 document; it is not referenced
# anywhere in the visible code.
keep = "ScopusData2018-2023/2018/201800000"
# Zip archive that loader() reads the documents from.
zip_path = "ScopusData2018-2023.zip"
# Year labels as strings; not referenced anywhere in the visible code.
years = ["2018", "2019", "2020", "2021", "2022", "2023"]

def loader(year: int, start_id: int, end_id: int, *, archive_path=None,
           db_path: str = "scopus.db"):
    """Copy one year's JSON documents from the zip archive into papers_raw.

    For every id in ``start_id..end_id`` (both inclusive) the member
    ``ScopusData2018-2023/<year>/<id>`` is read, parsed as JSON and stored
    as one ``(file_id, year, raw_json)`` row.

    Args:
        year: Year folder inside the archive; also stored in the ``year`` column.
        start_id: First file id to try.
        end_id: Last file id to try (inclusive).
        archive_path: Zip archive to read. Defaults to the module-level
            ``zip_path``.
        db_path: SQLite database file to write to. The ``papers_raw`` table
            must already exist.

    Ids that are absent from the archive, and members that cannot be read or
    parsed as JSON, are skipped silently. All inserts are committed together
    at the end; if anything else raises, the connection is closed without
    committing, so no partial year is stored.
    """
    source_zip = zip_path if archive_path is None else archive_path

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        with zipfile.ZipFile(source_zip, "r") as z:
            for file_id in range(start_id, end_id + 1):
                inner_path = f"ScopusData2018-2023/{year}/{file_id}"

                try:
                    member = z.open(inner_path)
                except KeyError:
                    # The id range has gaps: no such member in the archive.
                    continue

                with member:
                    try:
                        obj = json.load(member)
                    except Exception:
                        # Deliberately best-effort: a damaged member can fail
                        # while decompressing, decoding or parsing, and one
                        # bad document must not abort the whole year.
                        continue

                # Re-serialise so the stored text is normalised JSON with
                # non-ASCII characters kept readable.
                raw_text = json.dumps(obj, ensure_ascii=False)
                cur.execute(
                    """
                    INSERT INTO papers_raw (file_id, year, raw_json)
                    VALUES (?, ?, ?)
                    """,
                    (file_id, year, raw_text)
                )

        conn.commit()
    finally:
        # Always release the database handle, including on failure paths
        # such as a missing archive.
        conn.close()




In [42]:
# Populate papers_raw from the archive, one year at a time.
#
# The decision is based on which years already have rows, not on whether the
# database file exists: sqlite3.connect() creates scopus.db as soon as the
# setup code runs, so a file-existence test would skip loading on a fresh run.
sql_data_path = Path("scopus.db")

# Inclusive (first_id, last_id) range of file ids to try for each year.
year_id_ranges = {
    2018: (201800000, 201802761),
    2019: (201900000, 201903081),
    2020: (202000000, 202003392),
    2021: (202100000, 202103814),
    2022: (202200000, 202204243),
    2023: (202300000, 202302889),
}

# Assumes papers_raw has already been created; the query raises otherwise.
conn = sqlite3.connect(sql_data_path)
try:
    loaded_years = {
        row[0] for row in conn.execute("SELECT DISTINCT year FROM papers_raw")
    }
finally:
    conn.close()

missing_years = [y for y in year_id_ranges if y not in loaded_years]

if missing_years:
    # loader() commits once per call, so a year is either fully stored or
    # absent; re-running this only fetches the years that are still missing.
    for missing_year in missing_years:
        first_id, last_id = year_id_ranges[missing_year]
        loader(missing_year, first_id, last_id)
else:
    print("scopus.db already exists")

scopus.db already exists


In [None]:
# Pull every stored 2018 document into a DataFrame as a quick sanity check.
query_2018 = """
    SELECT file_id, year, raw_json
    FROM papers_raw
    WHERE year = 2018
    """

conn = sqlite3.connect("scopus.db")
df_2018 = pd.read_sql_query(query_2018, conn)
conn.close()

# Print the frame's dimensions, then its first rows.
print(df_2018.shape, df_2018.head(), sep="\n")

In [None]:
import sqlite3, pandas as pd

# Open the database that holds the raw Scopus documents.
# NOTE(review): this connection is not closed anywhere in the visible code —
# confirm whether later cells reuse it before adding a close().
conn = sqlite3.connect("scopus.db")

# Flatten every papers_raw row into one record using SQLite's JSON functions.
# All paths are rooted at the "abstracts-retrieval-response" object.
#
# Output columns:
#   file_id, year                  copied from papers_raw
#   citation_title, abstracts,
#   publishername, sourcetitle     json_extract from item.bibrecord.head
#   publication_date               'DD/MM/YYYY' built with printf from the
#                                  publicationdate day/month/year parts;
#                                  NULL when any part is missing
#   document_classification_codes  despite the name, holds the ce:doi value:
#                                  the direct item-info path first, otherwise
#                                  the first 'ce:doi' key found by json_tree
#   refcount                       bibliography "@refcount", returned as stored
#   citedbycount                   first non-NULL "citedby-count" among two
#                                  fixed paths and a json_tree search, cast to
#                                  INTEGER
#   authors_deg_name_json          JSON array of "<degrees> <given> <surname>"
#                                  strings; entries with an empty name are
#                                  dropped
#   categories                     JSON array of {subject: abbrev} objects,
#                                  DISTINCT over three candidate paths
#   creator                        first non-empty "<given> <surname>" found
#                                  under dc:creator.author
#   keywords                       JSON array of keyword "$" values gathered
#                                  from two paths with UNION ALL (duplicates
#                                  are therefore kept)
#
# Where a path may hold either a single object or an array, the
# CASE json_type(...) expressions wrap a lone object in json_array() so that
# json_each() always iterates over array elements.
# NOTE(review): the authors path only falls back to an empty array on NULL and
# has no such object/array normalisation — confirm the data never stores a
# single author as a bare object.
SQL = """
SELECT
  file_id,
  year,

  -- basics
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-title"') AS citation_title,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.abstracts')        AS abstracts,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publisher.publishername') AS publishername,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.sourcetitle')             AS sourcetitle,

  /* publication_date: DD/MM/YYYY from day, month, year (NULL if any part missing) */
  CASE
    WHEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   IS NOT NULL
     AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') IS NOT NULL
     AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  IS NOT NULL
    THEN printf('%02d/%02d/%04d',
                CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   AS INTEGER),
                CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') AS INTEGER),
                CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  AS INTEGER))
    ELSE NULL
  END AS publication_date,

  /* ce:doi pulled then aliased as document_classification_codes */
  COALESCE(
    json_extract(raw_json,'$."abstracts-retrieval-response".item."item-info"."itemidlist"."ce:doi"'),
    (SELECT t.value
     FROM json_tree(raw_json, '$."abstracts-retrieval-response"') AS t
     WHERE t.key = 'ce:doi'
     LIMIT 1)
  ) AS document_classification_codes,

  -- counts
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.tail.bibliography."@refcount"') AS refcount,
  CAST(
    COALESCE(
      json_extract(raw_json,'$."abstracts-retrieval-response".coredata."citedby-count"'),
      json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."citedby-count"'),
      (SELECT t.value
       FROM json_tree(raw_json, '$."abstracts-retrieval-response"') AS t
       WHERE t.key = 'citedby-count'
       LIMIT 1)
    ) AS INTEGER
  ) AS citedbycount,

  /* authors (degree + given + surname) as JSON array of strings */
  (
    SELECT json_group_array(name_str)
    FROM (
      SELECT
        TRIM(
          COALESCE(json_extract(a.value,'$."ce:degrees"') || ' ', '') ||
          TRIM(
            COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                     json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
            COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                     json_extract(a.value,'$."ce:surname"'), '')
          )
        ) AS name_str
      FROM json_each(
             COALESCE(
               json_extract(raw_json,'$."abstracts-retrieval-response".authors.author'),
               json_array()
             )
           ) AS a
      WHERE TRIM(
              COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                       json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
              COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                       json_extract(a.value,'$."ce:surname"'), '')
            ) <> ''
    )
  ) AS authors_deg_name_json,

  /* categories (subject → abbrev) as JSON array of {"<name>":"<abbrev>"} */
  (
    SELECT json_group_array(json_object(subject, abbrev))
    FROM (
      SELECT DISTINCT
        json_extract(sa.value,'$."$"')        AS subject,
        json_extract(sa.value,'$."@abbrev"')  AS abbrev
      FROM (
        -- primary
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
        UNION ALL
        -- fallback A
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
        UNION ALL
        -- fallback B
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
      ) AS sa
      WHERE subject IS NOT NULL AND abbrev IS NOT NULL
    )
  ) AS categories,

  /* creator = "<given> <surname>" (first creator found) */
  (
    SELECT name_full
    FROM (
      SELECT TRIM(
               COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                        json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
               COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                        json_extract(a.value,'$."ce:surname"'), '')
             ) AS name_full
      FROM (
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."dc:creator".author'))
            ELSE json_array()
          END
        )
        UNION ALL
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."dc:creator".author'))
            ELSE json_array()
          END
        )
      ) AS a
      WHERE name_full <> ''
      LIMIT 1
    )
  ) AS creator,

  /* ---------- NEW: keywords (JSON array of "$" strings) ---------- */
  (
    SELECT json_group_array(kw_src.kw)
    FROM (
      -- Path 1: head → citation-info → author-keywords
      SELECT json_extract(k.value,'$."$"') AS kw
      FROM json_each(
        CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"'))
          WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"')
          WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-info"."author-keywords"."author-keyword"'))
          ELSE json_array()
        END
      ) AS k
      UNION ALL
      -- Path 2: top-level authkeywords
      SELECT json_extract(k2.value,'$."$"') AS kw
      FROM json_each(
        CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"'))
          WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"')
          WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response"."authkeywords"."author-keyword"'))
          ELSE json_array()
        END
      ) AS k2
    ) AS kw_src
    WHERE kw_src.kw IS NOT NULL
  ) AS keywords

FROM papers_raw
ORDER BY year, file_id;
"""

# Run the query over the whole table; rows come back ordered by year, file_id.
df = pd.read_sql_query(SQL, conn)
# Bare expression: shows the DataFrame when it is the last line of a notebook cell.
df
