# Extracting the Scopus data from the zip archive into SQLite

In [14]:
import zipfile
import json
import sqlite3
import pandas as pd

# --- Archive constants -------------------------------------------------------
# Name of the zip archive that loader() opens.
zip_path = "ScopusData2018-2023.zip"
# Path of one member inside the archive; not referenced by the visible cells.
keep = "ScopusData2018-2023/2018/201800000"
# Year folder names, as strings: "2018" through "2023".
years = [str(y) for y in range(2018, 2024)]

# --- Staging table -----------------------------------------------------------
# papers_raw holds one row per archive member: its numeric id, its year, and
# the JSON document serialised as text.
_PAPERS_RAW_DDL = """
CREATE TABLE IF NOT EXISTS papers_raw (
    file_id INTEGER,
    year INTEGER,
    raw_json TEXT
)
"""

# Create the table if it is not there yet, then release the database file.
conn = sqlite3.connect("scopus.db")
cur = conn.cursor()
cur.execute(_PAPERS_RAW_DDL)
conn.commit()
conn.close()

def loader(year: int, start_id: int, end_id: int, zip_file=None, db_path: str = "scopus.db"):
    """Copy one year's JSON records from the zip archive into ``papers_raw``.

    For every id in ``start_id..end_id`` (inclusive) the archive member
    ``ScopusData2018-2023/<year>/<file_id>`` is parsed as JSON and stored as one
    row ``(file_id, year, raw_json)``. Members that are missing from the
    archive, or whose content cannot be parsed, are skipped.

    The whole call is a single transaction: rows already stored for this
    ``year`` and id range are deleted first and the new rows are committed only
    after the last member has been read. Re-running the call therefore does not
    duplicate rows, and an interrupted call leaves the table unchanged.

    Args:
        year: Year folder inside the archive; also stored in the ``year`` column.
        start_id: First file id to load (inclusive).
        end_id: Last file id to load (inclusive).
        zip_file: Path of the archive. ``None`` (default) uses the module-level
            ``zip_path``.
        db_path: Path of the SQLite database. The ``papers_raw`` table must
            already exist in it.

    Returns:
        None.
    """
    archive = zip_path if zip_file is None else zip_file

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # Clear any earlier load of the same range so a re-run replaces rows
        # instead of appending duplicates.
        cur.execute(
            "DELETE FROM papers_raw WHERE year = ? AND file_id BETWEEN ? AND ?",
            (year, start_id, end_id),
        )

        with zipfile.ZipFile(archive, "r") as z:
            for file_id in range(start_id, end_id + 1):
                inner_path = f"ScopusData2018-2023/{year}/{file_id}"
                try:
                    with z.open(inner_path) as f:
                        try:
                            obj = json.load(f)
                        except Exception:
                            # Deliberate best-effort: an unreadable or
                            # malformed member is skipped, not fatal.
                            continue
                except KeyError:
                    # ZipFile.open raises KeyError when the member is absent.
                    continue

                # Re-serialise so non-ASCII text is stored as-is, not escaped.
                raw_text = json.dumps(obj, ensure_ascii=False)
                cur.execute(
                    """
                    INSERT INTO papers_raw (file_id, year, raw_json)
                    VALUES (?, ?, ?)
                    """,
                    (file_id, year, raw_text)
                )

        conn.commit()
    finally:
        # Always release the database file. If we get here without reaching
        # commit(), the pending transaction is discarded with the connection.
        conn.close()




In [15]:
# Last file id to load for each year. The first id of a year is always
# <year>00000, so only the upper bound needs to be listed.
last_file_id = {
    2018: 201802761,
    2019: 201903081,
    2020: 202003392,
    2021: 202103814,
    2022: 202204243,
    2023: 202302889,
}

# Load the years one after another, oldest first.
for year, end_id in last_file_id.items():
    loader(year, year * 100000, end_id)

KeyboardInterrupt: 

In [None]:
# Sanity check of the load: pull the 2018 rows back out of papers_raw.
query_2018 = """
    SELECT file_id, year, raw_json
    FROM papers_raw
    WHERE year = 2018
    """

conn = sqlite3.connect("scopus.db")
try:
    df_2018 = pd.read_sql_query(query_2018, conn)
finally:
    # Close the connection even when the query fails (e.g. the table is
    # missing), so the database file is not left open by this cell.
    conn.close()

# Row/column count, then the first few rows.
print(df_2018.shape)
print(df_2018.head())

(2762, 3)
     file_id  year                                           raw_json
0  201800000  2018  {"abstracts-retrieval-response": {"item": {"ai...
1  201800001  2018  {"abstracts-retrieval-response": {"item": {"ai...
2  201800002  2018  {"abstracts-retrieval-response": {"item": {"ai...
3  201800003  2018  {"abstracts-retrieval-response": {"item": {"ai...
4  201800004  2018  {"abstracts-retrieval-response": {"item": {"ai...


In [9]:
import sqlite3, pandas as pd

# Flatten the raw JSON stored in papers_raw into one row of named fields per paper.
# Note: the connection opened here is not closed anywhere in this cell.
conn = sqlite3.connect("scopus.db")

# A single SELECT over papers_raw that uses SQLite's JSON functions on raw_json:
#   - citation_title, abstracts, publishername, sourcetitle, refcount:
#       direct json_extract lookups under "abstracts-retrieval-response".item.bibrecord.
#   - publication_date: day/month/year formatted as DD/MM/YYYY; NULL when any part is absent.
#   - "ce:doi": the fixed item-info path first, otherwise the first 'ce:doi' key
#       that json_tree finds under "abstracts-retrieval-response".
#   - authors_deg_name_json: JSON array of "<degrees> <given-name> <surname>" strings,
#       preferring the preferred-name fields; entries whose name part is blank are dropped.
#   - categories: JSON array of {subject: abbrev} objects, collected (DISTINCT) from three
#       candidate "subject-areas" locations, each accepted as an array or a single object.
# Rows come back ordered by year, then file_id.
# NOTE(review): the in-query comments say the last two columns are NULL when empty, but
#   json_group_array over zero rows appears to yield '[]' rather than NULL - confirm
#   before relying on either.
# NOTE(review): authors.author is passed to json_each without the array/object check
#   used for subject-area - presumably it is always an array in this data; verify.
SQL = """
SELECT
  file_id,
  year,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head."citation-title"') AS citation_title,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.abstracts')        AS abstracts,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publisher.publishername') AS publishername,
  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.sourcetitle')             AS sourcetitle,

  /* publication_date (DD/MM/YYYY). NULL if any part missing */
  CASE
    WHEN json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   IS NOT NULL
     AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') IS NOT NULL
     AND json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  IS NOT NULL
    THEN printf(
      '%02d/%02d/%04d',
      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.day')   AS INTEGER),
      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.month') AS INTEGER),
      CAST(json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.head.source.publicationdate.year')  AS INTEGER)
    )
    ELSE NULL
  END AS publication_date,

  /* ce:doi: direct path, else first match anywhere; NULL if absent */
  COALESCE(
    json_extract(raw_json,'$."abstracts-retrieval-response".item."item-info"."itemidlist"."ce:doi"'),
    (SELECT t.value
     FROM json_tree(raw_json, '$."abstracts-retrieval-response"') AS t
     WHERE t.key = 'ce:doi'
     LIMIT 1)
  ) AS "ce:doi",

  json_extract(raw_json,'$."abstracts-retrieval-response".item.bibrecord.tail.bibliography."@refcount"') AS refcount,

  /* authors_deg_name_json (NULL if no authors) */
  (
    SELECT json_group_array(name_str)
    FROM (
      SELECT
        TRIM(
          COALESCE(json_extract(a.value,'$."ce:degrees"') || ' ', '') ||
          TRIM(
            COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                     json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
            COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                     json_extract(a.value,'$."ce:surname"'), '')
          )
        ) AS name_str
      FROM json_each(
             COALESCE(
               json_extract(raw_json,'$."abstracts-retrieval-response".authors.author'),
               json_array()
             )
           ) AS a
      WHERE TRIM(
              COALESCE(json_extract(a.value,'$."preferred-name"."ce:given-name"'),
                       json_extract(a.value,'$."ce:given-name"'), '') || ' ' ||
              COALESCE(json_extract(a.value,'$."preferred-name"."ce:surname"'),
                       json_extract(a.value,'$."ce:surname"'), '')
            ) <> ''
    )
  ) AS authors_deg_name_json,

  /* categories (NULL if none) */
  (
    SELECT json_group_array(json_object(subject, abbrev))
    FROM (
      SELECT DISTINCT
        json_extract(sa.value,'$."$"')        AS subject,
        json_extract(sa.value,'$."@abbrev"')  AS abbrev
      FROM (
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response"."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
        UNION ALL
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".item.coredata."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
        UNION ALL
        SELECT * FROM json_each(
          CASE json_type(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
            WHEN 'array'  THEN json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"')
            WHEN 'object' THEN json_array(json_extract(raw_json,'$."abstracts-retrieval-response".coredata."subject-areas"."subject-area"'))
            ELSE json_array()
          END
        )
      ) AS sa
      WHERE subject IS NOT NULL AND abbrev IS NOT NULL
    )
  ) AS categories

FROM papers_raw
ORDER BY year, file_id;
"""

# Run the query and load the whole result into a DataFrame; the bare `df` on the
# last line makes the frame the cell's displayed output.
df = pd.read_sql_query(SQL, conn)
df

Unnamed: 0,file_id,year,citation_title,abstracts,publishername,sourcetitle,publication_date,ce:doi,refcount,authors_deg_name_json,categories
0,201800000,2018,Public health and international epidemiology f...,,Springer International Publishing,"Radiology in Global Health: Strategies, Implem...",31/12/2018,10.1007/978-3-319-98485-8_15,76,"[""PhD.. Krit Pongpirul"",""VE. Matthew P. Lungren""]","[{""Medicine (all)"":""MEDI""}]"
1,201800001,2018,Flexible Printed Active Antenna for Digital Te...,"© 2018 The Institute of Electronics, Informati...",Institute of Electrical and Electronics Engine...,Progress in Electromagnetics Research Symposium,31/12/2018,10.23919/PIERS.2018.8597669,4,"[""Teerapong Pratumsiri"",""Panuwat Janpugdee""]","[{""Electrical and Electronic Engineering"":""ENG..."
2,201800002,2018,Parametric study of hydrogen production via so...,© 2018 Elsevier LtdComputational fluid dynamic...,Elsevier Ltd,Chemical Engineering Science,31/12/2018,10.1016/j.ces.2018.08.042,42,"[""Kiattikhoon Phuakpunk"",""Benjapon Chalermsins...","[{""Chemistry (all)"":""CHEM""},{""Chemical Enginee..."
3,201800003,2018,Superhydrophobic coating from fluoroalkylsilan...,© 2018 Elsevier B.V. A superhydrophobic/supero...,Elsevier B.V.,Applied Surface Science,31/12/2018,10.1016/j.apsusc.2018.08.059,45,"[""Jittraporn Saengkaew"",""Duy Le"",""Chanatip Sam...","[{""Chemistry (all)"":""CHEM""},{""Condensed Matter..."
4,201800004,2018,Electrochemical impedance-based DNA sensor usi...,© 2018 Elsevier B.V. A label-free electrochemi...,Elsevier B.V.,Analytica Chimica Acta,31/12/2018,10.1016/j.aca.2018.07.045,55,"[""Prinjaporn Teengam"",""Weena Siangproh"",""Adiso...","[{""Analytical Chemistry"":""CHEM""},{""Biochemistr..."
...,...,...,...,...,...,...,...,...,...,...,...
20181,202302885,2023,Long-chain bio-olefins production via oxidativ...,© 2021 Elsevier B.V.Long-chain α-olefins (≥ C1...,Elsevier B.V.,Catalysis Today,01/01/2023,10.1016/j.cattod.2021.07.034,63,"[""Duy Le"",""Nattaporn Chaidherasuwet"",""Atitarn ...","[{""Catalysis"":""CENG""},{""Chemistry (all)"":""CHEM""}]"
20182,202302886,2023,Recent Developments and Applications of Microf...,"© 2021 Taylor & Francis Group, LLC.Nowadays, f...",Taylor and Francis Ltd.,Critical Reviews in Analytical Chemistry,,10.1080/10408347.2021.1949695,115,"[""Waleed Alahmad"",""Puttaruksa Varanusupakul"",""...","[{""Analytical Chemistry"":""CHEM""}]"
20183,202302887,2023,"Social justice, education and peacebuilding: c...",© 2021 The Author(s). Published by Informa UK ...,Routledge,Compare,,10.1080/03057925.2021.1951666,76,"[""Tejendra Pherali""]","[{""Education"":""SOCI""}]"
20184,202302888,2023,Effects of black soldier fly (Hermetia illucen...,© 2021 Taylor & Francis.The effects of replaci...,Taylor and Francis Ltd.,Journal of Applied Aquaculture,,10.1080/10454438.2021.1923609,44,"[""Ratchaneegorn Mapanao"",""Wirat Jiwyam"",""Nudth...","[{""Ecology"":""ENVI""},{""Aquatic Science"":""AGRI""}]"


In [None]:
#hello

#idk