# Extracting the data out of the zip

In [2]:
import zipfile
import json
import sqlite3
import pandas as pd

conn = sqlite3.connect("scopus.db")
cur = conn.cursor()

cur.execute("""
CREATE TABLE IF NOT EXISTS papers_raw (
    file_id INTEGER,
    year INTEGER,
    raw_json TEXT
)
""")

conn.commit()
conn.close()

keep = "ScopusData2018-2023/2018/201800000"
zip_path = "ScopusData2018-2023.zip"
years = ["2018", "2019", "2020", "2021", "2022", "2023"]

def loader(year: int, start_id: int, end_id: int):
    conn = sqlite3.connect("scopus.db")
    cur = conn.cursor()

    with zipfile.ZipFile(zip_path, "r") as z:
        for file_id in range(start_id, end_id + 1):
            inner_path = f"ScopusData2018-2023/{year}/{file_id}"
            try:
                with z.open(inner_path) as f:
                    try:
                        obj = json.load(f)
                    except Exception:
                        continue
            except KeyError:
                continue
            raw_text = json.dumps(obj, ensure_ascii=False)
            cur.execute(
                """
                INSERT INTO papers_raw (file_id, year, raw_json)
                VALUES (?, ?, ?)
                """,
                (file_id, year, raw_text)
            )
    conn.commit()
    conn.close()




In [3]:
# 2018
loader(2018, 201800000, 201802761)

# 2019
loader(2019, 201900000, 201903081)

# 2020
loader(2020, 202000000, 202003392)

# 2021
loader(2021, 202100000, 202103814)

# 2022
loader(2022, 202200000, 202204243)

# 2023
loader(2023, 202300000, 202302889)

# Run the first two part of the code first to create a Database and the reason for this is because its much faster than saving in a list or dictionary. (I don't have to load 5.63 GB from extracting the zip file)

In [4]:
conn = sqlite3.connect("scopus.db")

df_2018 = pd.read_sql_query(
    """
    SELECT file_id, year, raw_json
    FROM papers_raw
    WHERE year = 2018
    """,
    conn
)

conn.close()

print(df_2018.shape)
print(df_2018.head())

(8286, 3)
     file_id  year                                           raw_json
0  201800000  2018  {"abstracts-retrieval-response": {"item": {"ai...
1  201800001  2018  {"abstracts-retrieval-response": {"item": {"ai...
2  201800002  2018  {"abstracts-retrieval-response": {"item": {"ai...
3  201800003  2018  {"abstracts-retrieval-response": {"item": {"ai...
4  201800004  2018  {"abstracts-retrieval-response": {"item": {"ai...


In [7]:
#Process raw JSON text in chunks
conn = sqlite3.connect("scopus.db") #Connect to db

df = pd.read_sql_query("""
SELECT 
file_id,
year,

-- Titles/abstracts
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head."citation-title"')                           AS citation_title,
COALESCE(
json_extract(raw_json, '$."abstracts-retrieval-response".item.abstracts'),
json_extract(raw_json, '$."abstracts-retrieval-response".abstracts')
) AS abstracts,

-- Source/publisher
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head.source')                                      AS source,
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head.source.publisher.publishername')              AS publishername,
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head.source.sourcetitle')                          AS sourcetitle,
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head.source.publicationdate')                      AS publicationdate,
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.head.source.publicationdate."date-text"')          AS date_text,

-- Item-info / Itemidlist / ce:doi
json_extract(raw_json, '$."abstracts-retrieval-response".item."item-info"')                                           AS item_info,
json_extract(raw_json, '$."abstracts-retrieval-response".item."item-info"."itemidlist"')                              AS itemidlist,
json_extract(raw_json, '$."abstracts-retrieval-response".item."item-info"."itemidlist"."ce:doi"')                     AS "ce:doi",

-- Tail / bibliography / @refcount
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.tail.bibliography')                                AS bibliography,
json_extract(raw_json, '$."abstracts-retrieval-response".bibrecord.tail.bibliography."@refcount"')                    AS refcount,

-- Coredata / dc:creator / subject-areas (robust to both shapes)
COALESCE(
json_extract(raw_json, '$."abstracts-retrieval-response".item.coredata'),
json_extract(raw_json, '$.coredata')
) AS coredata,
COALESCE(
json_extract(raw_json, '$."abstracts-retrieval-response".item.coredata."dc:creator"'),
json_extract(raw_json, '$.coredata."dc:creator"')
) AS "dc:creator",
COALESCE(
json_extract(raw_json, '$."abstracts-retrieval-response".item.coredata."subject-areas"'),
json_extract(raw_json, '$.coredata."subject-areas"')
) AS subject_areas
FROM papers_raw
ORDER BY year
""",conn
)
print(df.head())
print(df.shape)

     file_id  year citation_title abstracts source publishername sourcetitle  \
0  201800000  2018           None      None   None          None        None   
1  201800001  2018           None      None   None          None        None   
2  201800002  2018           None      None   None          None        None   
3  201800003  2018           None      None   None          None        None   
4  201800004  2018           None      None   None          None        None   

  publicationdate date_text item_info itemidlist ce:doi bibliography refcount  \
0            None      None      None       None   None         None     None   
1            None      None      None       None   None         None     None   
2            None      None      None       None   None         None     None   
3            None      None      None       None   None         None     None   
4            None      None      None       None   None         None     None   

  coredata dc:creator subject_ar