# Extracting the data out of the zip

In [28]:
import zipfile
import json
import sqlite3
import pandas as pd
from datetime import datetime

In [29]:
# conn = sqlite3.connect("scopus.db")
# cur = conn.cursor()

# cur.execute("""
# CREATE TABLE IF NOT EXISTS papers_raw (
#     file_id INTEGER,
#     year INTEGER,
#     raw_json TEXT
# )
# """)

# conn.commit()
# conn.close()

# keep = "ScopusData2018-2023/2018/201800000"
# zip_path = "ScopusData2018-2023.zip"
# years = ["2018", "2019", "2020", "2021", "2022", "2023"]

# def loader(year: int, start_id: int, end_id: int):
#     conn = sqlite3.connect("scopus.db")
#     cur = conn.cursor()

#     with zipfile.ZipFile(zip_path, "r") as z:
#         for file_id in range(start_id, end_id + 1):
#             inner_path = f"ScopusData2018-2023/{year}/{file_id}"
#             try:
#                 with z.open(inner_path) as f:
#                     try:
#                         obj = json.load(f)
#                     except Exception:
#                         continue
#             except KeyError:
#                 continue
#             raw_text = json.dumps(obj, ensure_ascii=False)
#             cur.execute(
#                 """
#                 INSERT INTO papers_raw (file_id, year, raw_json)
#                 VALUES (?, ?, ?)
#                 """,
#                 (file_id, year, raw_text)
#             )
#     conn.commit()
#     conn.close()




In [30]:
# # 2018
# loader(2018, 201800000, 201802761)

# # 2019
# loader(2019, 201900000, 201903081)

# # 2020
# loader(2020, 202000000, 202003392)

# # 2021
# loader(2021, 202100000, 202103814)

# # 2022
# loader(2022, 202200000, 202204243)

# # 2023
# loader(2023, 202300000, 202302889)

# Run the first two part of the code first to create a Database and the reason for this is because its much faster than saving in a list or dictionary. (I don't have to load 5.63 GB from extracting the zip file)

In [31]:
# conn = sqlite3.connect("scopus.db")

# df_2018 = pd.read_sql_query(
#     """
#     SELECT file_id, year, raw_json
#     FROM papers_raw
#     WHERE year = 2018
#     """,
#     conn
# )

# conn.close()

# print(df_2018.shape)
# print(df_2018.head())

In [32]:
# title = test_data["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["citation-title"]
# class_code = test_data["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["enhancement"]["classificationgroup"]["classifications"]
# date_of_publication = test_data["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["source"]["publicationdate"]
# affiliations = test_data["abstracts-retrieval-response"]["affiliation"]
# authors = test_data["abstracts-retrieval-response"]["authors"]
# author_groups = test_data["abstracts-retrieval-response"]["item"]["bibrecord"]["head"]["author-group"]
# reference = test_data["abstracts-retrieval-response"]["item"]["bibrecord"]["tail"]["bibliography"]["reference"]
# coredata = test_data["abstracts-retrieval-response"]["coredata"]

In [33]:
import os, sqlite3, json, zipfile, pandas as pd
from datetime import datetime

DB_PATH   = os.path.abspath("scopus.db")                 # <- single source of truth
ZIP_PATH  = os.path.abspath("ScopusData2018-2023.zip")   # <- your zip
ZIP_ROOT  = "ScopusData2018-2023"                        # folder name inside zip
YEARS     = [2018, 2019, 2020, 2021, 2022, 2023]

In [34]:
# Create tables once
with sqlite3.connect(DB_PATH) as conn:
    conn.execute("PRAGMA journal_mode=WAL;")
    conn.execute("""
    CREATE TABLE IF NOT EXISTS papers_raw(
        file_id   INTEGER PRIMARY KEY,
        year      INTEGER,
        raw_json  TEXT
    )""")
    conn.execute("""
    CREATE TABLE IF NOT EXISTS papers_clean(
        file_id INTEGER PRIMARY KEY,
        year INTEGER,
        title TEXT,
        abstract_text TEXT,
        pub_year INTEGER, pub_month INTEGER, pub_day INTEGER, pub_date_iso TEXT,
        keywords TEXT,
        classification_codes_json TEXT,
        affiliations_top_json TEXT,
        affiliations_groups_json TEXT,
        references_json TEXT
    )""")


In [35]:
def load_raw_year_from_zip(year: int):
    rows = []
    prefix = f"{ZIP_ROOT}/{year}/"     # e.g., ScopusData2018-2023/2018/

    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        for member in z.namelist():
            if not member.startswith(prefix): 
                continue
            if member.endswith("/"):   # skip directories
                continue

            file_id_str = os.path.basename(member)  # e.g., "201800123"
            if not file_id_str.isdigit():
                continue
            file_id = int(file_id_str)

            try:
                with z.open(member) as f:
                    obj = json.load(f)
            except Exception:
                continue

            raw_text = json.dumps(obj, ensure_ascii=False)
            rows.append((file_id, year, raw_text))

    if rows:
        with sqlite3.connect(DB_PATH) as conn:
            conn.executemany(
                "INSERT OR REPLACE INTO papers_raw(file_id, year, raw_json) VALUES (?, ?, ?)",
                rows
            )
    return len(rows)

# Example: load all years’ raw rows
for yr in YEARS:
    n = load_raw_year_from_zip(yr)
    print(f"Loaded raw {yr}: {n} rows")


KeyboardInterrupt: 

In [None]:
import json
from datetime import datetime

def extract_fields(paper):
    ar = paper["abstracts-retrieval-response"]

    item_block = ar["item"]
    bibrec = item_block["bibrecord"]
    head = bibrec["head"]
    tail = bibrec["tail"] if ("tail" in bibrec and bibrec["tail"]) else {}
    core = ar["coredata"]

    # -------------------------
    # 1) Title + Abstract
    # -------------------------
    title = head["citation-title"]

    abstract_text = None
    if "abstracts" in head and head["abstracts"]:
        abs_block = head["abstracts"]
        if type(abs_block) == str:
            txt = abs_block.strip()
            abstract_text = txt if txt else None
        elif "abstract" in abs_block and abs_block["abstract"]:
            abs_list = abs_block["abstract"]
            if type(abs_list) == dict:
                abs_list = [abs_list]
            parts = []
            for entry in abs_list:
                if type(entry) == dict and "$" in entry:
                    parts.append(entry["$"])
                elif type(entry) == str:
                    parts.append(entry)
            if parts:
                abstract_text = " ".join(parts)

    # -------------------------
    # 2) Classification codes
    # -------------------------
    classification_codes = []
    if ("enhancement" in head and
        "classificationgroup" in head["enhancement"] and
        "classifications" in head["enhancement"]["classificationgroup"]):
        class_list = head["enhancement"]["classificationgroup"]["classifications"]
        if type(class_list) == dict:
            class_list = [class_list]
        for c in class_list:
            code_type = c["@type"] if "@type" in c else None
            code_val  = c["classification"] if "classification" in c else None
            classification_codes.append({"type": code_type, "code": code_val})

    # -------------------------
    # 3) Publication date (+ datetime)
    # -------------------------
    publication_date = {"year": None, "month": None, "day": None, "text": None}
    publication_date_dt = None
    if ("source" in head and "publicationdate" in head["source"]):
        pub = head["source"]["publicationdate"]
        y = pub["year"]  if "year"  in pub else None
        m = pub["month"] if "month" in pub else None
        d = pub["day"]   if "day"   in pub else None
        t = pub["date-text"]["$"] if "date-text" in pub and "$" in pub["date-text"] else None
        publication_date = {"year": y, "month": m, "day": d, "text": t}
        if y and m and d:
            publication_date_dt = datetime(int(y), int(m), int(d))

    # -------------------------
    # 4A) Affiliations (paper-level)
    # -------------------------
    affiliations_top = []
    if "affiliation" in ar and ar["affiliation"]:
        aff_list = ar["affiliation"]
        if type(aff_list) == dict:
            aff_list = [aff_list]
        for aff in aff_list:
            affilname = aff["affilname"] if "affilname" in aff else None
            city      = aff["affiliation-city"] if "affiliation-city" in aff else None
            country   = aff["affiliation-country"] if "affiliation-country" in aff else None
            affiliations_top.append({"affilname": affilname, "city": city, "country": country})

    # -------------------------
    # 4B) Affiliations (author-group with org + authors)
    # -------------------------
    affiliations_groups = []
    if "author-group" in head and head["author-group"]:
        ag_list = head["author-group"]
        if type(ag_list) == dict:
            ag_list = [ag_list]
        for grp in ag_list:
            aff_block = grp["affiliation"] if "affiliation" in grp else {}
            city    = aff_block["city"]    if "city"    in aff_block else None
            state   = aff_block["state"]   if "state"   in aff_block else None
            country = aff_block["country"] if "country" in aff_block else None

            org_names = []
            if "organization" in aff_block:
                org_field = aff_block["organization"]
                if type(org_field) in (dict, str):
                    org_list = [org_field]
                else:
                    org_list = org_field
                for org in org_list:
                    if type(org) == dict and "$" in org:
                        org_names.append(org["$"])
                    elif type(org) == str:
                        org_names.append(org)

            grp_author_names = []
            if "author" in grp and grp["author"]:
                grp_authors = grp["author"]
                if type(grp_authors) == dict:
                    grp_authors = [grp_authors]
                for ga in grp_authors:
                    pref = ga["preferred-name"] if "preferred-name" in ga else {}
                    if "ce:given-name" in pref:
                        given = pref["ce:given-name"]
                    elif "ce:given-name" in ga:
                        given = ga["ce:given-name"]
                    else:
                        given = None
                    if "ce:surname" in pref:
                        surname = pref["ce:surname"]
                    elif "ce:surname" in ga:
                        surname = ga["ce:surname"]
                    else:
                        surname = None
                    if "ce:indexed-name" in pref:
                        indexed = pref["ce:indexed-name"]
                    elif "ce:indexed-name" in ga:
                        indexed = ga["ce:indexed-name"]
                    else:
                        indexed = None
                    if given and surname:
                        name_out = f"{given} {surname}"
                    elif indexed:
                        name_out = indexed
                    else:
                        name_out = "Unknown"
                    grp_author_names.append(name_out)

            affiliations_groups.append({
                "city": city, "state": state, "country": country,
                "organizations": org_names, "authors_in_group": grp_author_names
            })

    # -------------------------
    # 5) References
    # -------------------------
    references = []
    if tail and "bibliography" in tail and "reference" in tail["bibliography"]:
        refs_list = tail["bibliography"]["reference"]
        if type(refs_list) == dict:
            refs_list = [refs_list]
        for r in refs_list:
            ref_info = r["ref-info"] if "ref-info" in r else {}
            if "ref-title" in ref_info and "ref-titletext" in ref_info["ref-title"]:
                ref_title = ref_info["ref-title"]["ref-titletext"]
            else:
                ref_title = None
            if "ref-publicationyear" in ref_info and "@first" in ref_info["ref-publicationyear"]:
                ref_year = ref_info["ref-publicationyear"]["@first"]
            else:
                ref_year = None
            full_cite = r["ref-fulltext"] if "ref-fulltext" in r else None
            references.append({"title": ref_title, "year": ref_year, "full_citation": full_cite})

    # -------------------------
    # 6) Keywords
    # -------------------------
    raw_keywords = core["authkeywords"] if "authkeywords" in core else None
    keywords = [kw.strip() for kw in raw_keywords.split(";")] if (type(raw_keywords) == str) else None

    return {
        "title": title,
        "abstract_text": abstract_text,
        "classification_codes": classification_codes,
        "publication_date": publication_date,
        "publication_date_dt": publication_date_dt,
        "affiliations_top": affiliations_top,
        "affiliations_groups": affiliations_groups,
        "references": references,
        "keywords": keywords,
    }


In [None]:
# test_data = json.loads(df_2018["raw_json"][0])
# info = extract_fields_simple2(test_data)

# print("TITLE:", info["title"])
# print("ABSTRACT:", info["abstract_text"])
# print("CLASS:", info["classification_codes"])

# print("PUB DATE FIELDS:", info["publication_date"])
# print("PUB DATE DATETIME:", info["publication_date_dt"])

# print("AFFILIATIONS_TOP:", info["affiliations_top"])
# print("AFFILIATIONS_GROUPS:", info["affiliations_groups"])
# print("KEYWORDS:", info["keywords"])

# print("NUM REFS:", len(info["references"]))
# print("FIRST REF:", info["references"][0] if info["references"] else None)


NameError: name 'df_2018' is not defined

In [None]:
# conn = sqlite3.connect("scopus_new.db")
# cur = conn.cursor()

# cur.execute("""
# CREATE TABLE IF NOT EXISTS papers_clean (
#     file_id INTEGER PRIMARY KEY,
#     year INTEGER,
#     title TEXT,
#     abstract_text TEXT,
#     pub_year INTEGER,
#     pub_month INTEGER,
#     pub_day INTEGER,
#     pub_date_iso TEXT,
#     keywords TEXT,
#     classification_codes_json TEXT,
#     affiliations_top_json TEXT,
#     affiliations_groups_json TEXT,
#     references_json TEXT
# )
# """)

# conn.commit()
# conn.close()


In [None]:
def make_row_for_db(file_id_val, year_val, info):
    pub = info["publication_date"]
    iso = info["publication_date_dt"].strftime("%Y-%m-%d") if info["publication_date_dt"] else None
    kws = "; ".join(info["keywords"]) if info["keywords"] else None

    return (
        int(file_id_val), int(year_val),
        info["title"], info["abstract_text"],
        pub["year"], pub["month"], pub["day"], iso,
        kws,
        json.dumps(info["classification_codes"], ensure_ascii=False),
        json.dumps(info["affiliations_top"],    ensure_ascii=False),
        json.dumps(info["affiliations_groups"], ensure_ascii=False),
        json.dumps(info["references"],          ensure_ascii=False)
    )


In [None]:
def load_year_into_clean_table(target_year: int):
    with sqlite3.connect(DB_PATH) as conn:
        df_year = pd.read_sql_query(
            "SELECT file_id, year, raw_json FROM papers_raw WHERE year = ? ORDER BY file_id",
            conn, params=(target_year,)
        )

        rows = []
        for _, r in df_year.iterrows():
            paper_dict = json.loads(r["raw_json"])
            info = extract_fields(paper_dict)   # uses your strict parser
            rows.append(make_row_for_db(r["file_id"], r["year"], info))

        if rows:
            conn.executemany("""
                INSERT OR REPLACE INTO papers_clean(
                    file_id, year, title, abstract_text,
                    pub_year, pub_month, pub_day, pub_date_iso,
                    keywords, classification_codes_json,
                    affiliations_top_json, affiliations_groups_json, references_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, rows)

# Example: build clean table for all years
for yr in YEARS:
    load_year_into_clean_table(yr)


In [None]:
# def load_year_into_clean_table(target_year):
#     # 1. pull raw rows for that year
#     conn = sqlite3.connect("scopus.db")
#     df_year = pd.read_sql_query(
#         """
#         SELECT file_id, year, raw_json
#         FROM papers_raw
#         WHERE year = ?
#         ORDER BY file_id
#         """,
#         conn,
#         params=(target_year,)
#     )

#     # 2. build rows for insert
#     rows_to_insert = []
#     for i in range(len(df_year)):
#         file_id_val = df_year.loc[i, "file_id"]
#         year_val    = df_year.loc[i, "year"]
#         raw_txt     = df_year.loc[i, "raw_json"]

#         paper_dict = json.loads(raw_txt)
#         info = extract_fields_simple(paper_dict)  # uses the function we built

#         row_tuple = make_row_for_db(file_id_val, year_val, info)
#         rows_to_insert.append(row_tuple)

#     # 3. insert into papers_clean
#     cur = conn.cursor()

#     cur.executemany(
#         """
#         INSERT OR REPLACE INTO papers_clean (
#             file_id,
#             year,
#             title,
#             abstract_text,
#             pub_year,
    #         pub_month,
    #         pub_day,
    #         pub_date_iso,
    #         keywords,
    #         classification_codes_json,
    #         affiliations_top_json,
    #         affiliations_groups_json,
    #         references_json
    #     )
    #     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    #     """,
    #     rows_to_insert
    # )

    # conn.commit()
    # conn.close()


In [39]:
import sqlite3

df_all = pd.read_sql_query(
    "SELECT * FROM papers_clean ORDER BY file_id",
    conn
)
df_all
# import json

# row = df_all.iloc[0]
# codes = json.loads(row["classification_codes_json"]) if row["classification_codes_json"] else []
# codes 
# [c["code"] for c in codes if "code" in c] 




Unnamed: 0,file_id,year,title,abstract_text,pub_year,pub_month,pub_day,pub_date_iso,keywords,classification_codes_json,affiliations_top_json,affiliations_groups_json,references_json
0,201800000,2018,Public health and international epidemiology f...,,2018,12.0,31.0,2018-12-31,,"[{""type"": ""ASJC"", ""code"": ""2700""}, {""type"": ""S...","[{""affilname"": ""Stanford University School of ...","[{""city"": ""Bangkok"", ""state"": null, ""country"":...","[{""title"": ""The untilled fields of public heal..."
1,201800001,2018,Flexible Printed Active Antenna for Digital Te...,"© 2018 The Institute of Electronics, Informati...",2018,12.0,31.0,2018-12-31,,"[{""type"": ""ASJC"", ""code"": [{""$"": ""2208""}, {""$""...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Patumwan, Bangkok"", ""state"": null, ...","[{""title"": ""Development of built-in low-profil..."
2,201800002,2018,Parametric study of hydrogen production via so...,© 2018 Elsevier LtdComputational fluid dynamic...,2018,12.0,31.0,2018-12-31,,"[{""type"": ""CPXCLASS"", ""code"": [{""classificatio...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Bangkok"", ""state"": null, ""country"":...","[{""title"": ""Capture of CO2from combustion gase..."
3,201800003,2018,Superhydrophobic coating from fluoroalkylsilan...,© 2018 Elsevier B.V. A superhydrophobic/supero...,2018,12.0,31.0,2018-12-31,,"[{""type"": ""CPXCLASS"", ""code"": [{""classificatio...","[{""affilname"": ""Hirosaki University"", ""city"": ...","[{""city"": ""Pathumthani"", ""state"": null, ""count...","[{""title"": ""Ceramic membrane performance in mi..."
4,201800004,2018,Electrochemical impedance-based DNA sensor usi...,© 2018 Elsevier B.V. A label-free electrochemi...,2018,12.0,31.0,2018-12-31,,"[{""type"": ""EMCLASS"", ""code"": {""classification-...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Bangkok"", ""state"": null, ""country"":...","[{""title"": ""The diagnosis and misdiagnosis of ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20211,202302885,2023,Long-chain bio-olefins production via oxidativ...,© 2021 Elsevier B.V.Long-chain α-olefins (≥ C1...,2023,1.0,1.0,2023-01-01,,"[{""type"": ""CPXCLASS"", ""code"": [{""classificatio...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Bangkok"", ""state"": null, ""country"":...","[{""title"": ""The chemistry and kinetics of poly..."
20212,202302886,2023,Recent Developments and Applications of Microf...,"© 2021 Taylor & Francis Group, LLC.Nowadays, f...",2023,,,,,"[{""type"": ""CPXCLASS"", ""code"": [{""classificatio...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Bangkok"", ""state"": null, ""country"":...","[{""title"": null, ""year"": ""2021"", ""full_citatio..."
20213,202302887,2023,"Social justice, education and peacebuilding: c...",© 2021 The Author(s). Published by Informa UK ...,2023,,,,,"[{""type"": ""ASJC"", ""code"": ""3304""}, {""type"": ""S...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""London"", ""state"": null, ""country"": ...","[{""title"": ""The Rehabilitation of Jemaah Islam..."
20214,202302888,2023,Effects of black soldier fly (Hermetia illucen...,© 2021 Taylor & Francis.The effects of replaci...,2023,,,,,"[{""type"": ""GEOCLASS"", ""code"": {""classification...","[{""affilname"": ""Chulalongkorn University"", ""ci...","[{""city"": ""Nong Khai Province"", ""state"": null,...","[{""title"": ""Effect of dietary carbohydrate to ..."
