In [5]:
import polars as pl
import importlib
import numpy as np
import lyss
importlib.reload(lyss)
from lyss import get_keywords_cooccurrences
from itertools import combinations
import os
def df(name: str) -> str:
    """Return the path of the parquet file called ``name`` in the ``dataframes`` folder.

    Args:
        name: File stem, without the ``.parquet`` extension.

    Returns:
        ``dataframes/<name>.parquet``, joined with the platform's path separator.
    """
    filename = name + ".parquet"
    return os.path.join("dataframes", filename)
# Excel workbook holding the dataset; get_papers() reads its "Papers" sheet.
bim_dataset_path = "bim_dataset.xlsx"

## Papers

In [25]:
def get_papers() -> pl.DataFrame:
    """Load the "Papers" sheet of the dataset workbook and tidy it up.

    The unnamed first column is renamed to ``Id``, ``Year`` is read as
    ``UInt16``, the last 12 columns of the sheet are discarded, and the
    ``Keywords`` column (a ``;``-separated string) becomes a list of
    stripped, lower-cased categorical values.

    Returns:
        The cleaned papers table.
    """
    trailing_cols_to_discard = 12

    sheet = pl.read_excel(
        bim_dataset_path,
        sheet_name="Papers",
        engine="calamine",
        schema_overrides={"Year": pl.UInt16},
    )
    sheet = sheet.rename({"__UNNAMED__0": "Id"})

    discarded = sheet.columns[-trailing_cols_to_discard:]
    trimmed = sheet.drop(discarded)

    # Applied to every individual keyword of a paper.
    clean_keyword = pl.element().str.strip_chars().str.to_lowercase()

    # The cast to Categorical is evaluated while the string cache is active.
    with pl.StringCache():
        return trimmed.with_columns(
            pl.col("Keywords")
            .str.split(";")
            .list.eval(clean_keyword)
            .cast(pl.List(pl.Categorical))
        )
papers = get_papers()
# The parquet writer is not asked to create missing parent directories, so make
# sure the output folder exists before the first write (it may be absent on a
# fresh checkout, which would break Restart & Run All).
os.makedirs(os.path.dirname(df("papers")), exist_ok=True)
papers.write_parquet(df("papers"))
papers.head()

Id,Author,Subject,Publication,Volume,Year,Others,Others_1,Others_2,Others_3,Abstract,Keywords,nbkw,KW1,KW2,KW3,KW4,KW5,KW6,KW7,KW8,KW9,KW10,KW11,KW12,KW13,KW14,KW15,KW16,KW17,KW18,KW19,KW20,KW21,KW22,KW23,KW24,KW25,KW26,KW27,KW28,KW29,KW30,KW31,KW32,KW33,KW34,KW35,KW36,KW37,KW38,KW39,KW40,KW41,KW42,KW43,KW44,KW45,KW46,KW47,KW48,KW49,KW50,KW51,KW52,KW53,KW54,KW55,KW56
i64,str,str,str,str,u16,str,str,str,str,str,list[cat],i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null,null
1,"""Lu Huang, Hongfeng Zhang, Hong…","""Can the digital economy promot…","""Ecological Indicators""","""Volume 155,""",2023,"""110977,""","""ISSN 1470-160X,""","""https://doi.org/10.1016/j.ecol…","""(https://www.sciencedirect.com…","""Abstract: The development of t…","[""green economic efficiency (gee)"", ""digital economy"", … ""china""]",6,"""Green economic efficiency (GEE…","""Digital economy""","""Super-efficient SBM""","""Transmission mechanism""","""Urban level""","""China""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,"""Zhou Zou, Munir Ahmad,""","""Economic digitalization and en…","""Ecological Informatics""","""Volume 78,""",2023,"""102323,""","""ISSN 1574-9541,""","""https://doi.org/10.1016/j.ecoi…","""(https://www.sciencedirect.com…","""Abstract: This study explores …","[""economic digitalization"", ""energy transition"", … ""g7 countries""]",6,"""Economic digitalization""","""Energy transition""","""Green industrial development""","""Carbon productivity""","""Sustainable development""","""G7 countries""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,"""Satu Paiho, Nina Wessberg, Mar…","""Twin transition in the built e…","""Sustainable Cities and Society""","""Volume 98,""",2023,"""104870,""","""ISSN 2210-6707,""","""https://doi.org/10.1016/j.scs.…","""(https://www.sciencedirect.com…","""Abstract: This paper studies t…","[""buildings"", ""green and digital transition"", … ""markets""]",5,"""Buildings""","""Green and digital transition""","""Regulations""","""Technologies""","""Markets""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,"""Awadesh Kumar Mallik,""","""The future of the technology-b…","""Results in Engineering""","""Volume 19,""",2023,"""101356,""","""ISSN 2590-1230,""","""https://doi.org/10.1016/j.rine…","""(https://www.sciencedirect.com…","""Abstract: The manufacturing in…","[""manufacturing"", ""eu"", … ""management""]",5,"""Manufacturing""","""EU""","""Carbon-neutrality""","""Digitalization""","""Management""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,"""Naomi A. Ubina, Hsun-Yu Lan, S…","""Digital twin-based intelligent…","""Smart Agricultural Technology""","""Volume 5,""",2023,"""100285,""","""ISSN 2772-3755,""","""https://doi.org/10.1016/j.atec…","""(https://www.sciencedirect.com…","""Abstract: This paper focuses o…","[""big data"", ""big data analytics"", … ""aiot system""]",5,"""Big data""","""Big data analytics""","""Digital twins""","""Smart aquaculture""","""AIoT system""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""","""""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Keyword pairs

In [92]:
c_kw = "Keywords"
c_kw_pairs = "kw_pairs"


# Lazy pipeline producing one row per unordered keyword pair, together with the
# ids of the papers in which both keywords appear and the number of such papers.
q = (
    pl.scan_parquet(df("papers"))
    .filter(~pl.col(c_kw).list.eval(pl.element() == "_").list.all())  # remove papers without keywords
    .select(
        pl.col("Id").alias("paper_id"),
        # Every 2-combination of a paper's keywords, built row by row in Python.
        pl.col(c_kw).alias(c_kw_pairs).map_elements(
            lambda a: list(map(tuple, combinations(a, 2))),
            return_dtype=pl.List(pl.List(pl.String))
        ).cast(pl.List(pl.List(pl.Categorical)))
    )
    .explode(c_kw_pairs)
    # A paper with a single keyword has no pair at all: its empty list explodes
    # to a null row, which would otherwise end up as a bogus null "pair" group
    # (counted, and included in the max used for the colour scale).
    .drop_nulls(c_kw_pairs)
    .with_columns(
        # Sort inside each pair so that (a, b) and (b, a) land in the same group.
        pl.col(c_kw_pairs)
        .list.sort()
    )
    .group_by(c_kw_pairs).agg(
        pl.col("paper_id").alias("paper_ids"),
        pl.col(c_kw_pairs).alias("count").len()
    )
    .sort("count", descending=True)
    .with_columns(
        # Rescale counts to [0, 1]: 0 for a pair seen once, 1 for the most
        # frequent pair, i.e. (count - 1) / (max_count - 1).
        # NOTE(review): if every pair occurs exactly once the denominator is 0.
        pl.col("count")
        .sub(1)
        .truediv(pl.col("count").max().sub(1))
        .alias("color")
    )
)

# q.show_graph(optimized=False)
# q.show_graph(optimized=True)
# Collect under the string cache, matching how the Keywords column is cast to
# Categorical when the papers table is built.
with pl.StringCache():
    res = q.collect()
res.write_parquet(df("kw_pairs"))
res

kw_pairs,paper_ids,count,color
list[cat],list[i64],u32,f64
"[""digitalization"", ""sustainability""]","[41, 115, … 443]",8,1.0
"[""blockchain"", ""sustainability""]","[6, 21, … 288]",7,0.857143
"[""sustainability"", ""industry 4.0""]","[95, 233, … 419]",7,0.857143
"[""sustainability"", ""smart city""]","[21, 146, … 502]",6,0.714286
"[""sustainability"", ""innovation""]","[95, 236, … 429]",6,0.714286
…,…,…,…
"[""additive manufacturing"", ""load-responsive""]",[494],1,0.0
"[""stakeholder engagement"", ""ecological design""]",[415],1,0.0
"[""blockchain"", ""brute force attack""]",[157],1,0.0
"[""informal transport"", ""digital co-production""]",[55],1,0.0


## Keywords

In [110]:
# One row per distinct keyword: the ids of the papers using it and how many
# papers that is, most frequent keyword first.
with pl.StringCache():
    keywords = (
        pl.scan_parquet(df("papers"))
        .select(
            pl.col("Id").alias("paper_id"),
            pl.col("Keywords").alias("keyword"),
        )
        .explode("keyword")
        .group_by("keyword")
        .agg(
            pl.col("paper_id"),
            pl.col("keyword").len().alias("count"),
        )
        # "_" is the placeholder used for rows without keywords.
        .filter(pl.col("keyword").ne("_"))
        .sort(pl.col("count"), descending=True)
        .collect()
    )
keywords.write_parquet(df("keywords"))
keywords

keyword,paper_id,count
cat,list[i64],u32
"""sustainability""","[6, 10, … 541]",67
"""smart city""","[21, 51, … 502]",37
"""digitalization""","[4, 31, … 443]",29
"""internet of things""","[13, 15, … 477]",25
"""smart cities""","[35, 67, … 497]",25
…,…,…
"""site specificity""",[338],1
"""heritage building preservation""",[30],1
"""community energy storage""",[303],1
"""smart irrigation system""",[374],1


In [524]:
from graph_tool.all import Graph, graph_draw, GraphView

# Reload the tables written by the previous sections so this cell only depends
# on the parquet files, not on leftover kernel variables.
kw_pairs = pl.read_parquet(df("kw_pairs"))
keywords = pl.read_parquet(df("keywords"))

# Undirected co-occurrence graph: one vertex per keyword, one edge per keyword pair.
g = Graph(directed=False)
# The pair column is written as "kw_pairs" (lower case). Null rows are dropped
# and the column is converted to plain Python [source, target] string pairs
# before being handed to graph-tool, which hashes the strings to vertices.
v_text = g.add_edge_list(
    kw_pairs.get_column("kw_pairs").drop_nulls().to_list(),
    hashed=True,
    hash_type="string",
)
# Reverse lookup: keyword text -> vertex index.
vmapping = {v_text[i]: i for i in range(g.num_vertices())}

In [517]:
# Store the vertex labels returned by add_edge_list as the graph's "names"
# vertex property (presumably so they are kept with the graph when it is saved — confirm).
g.vp["names"] = v_text

In [512]:
# Draw a filtered view of the graph: only vertices for which `i < 100` holds
# (presumably the first 100 vertices by index — confirm), each labelled with
# its keyword text from v_text.
gv = GraphView(g, vfilt=lambda i: i < 100)
graph_draw(
    gv,
    inline=False,
    vertex_text=v_text,
    vertex_text_position=0,
    vertex_text_color="black"
    # output="kw_pairs.pdf"
)
# str(g.vertex(0))

(<VertexPropertyMap object with value type 'vector<double>', for Graph 0x7f42b356a840, at 0x7f42f832d160>,
 <VertexPropertyMap object with value type 'bool', for Graph 0x7f42b356a840, at 0x7f431ad15d30>)

In [518]:
# Write the full (unfiltered) graph to disk; the ".graphml" file name
# presumably selects the GraphML format — confirm.
g.save("keywords_pairs.graphml")