In [194]:
import polars as pl
import importlib
import numpy as np
import lyss
importlib.reload(lyss)
from lyss import get_papers, get_keywords_cooccurrences
from itertools import combinations

In [274]:
import os
def df(name: str) -> str:
    """Return the location of ``name`` inside the local ``dataframes`` directory.

    Args:
        name: File name (or relative path) of the stored dataframe.

    Returns:
        ``name`` joined onto the ``dataframes`` directory with the platform's
        path separator.
    """
    base_dir = "dataframes"
    return os.path.join(base_dir, name)

In [275]:
# Fetch the papers table through lyss.get_papers() and store it as the "papers"
# file under the dataframes directory; later cells read it back with scan_parquet.
get_papers().write_parquet(df("papers"))

## Keyword pairs

In [360]:
c_kw = "Keywords"
c_kw_pairs = "KW_pairs"

# Largest pair count minus one: the denominator used to rescale counts to [0, 1].
count_span = pl.col("count").max().sub(1)

q = (
    pl.scan_parquet(df("papers"))
    # Drop papers whose keyword list holds nothing but the "_" placeholder
    # (i.e. papers without keywords).
    .filter(~pl.col(c_kw).list.eval(pl.element() == "_").list.all())
    .select(
        pl.col(c_kw).alias(c_kw_pairs).map_elements(
            # Every unordered pair of keywords that appear on the same paper.
            lambda a: list(map(tuple, combinations(a, 2))),
            return_dtype=pl.List(pl.List(pl.String)),
        )
        # .cast(pl.List(pl.Array(pl.String, 2)))  # can't convert to array because value counts isn't supported yet
        .list.explode()  # one row per keyword pair
        .list.sort()  # canonical order so (a, b) and (b, a) are counted together
        .value_counts()
    )
    # Use the same constant as the alias above instead of repeating the literal.
    .unnest(c_kw_pairs)
    .sort("count", descending=True)
    .with_columns(
        # Rescale counts so that count == 1 maps to 0.0 and the maximum maps to 1.0.
        # When no pair occurs more than once the span is 0; fall back to 0.0
        # instead of producing NaN from 0 / 0.
        pl.when(count_span > 0)
        .then(pl.col("count").sub(1).truediv(count_span))
        .otherwise(0.0)
        .alias("color")
    )
)

# q.show_graph(optimized=True)  # uncomment to inspect the query plan
res = q.collect()
res.write_parquet(df("kw_pairs"))
res

KW_pairs,count,color
list[str],u32,f64
"[""digitalization"", ""sustainability""]",8,1.0
"[""industry 4.0"", ""sustainability""]",7,0.857143
"[""blockchain"", ""sustainability""]",7,0.857143
"[""innovation"", ""sustainability""]",6,0.714286
"[""smart city"", ""sustainability""]",6,0.714286
…,…,…
"[""qualifications"", ""work architectures""]",1,0.0
"[""dynamic transit bus"", ""vehicle routing problem""]",1,0.0
"[""agricultural terraces"", ""very high resolution images""]",1,0.0
"[""new urbanism"", ""smart city""]",1,0.0


## Keywords

In [369]:
# Keyword counts read from the "Keywords_Counts" sheet, lower-cased,
# de-duplicated on the keyword and listed from most to least frequent.
# NOTE(review): `unique` keeps a single row per lower-cased keyword; if the sheet
# contains case variants with different counts, confirm which row should survive.
(
    pl.read_excel("bim_dataset.xlsx", sheet_name="Keywords_Counts")
    .select(pl.col("Keyword").str.to_lowercase(), "Count")
    .unique("Keyword")
    .sort("Count", descending=True)
)

Keyword,Count
str,i64
"""sustainability""",67
"""smart city""",37
"""digitalization""",29
"""smart cities""",25
"""internet of things""",25
…,…
"""knowledge""",1
"""hvac control""",1
"""e-tools""",1
"""urban future""",1


In [415]:
# Count how often each keyword occurs across all papers, most frequent first.
keywords = (
    pl.scan_parquet(df("papers"))
    # value_counts() yields a struct column with fields "keyword" and "count".
    .select(pl.col("Keywords").alias("keyword").list.explode().value_counts())
    # After unnest, "keyword" is a plain string column and "count" sits next to it.
    .unnest("keyword")
    # Compare the string column directly; it is no longer a struct at this point.
    .filter(pl.col("keyword") != "_")  # removes rows without keywords
    .sort("count", descending=True)
    .collect()
)
keywords.write_parquet(df("keywords"))
keywords

keyword,count
str,u32
"""sustainability""",67
"""smart city""",37
"""digitalization""",29
"""internet of things""",25
"""smart cities""",25
…,…
"""fishers' knowledge""",1
"""citizen-administration relatio…",1
"""communications technology""",1
"""long-term effect""",1


In [515]:
from graph_tool.all import Graph, graph_draw, GraphView

# Load the tables written by the earlier cells.
kw_pairs = pl.read_parquet(df("kw_pairs"))
keywords = pl.read_parquet(df("keywords"))

# Undirected graph with one edge per keyword pair. With hashed=True the string
# endpoints are mapped to vertex indices and the returned property map
# (v_text) gives the keyword string of each vertex.
g = Graph(directed=False)
v_text = g.add_edge_list(kw_pairs.get_column("KW_pairs"), hashed=True, hash_type="string")

# Reverse lookup: keyword string -> vertex index.
vmapping = {v_text[i]: i for i in range(g.num_vertices())}

In [517]:
# Register v_text on the graph under the vertex-property name "names"
# (presumably so the labels are included when the graph is saved - confirm).
g.vp["names"] = v_text

In [512]:
# Draw only the vertices that satisfy `vertex < 100`, labelled with their keyword text.
draw_kwargs = dict(
    inline=False,
    vertex_text=v_text,
    vertex_text_position=0,
    vertex_text_color="black",
    # output="kw_pairs.pdf",
)
gv = GraphView(g, vfilt=lambda vertex: vertex < 100)
graph_draw(gv, **draw_kwargs)

(<VertexPropertyMap object with value type 'vector<double>', for Graph 0x7f42b356a840, at 0x7f42f832d160>,
 <VertexPropertyMap object with value type 'bool', for Graph 0x7f42b356a840, at 0x7f431ad15d30>)

In [518]:
# Write the keyword-pair graph to "keywords_pairs.graphml" in the working directory.
g.save("keywords_pairs.graphml")