In [None]:
################################################################################
# 0. Environment setup (required library installation) - Jupyter Notebook
#
# 0. 환경 구성 (필요 라이브러리 설치) - Jupyter Notebook 기준
################################################################################

#If already installed, you don't need to run the commands below.
#Uncomment according to whether you're on Colab or local.

# 만약 이미 설치되어 있다면, 아래 명령은 실행하지 않아도 됩니다.
# Colab/로컬 여부에 따라 주석 해제하여 사용하세요.

!pip install --upgrade plotly kaleido
!pip install --upgrade pip

# Install llama-cpp-python (CMake options can be adjusted)
# llama-cpp-python 설치 (CMake 옵션 조정 가능)
!pip install --force-reinstall --no-cache-dir "llama-cpp-python"

# BERTopic, hdbscan, umap, sentence-transformers, datasets, etc.
!pip install bertopic datasets
!pip install jinja2 # For token-level distribution visualization # 토큰 단위 분포 시각화용
!pip install spacy spacy-transformers en-core-web-trf
!python -m spacy download en_core_web_trf

# (Optional) When using GPU acceleration (CUDF, cuML, etc.):
# (옵션) GPU 가속(CUDF, cuML 등) 사용 시:
!pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
!pip install cupy-cuda12x -f https://pip.cupy.dev/aarch64

# (Optional) Example usage of datamapplot (additional visualization library)
# (옵션) datamapplot (추가 시각화 라이브러리) 사용 예시
!git clone https://github.com/TutteInstitute/datamapplot.git
!pip install datamapplot/.

In [None]:
################################################################################
# 0. Environment setup (Quantized LLM model download) - save to /model folder after download
#
# 0. 환경 구성 (Qauntized LLM Model 다운로드) - 다운로드 후 /model 폴더에 저장 
################################################################################

#!wget https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q4_K_M.gguf

In [1]:

################################################################################
# 1. import and spaCy pipeline loading
#
# 1. import 및 spaCy 파이프라인 로드
################################################################################

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # prevent tokenizer warning # tokenizer warning 방지

import spacy

# Load en_core_web_trf model (needs to be installed beforehand)
# en_core_web_trf 모델 로드 (사전에 설치 필요)
nlp = spacy.load("en_core_web_trf")

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)       
pd.set_option('display.max_columns', None)    
pd.set_option('display.max_colwidth', None)  

import plotly
import plotly.express as px

from datasets import load_dataset
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# BERTopic related
# BERTopic 관련
from bertopic.representation import KeyBERTInspired, LlamaCPP
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer


In [2]:

################################################################################
# 2. Custom tokenizer definition based on spaCy (can be added or modified)
#    - en_core_web_trf + domain_stopwords + default stop/punct/lemma
#
# 2. spaCy 기반 커스텀 토크나이저 정의 (추가 및 수정 가능)
#    - en_core_web_trf + domain_stopwords + 기본 stop/punct/lemma
################################################################################

domain_stopwords = {
    "set", "none", "nothing", "unidentified", "not", "unvisible",
    "there", "was", "is", "in", "of", "feature", "building", "fa?ade", 
    "structure", "context", "element", "architectural", "emphasize",
    "visual", "create", "clearly", "include", "overhead", "provide",
    "compose", "highlight", "dramatic", "area", "earth", "form", "mass",
    "perspective", "style", "scale", "absence", "depth", "proportion",
    "color", "colors", "environmental", "structural", "tone"
    "environment", "surround", "material", "story","roof","like","human","lush","enhance","indicate", "texture",
    "stark","visible","powerful","sparse", "distinctive", "define","enhanced", "appear","situate","gently","interior","offer",
    "detail","prominently","material","facade","fac?de","achieve","project","expression","significantly","dramatically",
    "power","design","share","indicate","drama","fully","screen",""
}


def spacy_tokenizer(doc_text: str):
    doc = nlp(doc_text)
    tokens = []
    for token in doc:
        lemma = token.lemma_.lower().strip()
        
        # 1) Remove basic stopwords / punctuation / whitespace / very short tokens
        # 1) 기본 불용어 / 구두점 / 공백 / 길이가 매우 짧은 토큰 제거
        if token.is_stop or token.is_punct or lemma == "" or len(lemma) < 2:
            continue
            
        # 2) Remove domain stopwords
        # 2) 도메인 불용어 제거
        if lemma in domain_stopwords:
            continue

        tokens.append(lemma)
    return tokens


In [3]:
from datasets import load_dataset

# If the column names already exist in the CSV, keep the header option without specifying it separately.
# 컬럼명이 이미 CSV에 존재하면 별도로 지정하지 말고 header 옵션을 유지합니다.
dataset = load_dataset(
    "csv",
    data_files="/app/paper/data/data.csv",
    encoding='cp949'  # or 'utf-8-sig'
)["train"]

docs = dataset["description"]

print("Sample document count::", len(docs))
print("First document example:", docs[0])


Sample document count:: 755
First document example: Large modernist building featuring a robust rectangular mass, vertical orientation, and distinct cubic form, facade composition characterized by a repetitive grid pattern of square windows evenly spaced, facade predominantly clad in smooth beige and grayish panels, base of the building displaying smooth, flat concrete textures, facade exhibiting minimal ornamentation with a clean, geometric approach, prominent roof details include curved architectural elements in the upper section creating a distinctive layered effect, a prominent central mass extends vertically, materials include smooth concrete, stone tiles, and reflective glazing, colors dominated by neutral beige, brown, gray, and glass reflecting sky hues, distinct structural elements with clearly visible separation of floors via linear bands, environment includes partial vegetation in the foreground with a sculptural tree and small artificial water feature, scale substantial as 

In [4]:

################################################################################
# 4. Load local Llama model (llama_cpp_python)
#
# 4. 로컬 Llama 모델 로드 (llama_cpp_python)
################################################################################

# Path to local gguf model (e.g., Meta-Llama-3.1-70B-Instruct.Q3_K_L.gguf)
# 로컬 gguf 모델 로드 경로 (예: Meta-Llama-3.1-70B-Instruct.Q3_K_L.gguf)

llm = Llama(
    model_path="/app/paper/model/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,  # Use maximum GPU layers (adjust based on memory) # GPU를 최대 사용 (메모리에 따라 조정)
    n_ctx=4000,
    stop=["Q:", "\n"]
)

#Prompt to use for LLM-based topic labeling
# LLM 기반 Topic 레이블링에 사용할 프롬프트

label_prompt = """Q:

You are an expert in architecture.

I have a topic that contains the following documents:

[DOCUMENTS]

The topic is described by the following keywords: '[KEYWORDS]'.

Based on the above information, can you give a short label of the topic of at most 5 words? 
Focus on architectural keywords

A:
"""

representation_model = {
    "KeyBERT": KeyBERTInspired(),
    "LLM": LlamaCPP(llm, prompt=label_prompt),
}


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /app/paper/model/openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = teknium_openhermes-2.5-mistral-7b
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7: 

In [5]:

################################################################################
# 5. Embedding model + vectorization tool setup
#
# 5. 임베딩 모델 + 벡터화 도구 설정
################################################################################

# (1) SentenceTransformer embedding
embedding_model = SentenceTransformer("BAAI/bge-m3", device="cuda")
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# (2) Custom CountVectorizer (spaCy)
custom_vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer,
    token_pattern=None,
    # Add min_df, max_df, etc. if needed
    # min_df=5, max_df=0.9, ngram_range=(1,2), ...
)

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

In [6]:

################################################################################
# 6. Dimensionality reduction / clustering (HDBSCAN) model
#
# 6. 차원축소/클러스터링(HDBSCAN) 모델
################################################################################

# UMAP/HDBSCAN used for actual clustering
# 실제 클러스터링에 활용할 UMAP/HDBSCAN

umap_model = UMAP(
    n_neighbors=6,
    n_components=4,
    min_dist=1,
    spread=1.5,
    metric='cosine',
    random_state=32
)

hdbscan_model = HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True
)

# 2-D UMAP for visualization
# 시각화용 2차원 UMAP
reduced_embeddings_2d = UMAP(
    n_neighbors=5,      
    n_components=2,  # 2 dimensions for visualization # 시각화 목적이면 2차원
    min_dist=1,      
    spread=1.5,          
    metric='cosine',
    random_state=32
).fit_transform(embeddings)


In [7]:

################################################################################
# 7. Create BERTopic instance + fit_transform
#
# 7. BERTopic 인스턴스 생성 + fit_transform
################################################################################

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    top_n_words=30,
    calculate_probabilities=True,
    verbose=True,
    vectorizer_model=custom_vectorizer
)

topics, probs = topic_model.fit_transform(docs, embeddings)
topic_info = topic_model.get_topic_info()
print("Generated top 30 topic info:")
print(topic_info.head(30))

2025-08-06 10:05:01,456 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-08-06 10:05:02,099 - BERTopic - Dimensionality - Completed ✓
2025-08-06 10:05:02,100 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-08-06 10:05:02,129 - BERTopic - Cluster - Completed ✓
2025-08-06 10:05:02,132 - BERTopic - Representation - Extracting topics from clusters using representation models.
  0%|          | 0/26 [00:00<?, ?it/s]llama_perf_context_print:        load time =    6646.16 ms
llama_perf_context_print: prompt eval time =    6645.79 ms /   930 tokens (    7.15 ms per token,   139.94 tokens per second)
llama_perf_context_print:        eval time =    1182.50 ms /    15 runs   (   78.83 ms per token,    12.69 tokens per second)
llama_perf_context_print:       total time =    7832.39 ms /   945 tokens
  4%|▍         | 1/26 [00:07<03:15,  7.84s/it]Llama.generate: 27 prefix-match hit, remaining 837 prompt tokens to eval
llama_perf_context_print:     

Generated top 30 topic info:
    Topic  Count                                          Name  \
0      -1    337          -1_glass_contemporary_concrete_panel   
1       0     78          0_residential_natural_wooden_balcony   
2       1     32                1_pavilion_open_natural_canopy   
3       2     28                2_urban_dynamic_metallic_glass   
4       3     27            3_museum_surface_massing_geometric   
5       4     26        4_institutional_white_geometric_modern   
6       5     23      5_concrete_brutalist_angular_cylindrical   
7       6     23              6_educational_volume_white_metal   
8       7     21                 7_concrete_wall_smooth_canopy   
9       8     21           8_cubic_minimalist_volume_geometric   
10      9     16                9_balcony_tower_band_modernist   
11     10     13        10_industrial_steel_column_transparent   
12     11     12             11_transparent_urban_panel_narrow   
13     12     12        12_skyscraper_sleek_bra

In [17]:
info = topic_model.get_topic_info()
info.to_csv("/app/Github/output/topic_model.csv", index=True)  

## Visualization

In [9]:
# (1) Overview of topics
# (1) 토픽들 전반 살펴보기

fig_topics = topic_model.visualize_topics()
fig_topics.show()

# (2) Document visualization (2D)
# (2) 문서 시각화 (2D)

fig_docs_2d = topic_model.visualize_documents(
    docs,
    reduced_embeddings=reduced_embeddings_2d,
    hide_document_hover=False
)
fig_docs_2d.update_layout(
    xaxis_title="UMAP-Dim1",
    yaxis_title="UMAP-Dim2",

)
fig_docs_2d.update_traces(
    marker=dict(
        size=9,
        line=dict(width=0.5, color="DarkSlateGrey"),
        #symbol="circle",  # Refer to 'https://plotly.com/python/marker-style/' (참고)
        opacity=0.9
    )
)

fig_docs_2d.update_traces(text=None)
fig_docs_2d.show()

In [10]:
# (3) Hierarchical Clustering (tree form)
# (3) 계층적 토픽 (트리형)

hierarchical_topics = topic_model.hierarchical_topics(docs)
fig_hierarchy = topic_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics
)
fig_hierarchy.update_layout(width=1000, height=1000)
fig_hierarchy.update_layout(
    margin=dict(l=100)  # Increase left margin so labels are not cut off # 왼쪽 여백을 크게 해서 레이블이 안 잘리도록
)
fig_hierarchy.show()

100%|██████████| 24/24 [00:00<00:00, 928.83it/s]


In [11]:
# (4) Hierarchical Documents and Topics
# (4) 계층적 토픽 + 문서 시각화

fig_h_docs_2d = topic_model.visualize_hierarchical_documents(
    docs,
    hierarchical_topics,
    reduced_embeddings=reduced_embeddings_2d,
    hide_document_hover=False
)

fig_h_docs_2d.update_traces(
    marker=dict(
        size=8,
        line=dict(width=0.5, color="DarkSlateGrey"),
        #symbol="circle",  # refer to 'https://plotly.com/python/marker-style/' (참고)
        opacity=0.9
    )
)
fig_h_docs_2d.show()

In [12]:
# (5) barchart - Topic word Scores 

fig_barchart = topic_model.visualize_barchart(top_n_topics=24, n_words=10)
fig_barchart.update_layout(width=1800, height=1800)
fig_barchart.show()

In [13]:
# (6) heatmap
# fig_heatmap = topic_model.visualize_heatmap(n_clusters=10)
# fig_heatmap.update_layout(width=1200, height=1000)
# fig_heatmap.show()

fig_heatmap = topic_model.visualize_heatmap(n_clusters=10) # Similar topics are grouped by n_clusters # n_clusters로 유사 토픽끼리 묶임
fig_heatmap.update_xaxes(
    tickangle=45,
    tickfont=dict(size=12),
    automargin=True
)
fig_heatmap.update_layout(
    width=1200,
    height=1100,
    margin=dict(l=300, r=300, t=300, b=300)  # Values can be adjusted # 값 조정 가능
)

fig_heatmap.update_yaxes(
    tickfont=dict(size=13),
    automargin=True
)
fig_heatmap.update_traces(
    # If using coloraxis instead of trace.colorbar, the format below may differ
    # 만약 trace.colorbar 가 아니라 coloraxis를 쓰고 있다면 아래 형식이 다를 수 있음
    colorbar=dict(
        thickness=10,   # 두께(px)
        len=0.5,        # Vertical length ratio relative to heatmap (0-1) # heatmap 대비 세로 길이 비율(0~1)
        y=0.5,          # Center alignment # 중앙 정렬
        yanchor='middle',
        x=1.1           # Position offset to the right of the heatmap # heatmap 오른쪽에 떨어뜨려 위치
    )
)

fig_heatmap.show()

In [14]:
# (7) term rank
fig_term_rank = topic_model.visualize_term_rank()

# Use a darker color set (Vivid)
# 더 짙은 색상 세트 사용 (Vivid)

fig_term_rank.update_layout(
    showlegend=True,
    legend_title_text="Topic Number",
    width=1200,
    height=800,
    plot_bgcolor="white"  # set background to white (for sharp contrast) # 배경 흰색으로 설정 (선명한 대비 효과)
)

colors = px.colors.qualitative.Dark24

for idx, trace in enumerate(fig_term_rank.data):
    trace.line.width = 1               # increase line width (improves clarity) # 선 굵기 증가 (선명도 증가)
    trace.line.color = colors[idx % len(colors)]
    trace.opacity = 0.7                # remove transparency (maximum clarity) # 투명도 제거 (최대 선명도)
    trace.name = f"Topic {idx}"

#Remove grid
# 그리드 제거
fig_term_rank.update_xaxes(showgrid=False)
fig_term_rank.update_yaxes(showgrid=False)
fig_term_rank.show()

In [15]:
fig_dist = topic_model.visualize_distribution(probs[0], min_probability=0.0)
# 토픽 번호만 y축 레이블로 지정
# Set only topic numbers as y-axis labels
num_topics = len(probs[0])
topic_numbers = [f"Topic {i}" for i in range(num_topics)]

fig_dist.update_yaxes(
    ticktext=topic_numbers,  # Text to use as labels # 레이블로 사용할 텍스트
    tickvals=list(range(num_topics)),  # Actual values # 실제 값
    automargin=True
)

fig_dist.update_layout(width=1200, height=800, margin=dict(l=100))

fig_dist.show()

In [16]:
topic_keywords = []
all_topics = topic_model.get_topic_info().Topic.unique()
for t_id in all_topics:
    if t_id == -1:
        continue
    top_words = topic_model.get_topic(t_id)  # e.g.: [(word, score), (word, score), ...]
    for (w, score) in top_words:
        topic_keywords.append({
            "topic_id": int(t_id),
            "keyword": w,
            "score": float(score),
        })

df_keywords = pd.DataFrame(topic_keywords)
df_keywords.sort_values(["topic_id","score"], ascending=[True,False], inplace=True)


In [18]:
import json

# A list containing all (topic_id, keyword) pairs, including duplicates
# 중복 포함 그대로, 모든 (topic_id, keyword) 쌍을 담은 리스트
items_for_llm = []
for _, row in df_keywords.iterrows():
    items_for_llm.append({
        "topic_id": int(row["topic_id"]),
        "keyword": row["keyword"],
    })

print(f"LLM에 보낼 (topic_id, keyword) 개수: {len(items_for_llm)}")
print(f"Number of (topic_id, keyword) pairs to send to the LLM: {len(items_for_llm)}")

# Save to JSON file (to paste into an interactive LLM)
# JSON 파일로 저장 (대화형 LLM에 붙여넣기 위해)
with open("/app/Github/output/topic_keywords_for_llm.json", "w", encoding="utf-8") as f:
    json.dump(items_for_llm, f, ensure_ascii=False, indent=2)


LLM에 보낼 (topic_id, keyword) 개수: 750
Number of (topic_id, keyword) pairs to send to the LLM: 750


# Pre-Process json data by LLM before recategorization 
# 건축 카테고리 재정의 전 LLM을 활용한 데이터 전처리 

### 1)Use the '/Github/Prompt/json_pre_process_LLM_prompt.txt' to pre process the data.
### 2)save the processed json output inside the "/app/Github/output" folder

# ChatGPT-REQUEST

In [22]:
import json
import pandas as pd

with open("/app/Github/output/architecture_category_llm_output.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# json file formmat(json파일 데이터 양식)
# [
#   {"topic_id":0,"keyword":"window","category":"Facade composition"},
#   {"topic_id":0,"keyword":"concrete","category":"Materials and Textures"},
#   ...
# ]

df_classified = pd.DataFrame(data)
print(df_classified.head(210))


     topic_id           keyword                       category
0           0       residential          Architectural program
1           0           natural          Environmental Context
2           0            wooden         Materials and Textures
3           0           balcony  Special Architectural Feature
4           0            window             Facade composition
5           0             white                         Colors
6           0       rectangular                  Form and Mass
7           0          vertical             Facade composition
8           0              dark                         Colors
9           0        minimalist            Architectural Style
10          0      contemporary            Architectural Style
11          0            modern            Architectural Style
12          0             metal         Materials and Textures
13          0              wall             Facade composition
14          0             large           Scale and Pro

In [23]:
df_merged = pd.merge(
    df_keywords,
    df_classified,
    on=["topic_id","keyword"],
    how="left"
)

# Remove items with "Unknown"
# "Unknow"인 항목 삭제
df_merged = df_merged[df_merged["category"] != "Unknown"]

print(df_merged.head(210))


     topic_id           keyword     score                       category
0           0       residential  0.035357          Architectural program
1           0           natural  0.028979          Environmental Context
2           0            wooden  0.024592         Materials and Textures
3           0           balcony  0.023262  Special Architectural Feature
4           0            window  0.022730             Facade composition
5           0             white  0.022604                         Colors
6           0       rectangular  0.020392                  Form and Mass
7           0          vertical  0.017862             Facade composition
8           0              dark  0.017831                         Colors
9           0        minimalist  0.017050            Architectural Style
10          0      contemporary  0.016260            Architectural Style
11          0            modern  0.016247            Architectural Style
12          0             metal  0.016164         M

In [24]:
df_cat_scores = df_merged.groupby("category")["score"].sum().reset_index()
df_cat_scores.sort_values("score", ascending=False, inplace=True)
print(df_cat_scores)


                         category     score
4              Facade composition  3.667170
5                   Form and Mass  2.674300
6          Materials and Textures  2.670239
8            Scale and Proportion  1.984319
3           Environmental Context  1.375298
1           Architectural program  1.331550
0             Architectural Style  1.302355
2                          Colors  1.296634
10            Structural Elements  1.175610
9   Special Architectural Feature  1.159225
11    Urban Perspective and Depth  0.486683
7                    Roof Details  0.211545


In [25]:
cat_counts = df_merged.groupby("category")["keyword"].count().reset_index()
cat_counts = cat_counts.sort_values("keyword", ascending=False)
print("\n=== Number of keywords per category ===")
print("\n=== 카테고리별 키워드 수 ===")
print(cat_counts)

# Sum of c-TF-IDF scores for each category
# 각 카테고리 별 c-TF-IDF 점수 합계
df_scoresum = df_merged.groupby("category")["score"].sum().reset_index()
df_scoresum = df_scoresum.sort_values("score", ascending=False)
print("\n=== 카테고리별 cTFIDF 합 ===")
print("\n=== Sum of cTFIDF per category ===")
print(df_scoresum)


=== Number of keywords per category ===

=== 카테고리별 키워드 수 ===
                         category  keyword
4              Facade composition      116
5                   Form and Mass       93
6          Materials and Textures       81
8            Scale and Proportion       62
2                          Colors       48
0             Architectural Style       46
3           Environmental Context       44
10            Structural Elements       38
1           Architectural program       27
9   Special Architectural Feature       26
11    Urban Perspective and Depth       14
7                    Roof Details        6

=== 카테고리별 cTFIDF 합 ===

=== Sum of cTFIDF per category ===
                         category     score
4              Facade composition  3.667170
5                   Form and Mass  2.674300
6          Materials and Textures  2.670239
8            Scale and Proportion  1.984319
3           Environmental Context  1.375298
1           Architectural program  1.331550
0          

In [None]:
################################################################################
# Visualization of the redefinition (classification) process ­– using df_merged
#
# "재정의(분류) 과정의 시각화" - df_merged 활용
################################################################################

import plotly.express as px

# df_merged:
#   [topic_id, keyword, score, category]
#
#   Unknown 제외된 최종 LLM 분류 결과
#   Final LLM classification result with 'Unknown' removed

# (1) Category distribution (frequency) bar chart
# (1) 카테고리 분포(빈도수) 막대 그래프
cat_counts = df_merged.groupby("category")["keyword"].count().reset_index()
cat_counts.columns = ["category", "count"]
cat_counts = cat_counts.sort_values("count", ascending=False)

fig_cat_counts = px.bar(
    cat_counts,
    x="category",
    y="count",
    title="Distribution of all keyword architecture categories (by frequency)",
    #title="전체 키워드 건축 카테고리 분포 (빈도수 기준)",
    color="count",            # Use the color parameter for visual distinction (optional) # 시각적으로 구분하기 위해 color 매개변수 사용 (선택)
    color_continuous_scale="Blues"  # Adjust color scale (optional) # 색상 스케일 조정 (선택)
)
fig_cat_counts.update_layout(width=900, height=600)
fig_cat_counts.show()

# (2) topic_id vs category heat-map (crosstab) – based on ‘number of keywords’
# (2) topic_id vs category 히트맵(교차표) - '키워드 개수' 기준
pivot_topic_cat = df_merged.pivot_table(
    index="topic_id",
    columns="category",
    values="keyword",
    aggfunc="count",
    fill_value=0
).astype(int)

fig_topic_cat_heatmap = px.imshow(
    pivot_topic_cat,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="Blues",
    title="[Count] Topic vs. Architecture Category"
)
fig_topic_cat_heatmap.update_layout(width=1000, height=700)
fig_topic_cat_heatmap.show()

################################################################################
# Visualization based on “score (total score)”
#
# "score(합산 점수) 기준"으로도 시각화
################################################################################

# (1) Category distribution (total score) bar chart
# (1) 카테고리 분포(합산 점수) 막대 그래프
cat_scores = df_merged.groupby("category")["score"].sum().reset_index()
cat_scores.columns = ["category", "total_score"]
cat_scores = cat_scores.sort_values("total_score", ascending=False)

fig_cat_scores = px.bar(
    cat_scores,
    x="category",
    y="total_score",
    title="Distribution of all keyword architecture categories (by total score)",
#    title="전체 키워드 건축 카테고리 분포 (합산 점수 기준)",
    color="total_score",
    color_continuous_scale="Blues"
)
fig_cat_scores.update_layout(width=900, height=600)
fig_cat_scores.show()


#(2) topic_id vs category heat-map (crosstab) – based on ‘score sum’
# (2) topic_id vs category 히트맵(교차표) - 'score 합계' 기준
pivot_topic_cat_score = df_merged.pivot_table(
    index="topic_id",
    columns="category",
    values="score",
    aggfunc="sum",
    fill_value=0
).astype(float)

fig_topic_cat_score_heatmap = px.imshow(
    pivot_topic_cat_score,
    text_auto=".2f",   
    aspect="auto",
    color_continuous_scale="Blues",
    title="[Score Sum] Topic vs. Architecture Category"
)
fig_topic_cat_score_heatmap.update_layout(width=1000, height=700)
fig_topic_cat_score_heatmap.show()


################################################################################
# Visualizing keyword rankings by architectural category + top N
#
# "건축 카테고리 별 키워드들의 순위를 시각화 + 상위 Top N"
################################################################################


# First, top categories by ‘frequency’
# 우선, '빈도수' 기준 상위 카테고리
TOP_N = 5  
cat_counts_sorted = cat_counts.sort_values("count", ascending=False)
top_categories_count = cat_counts_sorted.head(TOP_N)
print(f"\n=== Top {TOP_N} categories based on total keyword appearances ===")
#print(f"\n=== 전체 키워드 등장 수 기준 상위 {TOP_N} 카테고리 ===")
print(top_categories_count)

# Top categories by ‘total score’
# '합산 점수' 기준 상위 카테고리
TOP_N_SCORE = 5
cat_scores_sorted = cat_scores.sort_values("total_score", ascending=False)
top_categories_score = cat_scores_sorted.head(TOP_N_SCORE)
print(f"\n=== Top {TOP_N_SCORE} categories based on total keyword scores ===")
#print(f"\n=== 전체 키워드 점수 합계 기준 상위 {TOP_N_SCORE} 카테고리 ===")
print(top_categories_score)





=== Top 5 categories based on total keyword appearances ===
                 category  count
4      Facade composition    116
5           Form and Mass     93
6  Materials and Textures     81
8    Scale and Proportion     62
2                  Colors     48

=== Top 5 categories based on total keyword scores ===
                 category  total_score
4      Facade composition     3.667170
5           Form and Mass     2.674300
6  Materials and Textures     2.670239
8    Scale and Proportion     1.984319
3   Environmental Context     1.375298


In [44]:
# ------------------------------------------------------------
#  Aggregation and visualization of top N keywords per category (score-based)
#  모든 카테고리별 상위 N 키워드(점수 기준) 집계·시각화
#  -----------------------------------------------------------
import pandas as pd
import plotly.express as px
from IPython.display import display   


# ================= User settings section ==================
# ================= 사용자 설정 영역 ==========================
KEYWORDS_PER_CATEGORY = 10   # Number of keywords to display per category (modify if desired) # 카테고리마다 보고 싶은 키워드 개수 (원하면 수정)
SHOW_BAR_CHARTS      = True  # Whether to draw bar charts # 막대그래프도 그릴지 여부
# ============================================================

# 1) Keyword-level aggregation: freq (count), total_score (sum of scores)
# 1) keyword 단위 집계: freq(등장 횟수), total_score(점수 합계)

agg_kw = (
    df_merged
    .groupby(['category', 'keyword'])
    .agg(
        freq        = ('keyword', 'count'),
        total_score = ('score',   'sum')
    )
    .reset_index()
)

# 2) Category order sorting (optional) – here by descending total score
# 2) 카테고리 순서 정렬(선택) – 여기선 전체 점수 합계 기준으로 큰 것부터

cat_order = (
    agg_kw
    .groupby('category')['total_score']
    .sum()
    .sort_values(ascending=False)
    .index
)

# 3) Extract and visualize top N per category
# 3) 카테고리별 상위 N 추출 및 출력/시각화

for cat in cat_order:
    subset = agg_kw[agg_kw['category'] == cat]                  
    top_n  = subset.sort_values('total_score', ascending=False) \
                    .head(KEYWORDS_PER_CATEGORY)                
    
    # ----- Display as table -----
    # ----- 표 형태로 출력 -----

    print(f"\n=== Category: {cat} ===")
    #print(f"\n=== 카테고리: {cat} ===")
    display(top_n)        

    # ----- (Optional) Bar chart -----
    # ----- (선택) 막대그래프 -----

    if SHOW_BAR_CHARTS and not top_n.empty:
        fig = px.bar(
            top_n,
            x='keyword',
            y='total_score',
            color='total_score',
            text='freq',                  
            title=f"{cat} top {KEYWORDS_PER_CATEGORY} keywords (score-based)",       
            #title=f"{cat} 카테고리 상위 {KEYWORDS_PER_CATEGORY} 키워드 (score 기준)",
            color_continuous_scale='Blues'
        )
        fig.update_layout(width=900, height=500)
        fig.show()



=== Category: Facade composition ===


Unnamed: 0,category,keyword,freq,total_score
100,Facade composition,horizontal,12,0.393721
120,Facade composition,vertical,11,0.336435
107,Facade composition,pattern,8,0.24092
99,Facade composition,grid,6,0.225791
122,Facade composition,wall,5,0.199351
118,Facade composition,transparency,6,0.17419
98,Facade composition,glazing,6,0.139297
121,Facade composition,void,2,0.131902
123,Facade composition,window,6,0.128927
116,Facade composition,surface,5,0.128361



=== Category: Form and Mass ===


Unnamed: 0,category,keyword,freq,total_score
149,Form and Mass,rectangular,10,0.242682
166,Form and Mass,volume,9,0.237335
136,Form and Mass,flat,8,0.198433
124,Form and Mass,angle,5,0.160191
125,Form and Mass,angular,4,0.124361
138,Form and Mass,geometric,5,0.122076
140,Form and Mass,incline,2,0.108198
130,Form and Mass,curved,3,0.100889
158,Form and Mass,stack,3,0.098845
153,Form and Mass,sculptural,4,0.097931



=== Category: Materials and Textures ===


Unnamed: 0,category,keyword,freq,total_score
170,Materials and Textures,concrete,9,0.41252
173,Materials and Textures,glass,10,0.301551
177,Materials and Textures,metallic,9,0.291521
181,Materials and Textures,reflective,7,0.260123
167,Materials and Textures,brick,5,0.259911
184,Materials and Textures,smooth,7,0.163216
176,Materials and Textures,metal,5,0.158735
186,Materials and Textures,steel,3,0.15747
169,Materials and Textures,clear,6,0.130679
183,Materials and Textures,rugged,1,0.067075



=== Category: Scale and Proportion ===


Unnamed: 0,category,keyword,freq,total_score
214,Scale and Proportion,rise,4,0.194547
203,Scale and Proportion,large,8,0.178754
210,Scale and Proportion,monumental,5,0.175054
202,Scale and Proportion,high,4,0.153525
204,Scale and Proportion,level,4,0.145577
218,Scale and Proportion,strong,4,0.112797
217,Scale and Proportion,slender,3,0.098218
206,Scale and Proportion,massive,2,0.091168
199,Scale and Proportion,extensive,3,0.069214
211,Scale and Proportion,narrow,2,0.063737



=== Category: Environmental Context ===


Unnamed: 0,category,keyword,freq,total_score
90,Environmental Context,waterfront,3,0.178391
89,Environmental Context,water,2,0.100766
83,Environmental Context,rocky,1,0.062621
77,Environmental Context,natural,2,0.060993
82,Environmental Context,river,1,0.058814
66,Environmental Context,environment,3,0.057047
73,Environmental Context,landscape,3,0.056545
79,Environmental Context,outside,1,0.055571
72,Environmental Context,ground,2,0.055433
86,Environmental Context,terrain,1,0.050489



=== Category: Architectural program ===


Unnamed: 0,category,keyword,freq,total_score
34,Architectural program,tower,3,0.274095
31,Architectural program,residential,5,0.196817
32,Architectural program,skyscraper,1,0.101095
29,Architectural program,pavilion,1,0.09337
18,Architectural program,block,1,0.085674
25,Architectural program,institutional,3,0.084769
28,Architectural program,museum,1,0.073034
21,Architectural program,educational,1,0.061918
26,Architectural program,mixed,1,0.057331
19,Architectural program,complex,1,0.056981



=== Category: Architectural Style ===


Unnamed: 0,category,keyword,freq,total_score
3,Architectural Style,contemporary,10,0.220389
8,Architectural Style,minimalist,8,0.199042
10,Architectural Style,modern,9,0.186189
12,Architectural Style,modernist,3,0.122954
5,Architectural Style,industrial,1,0.079657
2,Architectural Style,classical,1,0.077994
14,Architectural Style,postmodern,1,0.071439
1,Architectural Style,brutalist,1,0.068601
7,Architectural Style,minimal,3,0.067721
15,Architectural Style,traditional,2,0.039141



=== Category: Colors ===


Unnamed: 0,category,keyword,freq,total_score
60,Colors,white,10,0.222624
47,Colors,grey,3,0.097942
50,Colors,red,3,0.093126
42,Colors,dark,4,0.087314
45,Colors,gray,3,0.072662
38,Colors,brown,2,0.062025
49,Colors,orange,1,0.052763
46,Colors,green,2,0.050516
48,Colors,neutral,2,0.048286
37,Colors,blue,2,0.048148



=== Category: Structural Elements ===


Unnamed: 0,category,keyword,freq,total_score
244,Structural Elements,column,4,0.130199
246,Structural Elements,floor,5,0.125873
256,Structural Elements,support,5,0.12313
249,Structural Elements,framing,3,0.11389
255,Structural Elements,slab,2,0.082733
239,Structural Elements,base,2,0.073815
247,Structural Elements,frame,2,0.071956
252,Structural Elements,rib,1,0.067292
248,Structural Elements,framework,2,0.055874
241,Structural Elements,bracing,1,0.053566



=== Category: Special Architectural Feature ===


Unnamed: 0,category,keyword,freq,total_score
225,Special Architectural Feature,balcony,5,0.240238
233,Special Architectural Feature,podium,1,0.130438
227,Special Architectural Feature,canopy,3,0.107772
228,Special Architectural Feature,cantilever,1,0.107061
229,Special Architectural Feature,cantilevered,2,0.103885
237,Special Architectural Feature,staircase,2,0.086354
238,Special Architectural Feature,terrace,1,0.064584
232,Special Architectural Feature,ornamental,1,0.058217
234,Special Architectural Feature,railing,2,0.055505
231,Special Architectural Feature,mural,1,0.051965



=== Category: Urban Perspective and Depth ===


Unnamed: 0,category,keyword,freq,total_score
262,Urban Perspective and Depth,urban,9,0.284867
260,Urban Perspective and Depth,frontal,2,0.05884
259,Urban Perspective and Depth,aerial,1,0.056451
261,Urban Perspective and Depth,street,1,0.046498
263,Urban Perspective and Depth,viewpoint,1,0.040027



=== Category: Roof Details ===


Unnamed: 0,category,keyword,freq,total_score
191,Roof Details,cornice,1,0.086137
192,Roof Details,crown,1,0.034795
193,Roof Details,parapet,1,0.029165
194,Roof Details,roofline,1,0.022454
195,Roof Details,rooftop,1,0.021324
196,Roof Details,slope,1,0.017669


In [None]:
##########################################################################################
# 12. Automatically generating "image creation prompts" using top architecture categories
# 12. 상위 건축 카테고리를 이용해 "이미지 생성 Prompt" 자동생성 예시
##########################################################################################


def make_image_prompt(category_name, df_merged, top_k=5, by_score=True):
    """
    Among the keywords in the specified category, select the top_k with the highest scores if by_score=True,
    or the top_k with the highest frequencies if by_score=False,
    and create an example image-generation prompt.

    해당 카테고리에 속한 키워드 중, (by_score=True)면 점수 높은 상위 top_k,
    (by_score=False)면 빈도수 높은 상위 top_k를 뽑아서
    이미지 생성 프롬프트 예시를 만들어본다.
    """
    cat_df = df_merged[df_merged["category"] == category_name]
    if by_score:
        # sorted in descending order by score # 점수 높은 순
        freq = cat_df.groupby("keyword")["score"].sum().reset_index()
        freq.columns = ["keyword", "total_score"]
        freq_sorted = freq.sort_values("total_score", ascending=False)
        top_k_words = freq_sorted.head(top_k)["keyword"].tolist()
    else:
        # sorted in descending order of frequency # 빈도수 높은 순
        freq = cat_df["keyword"].value_counts().reset_index()
        freq.columns = ["keyword", "count"]
        freq_sorted = freq.sort_values("count", ascending=False)
        top_k_words = freq_sorted.head(top_k)["keyword"].tolist()
    
    if len(top_k_words) == 0:
        return None
    
    prompt = (f"A futuristic '{category_name}' architecture design focusing on "
              f"{', '.join(top_k_words)}. "
              f"Ultra-detailed, photorealistic rendering.")
    return prompt

print(f"\n=== Example prompts for image generation (top_{top_k} by score) ===")
#print(f"\n=== 이미지 생성을 위한 예시 Prompt (점수 기준 top_{top_k}) ===")
for cat_name in selected_top_categories:
    img_prompt = make_image_prompt(cat_name, df_merged, top_k=5, by_score=True)
    if img_prompt:
        print(f" - [{cat_name}] Prompt: {img_prompt}")
    else:
        print(f" - [{cat_name}] -> No keyword in the category")



=== Example prompts for image generation (top_5 by score) ===
 - [Facade composition] Prompt: A futuristic 'Facade composition' architecture design focusing on horizontal, vertical, pattern, grid, wall. Ultra-detailed, photorealistic rendering.
 - [Form and Mass] Prompt: A futuristic 'Form and Mass' architecture design focusing on rectangular, volume, flat, angle, angular. Ultra-detailed, photorealistic rendering.
 - [Materials and Textures] Prompt: A futuristic 'Materials and Textures' architecture design focusing on concrete, glass, metallic, reflective, brick. Ultra-detailed, photorealistic rendering.
 - [Scale and Proportion] Prompt: A futuristic 'Scale and Proportion' architecture design focusing on rise, large, monumental, high, level. Ultra-detailed, photorealistic rendering.
 - [Environmental Context] Prompt: A futuristic 'Environmental Context' architecture design focusing on waterfront, water, rocky, natural, river. Ultra-detailed, photorealistic rendering.


# Use LLM and re-categorized results(keyword,freq,c-TF-IDF scores) for Final Analysis by LLM (use 'final_analysis.txt' prompt) 