# 1. 청년 순이동률

In [353]:
import pandas as pd
import requests
import numpy as np

# pd.set_option("display.max_rows", 1000)

# Fetch youth net-migration figures from the KOSIS OpenAPI for the selected
# districts and pivot them into one row per district, one column per year.

# Imported here so the cell is self-contained; the API key is read from the
# environment instead of being committed in the notebook.
import os

# Administrative-district code table; codes are kept as strings so that the
# two-character prefix can be sliced off.
admin = pd.read_csv("../../data/행정구역코드.csv")
admin["행정구역코드"] = admin["행정구역코드"].astype(str)

# Exclude the capital area + metropolitan cities (matched by code prefix).
exclude_prefix = ["11", "31", "21", "22", "23", "24", "25", "26"]
sigun = admin[~admin["행정구역코드"].str[:2].isin(exclude_prefix)].copy()

# "+"-joined list of district codes (with a trailing "+") sent as objL1.
objL1 = "+".join(sigun["행정구역코드"].tolist()) + "+"

BASE_URL = "https://kosis.kr/openapi/Param/statisticsParameterData.do"

params = {
    "method": "getList",
    "apiKey": os.getenv("KOSIS_API_KEY"),
    "orgId": "101",
    "tblId": "DT_1YL20642",
    "itmId": "T001+",
    "objL1": objL1,
    "objL2": "21+22+23+",
    "format": "json",
    "jsonVD": "Y",
    "prdSe": "Y",
    "newEstPrdCnt": "3",
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# HTTP errors are surfaced immediately.
response = requests.get(BASE_URL, params=params, timeout=30)
response.raise_for_status()
payload = response.json()
# NOTE(review): KOSIS appears to report failures as a JSON object with an
# "err" field instead of a list of rows - confirm against the API docs.
if isinstance(payload, dict) and "err" in payload:
    raise RuntimeError(f"KOSIS request failed: {payload}")
df_raw = pd.DataFrame(payload)

# Keep only the fields used below and give them the working column names.
column_map = {
    "PRD_DE": "연도",  # year
    "C1": "행정코드",  # administrative code
    "C1_NM": "지역명",  # region name
    "C2_NM": "지표",  # indicator name (net migration rate / net migration)
    "DT": "값",  # value
}
df = df_raw[list(column_map)].rename(columns=column_map)

df["연도"] = df["연도"].astype(int)
df["행정코드"] = df["행정코드"].astype(str)
# Non-numeric values become NaN instead of raising.
df["값"] = pd.to_numeric(df["값"], errors="coerce")

# Split by indicator label: rows whose label contains "순이동률" are rates;
# rows that contain "순이동" but not "률" are counts.
rate_df = df[df["지표"].str.contains("순이동률")].rename(
    columns={"값": "청년순이동률"}
)
count_df = df[
    df["지표"].str.contains("순이동") & ~df["지표"].str.contains("률")
].rename(columns={"값": "청년순이동"})

# Wide tables: index = (code, name), columns = year; duplicate cells are summed.
pivot_rate_df = rate_df.pivot_table(
    index=["행정코드", "지역명"], columns="연도", values="청년순이동률", aggfunc="sum"
).reset_index()

pivot_count_df = count_df.pivot_table(
    index=["행정코드", "지역명"], columns="연도", values="청년순이동", aggfunc="sum"
).reset_index()

# =========================
# 0) Input: pivot_rate_df (행정코드, 지역명, one column per year)
# =========================

# Select the most recent (up to three) year columns automatically.
# newEstPrdCnt=3 is requested, but more or fewer columns are tolerated.
year_cols = sorted(
    c for c in pivot_rate_df.columns if isinstance(c, (int, np.integer))
)
target_years = year_cols[-3:]  # e.g. [2022, 2023, 2024]
if not target_years:
    raise ValueError("pivot_rate_df has no year columns to score")

# Recency weights (more recent = more important). The tail of the base
# weights is used so the vector always matches the number of years found.
w = np.array([0.2, 0.3, 0.5], dtype=float)[-len(target_years):]
w = w / w.sum()

# Score weights (project defaults)
alpha = 0.4  # slope (direction)
beta = 0.5  # weighted_mean (recent level)
gamma = 0.1  # std (volatility penalty)

score_df = pivot_rate_df[["행정코드", "지역명"] + target_years].copy()
y = score_df[target_years].to_numpy(dtype=float)
valid = ~np.isnan(y)  # True where a year was actually observed

# =========================
# 1) Trend slope: slope of a least-squares line per row
#    x = [0, 1, 2, ...] (equal spacing between years assumed)
# =========================
x = np.arange(len(target_years), dtype=float)
# Centre x over the observed years of each row only; centring over the full
# window would bias the slope of rows with missing years.
x_observed = np.where(valid, x, np.nan)
x_centered = x_observed - np.nanmean(x_observed, axis=1, keepdims=True)
x_denom = np.nansum(x_centered**2, axis=1)

# slope = cov(x, y) / var(x) (row-wise); NaN when fewer than two years exist.
y_centered = y - np.nanmean(y, axis=1, keepdims=True)
slope = np.full(len(y), np.nan)
np.divide(
    np.nansum(x_centered * y_centered, axis=1),
    x_denom,
    out=slope,
    where=x_denom > 0,
)

# =========================
# 2) Recency-weighted mean
#    Weights are renormalised over the observed years so a missing year is
#    not silently counted as a rate of 0.
# =========================
w_observed = np.where(valid, w, 0.0).sum(axis=1)
weighted_mean = np.full(len(y), np.nan)
np.divide(
    np.nansum(y * w, axis=1),
    w_observed,
    out=weighted_mean,
    where=w_observed > 0,
)

# =========================
# 3) Volatility (population std over the observed years)
# =========================
std = np.nanstd(y, axis=1)

score_df["slope"] = slope
score_df["weighted_mean"] = weighted_mean
score_df["std"] = std

# =========================
# 4) Scale correction: z-score standardisation
#    - slope, weighted_mean and std may differ in unit/range
# =========================
for col in ["slope", "weighted_mean", "std"]:
    mu = score_df[col].mean(skipna=True)
    sd = score_df[col].std(skipna=True)
    # A zero spread would divide by zero; fall back to 1 (z-scores become 0).
    score_df[f"z_{col}"] = (score_df[col] - mu) / (sd if sd != 0 else 1)

# Final score: reward trend and recent level, penalise volatility.
score_df["top5_score"] = (
    alpha * score_df["z_slope"]
    + beta * score_df["z_weighted_mean"]
    - gamma * score_df["z_std"]
)

# =========================
# 5) Filter: minimum condition for "increase"
#    Option A: latest year > 0 (strongest filter)
#    Option B: mean of the last two years > 0 (slightly relaxed)
#    Both conditions are currently required (logical AND).
# =========================
latest_year = target_years[-1]
recent2_mean = score_df[target_years[-2:]].mean(axis=1)

score_df_filtered = score_df[(score_df[latest_year] > 0) & (recent2_mean > 0)].copy()

# =========================
# 6) Take the top rows, with columns in a readable order
# =========================
# Sort once so top5 is always exactly the first five rows of top10.
ranked = score_df_filtered.sort_values("top5_score", ascending=False)
report_cols = (
    ["행정코드", "지역명"]
    + target_years
    + ["slope", "weighted_mean", "std", "top5_score"]
)

top5 = ranked.head(5)[report_cols]

top10 = ranked.head(10)[report_cols]

# Per-district summary statistics over all year columns of pivot_rate_df.
# Year labels may be Python ints or NumPy integers depending on how the column
# index was built, so both are accepted (same rule as the scoring step).
year_cols = [
    col for col in pivot_rate_df.columns if isinstance(col, (int, np.integer))
]

# Simple mean across years
pivot_rate_df["평균"] = pivot_rate_df[year_cols].mean(axis=1)

# Median across years
pivot_rate_df["중앙값"] = pivot_rate_df[year_cols].median(axis=1)

n = len(year_cols)
mid = n // 2

# Change between halves: mean of the later years (from index `mid` on) minus
# mean of the earlier years. With an odd count the middle year falls into the
# later half.
pivot_rate_df["증가율"] = pivot_rate_df[year_cols[mid:]].mean(axis=1) - pivot_rate_df[
    year_cols[:mid]
].mean(axis=1)

## 인구 십만명당 문화기반시설수

In [351]:
# 인구 십만명당 문화기반시설수(시도/시/군/구)
# https://kosis.kr/statHtml/statHtml.do?orgId=101&tblId=DT_1YL20931&conn_path=I2

import requests
import pandas as pd
import os

# Fetch the "cultural facilities per 100,000 people" table from KOSIS for the
# selected districts and keep region name + value.
admin = pd.read_csv("../../data/행정구역코드.csv")
admin["행정구역코드"] = admin["행정구역코드"].astype(str)

# Exclude the capital area + metropolitan cities (matched by code prefix).
exclude_prefix = ["11", "31", "21", "22", "23", "24", "25", "26"]
sigun = admin[~admin["행정구역코드"].str[:2].isin(exclude_prefix)].copy()

# District codes from the table plus three extra codes appended by hand.
# NOTE(review): presumably districts missing from the code table - confirm.
objL1 = "+".join(sigun["행정구역코드"].tolist()) + "+34360+35310+36420+"

url = "https://kosis.kr/openapi/Param/statisticsParameterData.do"

params = {
    "method": "getList",
    "apiKey": os.getenv("KOSIS_API_KEY"),
    "orgId": "101",
    "tblId": "DT_1YL20931",
    "itmId": "T001+",
    "objL1": objL1,
    "format": "json",
    "jsonVD": "Y",
    "prdSe": "Y",
    "newEstPrdCnt": "1",  # most recent single year -> 2024 basis
}

# Fail fast on a stalled connection or an HTTP error instead of continuing
# with an unusable response.
res = requests.get(url, params=params, timeout=30)
res.raise_for_status()

data = res.json()
# NOTE(review): KOSIS appears to report failures as a JSON object with an
# "err" field instead of a list of rows - confirm against the API docs.
if isinstance(data, dict) and "err" in data:
    raise RuntimeError(f"KOSIS request failed: {data}")
culture_df = pd.DataFrame(data)

# Non-numeric values become NaN instead of raising.
culture_df["DT"] = pd.to_numeric(culture_df["DT"], errors="coerce")
culture_clean = culture_df[["C1_NM", "DT"]].rename(columns={"DT": "문화시설수"})

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Korean font setup; also keep the minus sign from using the Unicode glyph.
plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Join top10 with culture_clean on region name, ordered by top5_score.
# NOTE(review): the join key is the bare region name, so districts that share
# a name would match more than once - verify there are no duplicates.
merged_df = pd.merge(
    top10, culture_clean, left_on="지역명", right_on="C1_NM", how="inner"
).sort_values(by="top5_score", ascending=False)

facility_values = merged_df["문화시설수"]
score_values = merged_df["top5_score"]
facility_median = facility_values.median()
score_median = score_values.median()

# Scatter plot: facilities (x) against inflow score (y)
plt.figure(figsize=(12, 8))
plt.scatter(facility_values, score_values, s=150, alpha=0.6, color="steelblue")

# Region label next to every point
for x_pos, y_pos, region in zip(facility_values, score_values, merged_df["지역명"]):
    plt.text(x_pos, y_pos, region, fontsize=11, ha="left", va="bottom")

# Reference lines at the two medians
plt.axhline(
    score_median,
    linestyle="--",
    color="red",
    alpha=0.5,
    label=f"청년유입 점수 중앙값: {score_median:.2f}",
)
plt.axvline(
    facility_median,
    linestyle="--",
    color="blue",
    alpha=0.5,
    label=f"문화시설수 중앙값: {facility_median:.2f}",
)

plt.xlabel("인구 10만명당 문화기반시설 수", fontsize=12)
plt.ylabel("청년유입 점수", fontsize=12)
plt.title("청년유입 상위 지역의 문화시설 수", fontsize=14, fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 공공도서관

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the public-library sheet and count rows per ("Unnamed: 1", C1_NM)
# pair, i.e. one count per group of the two unnamed leading columns.
library_df = (
    pd.read_excel(
        "./data/2025 전국 문화기반시설 총람.xlsx",
        sheet_name="공공도서관",
        skiprows=2,
        header=3,
    )
    .rename(columns={"Unnamed: 2": "C1_NM"})
    .groupby(["Unnamed: 1", "C1_NM"])
    .size()
    .reset_index(name="공공도서관수")
)

# Korean font setup; also keep the minus sign from using the Unicode glyph.
plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Join top10 with library_df on region name, ordered by top5_score.
# NOTE(review): library_df is grouped by two columns but joined on C1_NM only,
# so a name present under several "Unnamed: 1" values matches repeatedly.
merged_df = pd.merge(
    top10, library_df, left_on="지역명", right_on="C1_NM", how="inner"
).sort_values(by="top5_score", ascending=False)

library_counts = merged_df["공공도서관수"]
score_values = merged_df["top5_score"]
library_median = library_counts.median()
score_median = score_values.median()

# Scatter plot: library count (x) against inflow score (y)
plt.figure(figsize=(12, 8))
plt.scatter(library_counts, score_values, s=150, alpha=0.6, color="steelblue")

# Region label next to every point
for x_pos, y_pos, region in zip(library_counts, score_values, merged_df["지역명"]):
    plt.text(x_pos, y_pos, region, fontsize=11, ha="left", va="bottom")

# Reference lines at the two medians
plt.axhline(
    score_median,
    linestyle="--",
    color="red",
    alpha=0.5,
    label=f"청년유입 점수 중앙값: {score_median:.2f}",
)
plt.axvline(
    library_median,
    linestyle="--",
    color="blue",
    alpha=0.5,
    label=f"공공도서관수 중앙값: {library_median:.2f}",
)

plt.xlabel("공공도서관 수", fontsize=12)
plt.ylabel("청년유입 점수", fontsize=12)
plt.title("청년유입 상위 지역의 공공도서관 수", fontsize=14, fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 생활문화센터

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the community-culture-centre sheet and count rows per
# ("Unnamed: 1", C1_NM) pair.
culture_center_df = (
    pd.read_excel(
        "./data/2025 전국 문화기반시설 총람.xlsx",
        sheet_name="생활문화센터",
        skiprows=2,
        header=2,
    )
    .rename(columns={"Unnamed: 2": "C1_NM"})
    .groupby(["Unnamed: 1", "C1_NM"])
    .size()
    .reset_index(name="생활문화센터수")
)

# Korean font setup; also keep the minus sign from using the Unicode glyph.
plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Join top10 with culture_center_df on region name, ordered by top5_score.
# NOTE(review): the frame is grouped by two columns but joined on C1_NM only,
# so a name present under several "Unnamed: 1" values matches repeatedly.
merged_df = pd.merge(
    top10, culture_center_df, left_on="지역명", right_on="C1_NM", how="inner"
).sort_values(by="top5_score", ascending=False)

center_counts = merged_df["생활문화센터수"]
score_values = merged_df["top5_score"]
center_median = center_counts.median()
score_median = score_values.median()

# Scatter plot: centre count (x) against inflow score (y)
plt.figure(figsize=(12, 8))
plt.scatter(center_counts, score_values, s=150, alpha=0.6, color="steelblue")

# Region label next to every point
for x_pos, y_pos, region in zip(center_counts, score_values, merged_df["지역명"]):
    plt.text(x_pos, y_pos, region, fontsize=11, ha="left", va="bottom")

# Reference lines at the two medians
plt.axhline(
    score_median,
    linestyle="--",
    color="red",
    alpha=0.5,
    label=f"청년유입 점수 중앙값: {score_median:.2f}",
)
plt.axvline(
    center_median,
    linestyle="--",
    color="blue",
    alpha=0.5,
    label=f"생활문화센터수 중앙값: {center_median:.2f}",
)

plt.xlabel("생활문화센터 수", fontsize=12)
plt.ylabel("청년유입 점수", fontsize=12)
plt.title("청년유입 상위 지역의 생활문화센터 수", fontsize=14, fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 영화관

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count theater rows per (sido_nm, sgg_nm) pair.
theater_df = (
    pd.read_csv("./data/KC_497_DMSTC_MCST_THEART_2025.csv")
    .groupby(["sido_nm", "sgg_nm"])
    .size()
    .reset_index(name="영화관수")
)

# Normalise the district name into C1_NM:
# "○○시○○구" -> "○○시" (only when the name ends in a 구 after a 시).
theater_use = theater_df.assign(
    C1_NM=theater_df["sgg_nm"].str.replace(r"(.*?시).+구$", r"\1", regex=True)
)

# Sum the theater counts per normalised city/county name.
theater_df2 = theater_use.groupby("C1_NM", as_index=False)["영화관수"].sum()

# Korean font setup; also keep the minus sign from using the Unicode glyph.
plt.rcParams.update({"font.family": "Malgun Gothic", "axes.unicode_minus": False})

# Join top10 with theater_df2 on region name, ordered by top5_score.
merged_df = pd.merge(
    top10, theater_df2, left_on="지역명", right_on="C1_NM", how="inner"
).sort_values(by="top5_score", ascending=False)

theater_counts = merged_df["영화관수"]
score_values = merged_df["top5_score"]
theater_median = theater_counts.median()
score_median = score_values.median()

# Scatter plot: theater count (x) against inflow score (y)
plt.figure(figsize=(12, 8))
plt.scatter(theater_counts, score_values, s=150, alpha=0.6, color="steelblue")

# Region label next to every point
for x_pos, y_pos, region in zip(theater_counts, score_values, merged_df["지역명"]):
    plt.text(x_pos, y_pos, region, fontsize=11, ha="left", va="bottom")

# Reference lines at the two medians
plt.axhline(
    score_median,
    linestyle="--",
    color="red",
    alpha=0.5,
    label=f"청년유입 점수 중앙값: {score_median:.2f}",
)
plt.axvline(
    theater_median,
    linestyle="--",
    color="blue",
    alpha=0.5,
    label=f"영화관수 중앙값: {theater_median:.2f}",
)

plt.xlabel("영화관 수", fontsize=12)
plt.ylabel("청년유입 점수", fontsize=12)
plt.title("청년유입 상위 지역의 영화관 수", fontsize=14, fontweight="bold")
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# 2. 유튜브 댓글 크롤링 후 워드 클라우드, LDA 생성

## https://www.youtube.com/watch?v=kxkacVPgTIk

### 유튜브 크롤링

In [None]:
# Download the top-level comments of one YouTube video through the
# commentThreads endpoint, following pagination, into DataFrame `df`.

# Imported here so the cell does not depend on imports from earlier cells.
import os

import requests

baseUrl = "https://www.googleapis.com/youtube/v3/commentThreads"

part = "snippet"
videoId = "kxkacVPgTIk"
key = os.getenv("YOUTUBE_API_KEY")
maxResults = 100
textFormat = "plainText"
nextPageToken = None

dataList = []
max_rep = 50  # upper bound on the number of pages requested

for _ in range(max_rep):
    query = {
        "part": part,
        "videoId": videoId,
        "key": key,
        "maxResults": maxResults,
        "textFormat": textFormat,
    }
    # Without pageToken every request returns the first page again, which
    # would duplicate the same comments on each iteration.
    if nextPageToken is not None:
        query["pageToken"] = nextPageToken

    response = requests.get(baseUrl, params=query, timeout=30)
    if response.status_code != 200:
        # Repeating the identical request would only burn quota; stop here.
        print(f"YouTube API request failed with status {response.status_code}")
        break

    data = response.json()
    dataList.extend(
        item["snippet"]["topLevelComment"]["snippet"]
        for item in data.get("items", [])
    )

    nextPageToken = data.get("nextPageToken")
    if nextPageToken is None:
        break

print(len(dataList))

df = pd.DataFrame(dataList)

### 워드클라우드

In [None]:
import re
from kiwipiepy import Kiwi
from collections import Counter

# Collect common nouns (tag "NNG", longer than one character) from every
# comment and count their frequencies.
kiwi = Kiwi()
word_list = []

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
# Compiled once because the pattern is the same for every comment.
non_word_pattern = re.compile(r"[^0-9a-zA-Z가-힣\s]")

for sent in df["textOriginal"]:
    # Drop everything except digits, Latin letters, Hangul syllables and
    # whitespace before analysis.
    clean_sent = non_word_pattern.sub("", sent)
    # [0][0] is the token list of the first analysis result.
    tokens = kiwi.analyze(clean_sent)[0][0]
    word_list.extend(
        token.form for token in tokens if token.tag == "NNG" and len(token.form) > 1
    )

print(len(word_list))

counter = Counter(word_list)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import koreanize_matplotlib

# Render a word cloud from the noun frequencies in `counter`.
wc = WordCloud(
    # Raw string: backslashes in a Windows path are otherwise read as
    # (invalid) escape sequences. The resulting path is unchanged.
    font_path=r"C:\Windows\Fonts\malgun.ttf",
    background_color="white",
    width=800,
    height=400,
    prefer_horizontal=1.0,
)

wc.generate_from_frequencies(counter)

plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

### LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix for LDA with one document per comment.
import re

count_vectorizer = CountVectorizer(
    max_df=0.1, min_df=2, max_features=1000, ngram_range=(1, 2)
)

# CountVectorizer treats every list element as one document. Passing the flat
# noun list would make each document a single word, so no bigram could form
# and LDA would see no word co-occurrence. Rebuild the nouns per comment.
noun_docs = []
for comment in df["textOriginal"]:
    cleaned = re.sub(r"[^0-9a-zA-Z가-힣\s]", "", comment)
    nouns = [
        tok.form
        for tok in kiwi.analyze(cleaned)[0][0]
        if tok.tag == "NNG" and len(tok.form) > 1
    ]
    if nouns:  # comments without any usable noun carry no signal
        noun_docs.append(" ".join(nouns))

feat_vec = count_vectorizer.fit_transform(noun_docs)
feat_vec.shape

In [None]:
import pandas as pd

# Dense view of the document-term matrix, one column per vocabulary term,
# for a quick look at the first rows.
term_names = count_vectorizer.get_feature_names_out()
dense_counts = feat_vec.toarray()
df_vec = pd.DataFrame(dense_counts, columns=term_names)
df_vec.head()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 3-topic LDA model on the document-term matrix. A fixed random_state
# makes the topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(feat_vec)

In [None]:
import pyLDAvis.lda_model

# Enable notebook output, prepare the visualisation from the fitted LDA model,
# its document-term matrix and the vectorizer, then display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.lda_model.prepare(lda, feat_vec, count_vectorizer)
pyLDAvis.display(vis)

In [322]:
import pyLDAvis

# Write the prepared visualisation (`vis`) to an HTML file.
pyLDAvis.save_html(vis, "lda_vis.html")

## https://www.youtube.com/watch?v=ZghVVrcYAuc

### 유튜브 크롤링

In [None]:
import os
import requests

# Download the top-level comments of one YouTube video through the
# commentThreads endpoint, following pagination, into DataFrame `df`.
baseUrl = "https://www.googleapis.com/youtube/v3/commentThreads"

part = "snippet"
videoId = "ZghVVrcYAuc"
key = os.getenv("YOUTUBE_API_KEY")
maxResults = 100
textFormat = "plainText"
nextPageToken = None

dataList2 = []
max_rep = 50  # upper bound on the number of pages requested

for _ in range(max_rep):
    query = {
        "part": part,
        "videoId": videoId,
        "key": key,
        "maxResults": maxResults,
        "textFormat": textFormat,
    }
    # Without pageToken every request returns the first page again, which
    # would duplicate the same comments on each iteration.
    if nextPageToken is not None:
        query["pageToken"] = nextPageToken

    response = requests.get(baseUrl, params=query, timeout=30)
    if response.status_code != 200:
        # Repeating the identical request would only burn quota; stop here.
        print(f"YouTube API request failed with status {response.status_code}")
        break

    data = response.json()
    dataList2.extend(
        item["snippet"]["topLevelComment"]["snippet"]
        for item in data.get("items", [])
    )

    nextPageToken = data.get("nextPageToken")
    if nextPageToken is None:
        break

print(len(dataList2))

df = pd.DataFrame(dataList2)

### 워드클라우드

In [None]:
import re
from kiwipiepy import Kiwi
from collections import Counter

# Collect common nouns (tag "NNG", longer than one character) from every
# comment and count their frequencies.
kiwi = Kiwi()
word_list2 = []

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
# Compiled once because the pattern is the same for every comment.
non_word_pattern = re.compile(r"[^0-9a-zA-Z가-힣\s]")

for sent in df["textOriginal"]:
    # Drop everything except digits, Latin letters, Hangul syllables and
    # whitespace before analysis.
    clean_sent = non_word_pattern.sub("", sent)
    # [0][0] is the token list of the first analysis result.
    tokens = kiwi.analyze(clean_sent)[0][0]
    word_list2.extend(
        token.form for token in tokens if token.tag == "NNG" and len(token.form) > 1
    )

print(len(word_list2))

counter2 = Counter(word_list2)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import koreanize_matplotlib

# Render a word cloud from the noun frequencies in `counter2`.
wc = WordCloud(
    # Raw string: backslashes in a Windows path are otherwise read as
    # (invalid) escape sequences. The resulting path is unchanged.
    font_path=r"C:\Windows\Fonts\malgun.ttf",
    background_color="white",
    width=800,
    height=400,
    prefer_horizontal=1.0,
)

wc.generate_from_frequencies(counter2)

plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

### LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix for LDA with one document per comment.
import re

count_vectorizer = CountVectorizer(
    max_df=0.1, min_df=2, max_features=1000, ngram_range=(1, 2)
)

# CountVectorizer treats every list element as one document. Passing the flat
# noun list would make each document a single word, so no bigram could form
# and LDA would see no word co-occurrence. Rebuild the nouns per comment.
noun_docs2 = []
for comment in df["textOriginal"]:
    cleaned = re.sub(r"[^0-9a-zA-Z가-힣\s]", "", comment)
    nouns = [
        tok.form
        for tok in kiwi.analyze(cleaned)[0][0]
        if tok.tag == "NNG" and len(tok.form) > 1
    ]
    if nouns:  # comments without any usable noun carry no signal
        noun_docs2.append(" ".join(nouns))

feat_vec = count_vectorizer.fit_transform(noun_docs2)
feat_vec.shape

In [None]:
import pandas as pd

# Dense view of the document-term matrix, one column per vocabulary term,
# for a quick look at the first rows.
term_names = count_vectorizer.get_feature_names_out()
dense_counts = feat_vec.toarray()
df_vec = pd.DataFrame(dense_counts, columns=term_names)
df_vec.head()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 3-topic LDA model on the document-term matrix. A fixed random_state
# makes the topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(feat_vec)

In [None]:
import pyLDAvis.lda_model

# Enable notebook output, prepare the visualisation from the fitted LDA model,
# its document-term matrix and the vectorizer, then display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.lda_model.prepare(lda, feat_vec, count_vectorizer)
pyLDAvis.display(vis)

In [334]:
import pyLDAvis

# Write the prepared visualisation (`vis`) to an HTML file.
pyLDAvis.save_html(vis, "lda_vis2.html")

## https://www.youtube.com/watch?v=PfseJBL9Vl0

### 유튜브 크롤링

In [None]:
import os
import requests

# Download the top-level comments of one YouTube video through the
# commentThreads endpoint, following pagination, into DataFrame `df`.
baseUrl = "https://www.googleapis.com/youtube/v3/commentThreads"

part = "snippet"
videoId = "PfseJBL9Vl0"
key = os.getenv("YOUTUBE_API_KEY")
maxResults = 100
textFormat = "plainText"
nextPageToken = None

dataList3 = []
max_rep = 50  # upper bound on the number of pages requested

for _ in range(max_rep):
    query = {
        "part": part,
        "videoId": videoId,
        "key": key,
        "maxResults": maxResults,
        "textFormat": textFormat,
    }
    # Without pageToken every request returns the first page again, which
    # would duplicate the same comments on each iteration.
    if nextPageToken is not None:
        query["pageToken"] = nextPageToken

    response = requests.get(baseUrl, params=query, timeout=30)
    if response.status_code != 200:
        # Repeating the identical request would only burn quota; stop here.
        print(f"YouTube API request failed with status {response.status_code}")
        break

    data = response.json()
    dataList3.extend(
        item["snippet"]["topLevelComment"]["snippet"]
        for item in data.get("items", [])
    )

    nextPageToken = data.get("nextPageToken")
    if nextPageToken is None:
        break

print(len(dataList3))

df = pd.DataFrame(dataList3)

### 워드클라우드

In [None]:
from konlpy.tag import Okt
import re
from kiwipiepy import Kiwi
import re

# Collect common nouns (tag "NNG", longer than one character) from every
# comment and count their frequencies.

# This cell uses Counter but does not import it at the top, unlike the
# equivalent cells for the other videos; import it here so it runs on its own.
from collections import Counter

kiwi = Kiwi()
word_list3 = []

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
# Compiled once because the pattern is the same for every comment.
non_word_pattern = re.compile(r"[^0-9a-zA-Z가-힣\s]")

for sent in df["textOriginal"]:
    # Drop everything except digits, Latin letters, Hangul syllables and
    # whitespace before analysis.
    clean_sent = non_word_pattern.sub("", sent)
    # [0][0] is the token list of the first analysis result.
    tokens = kiwi.analyze(clean_sent)[0][0]
    word_list3.extend(
        token.form for token in tokens if token.tag == "NNG" and len(token.form) > 1
    )

print(len(word_list3))

counter3 = Counter(word_list3)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import koreanize_matplotlib

# Render a word cloud from the noun frequencies in `counter3`.
wc = WordCloud(
    # Raw string: backslashes in a Windows path are otherwise read as
    # (invalid) escape sequences. The resulting path is unchanged.
    font_path=r"C:\Windows\Fonts\malgun.ttf",
    background_color="white",
    width=800,
    height=400,
    prefer_horizontal=1.0,
)

wc.generate_from_frequencies(counter3)

plt.figure(figsize=(8, 8))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()

### LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix for LDA with one document per comment.
import re

count_vectorizer = CountVectorizer(
    max_df=0.1, min_df=2, max_features=1000, ngram_range=(1, 2)
)

# CountVectorizer treats every list element as one document. Passing the flat
# noun list would make each document a single word, so no bigram could form
# and LDA would see no word co-occurrence. Rebuild the nouns per comment.
noun_docs3 = []
for comment in df["textOriginal"]:
    cleaned = re.sub(r"[^0-9a-zA-Z가-힣\s]", "", comment)
    nouns = [
        tok.form
        for tok in kiwi.analyze(cleaned)[0][0]
        if tok.tag == "NNG" and len(tok.form) > 1
    ]
    if nouns:  # comments without any usable noun carry no signal
        noun_docs3.append(" ".join(nouns))

feat_vec = count_vectorizer.fit_transform(noun_docs3)
feat_vec.shape

In [None]:
import pandas as pd

# Dense view of the document-term matrix, one column per vocabulary term,
# for a quick look at the first rows.
term_names = count_vectorizer.get_feature_names_out()
dense_counts = feat_vec.toarray()
df_vec = pd.DataFrame(dense_counts, columns=term_names)
df_vec.head()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 3-topic LDA model on the document-term matrix. A fixed random_state
# makes the topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(feat_vec)

In [None]:
import pyLDAvis.lda_model

# Enable notebook output, prepare the visualisation from the fitted LDA model,
# its document-term matrix and the vectorizer, then display it.
pyLDAvis.enable_notebook()
vis = pyLDAvis.lda_model.prepare(lda, feat_vec, count_vectorizer)
pyLDAvis.display(vis)

In [344]:
import pyLDAvis

# Write the prepared visualisation (`vis`) to an HTML file.
pyLDAvis.save_html(vis, "lda_vis3.html")