In [None]:
# Mount Google Drive at /content/drive; the data directories used later in
# this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.4.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━

In [None]:
import polars as pl
from tqdm import tqdm
import os

In [None]:
# Module-level translation client; the translate_* helper functions below
# call translator.translate(...) on it.
from googletrans import Translator
translator = Translator()

# Util functions

In [None]:
def translate_str_column(df, col, batch_size=200):
    """Add a lower-cased Vietnamese translation of a string column as ``{col}_trans``.

    Each distinct non-empty value of ``df[col]`` is passed to the module-level
    ``translator`` only once (in batches of ``batch_size``) and the result is
    then mapped back onto every row, so repeated values are not re-translated.
    Null and empty-string cells are never sent to the translator; they are
    copied into the new column unchanged.

    Args:
        df: polars DataFrame containing ``col``.
        col: Name of the string column to translate.
        batch_size: Number of distinct texts passed to ``translator.translate``
            per call.

    Returns:
        A new DataFrame with the additional string column ``{col}_trans``.
        The source column itself is left as it was.
    """
    # Null / empty cells map to themselves and are excluded from translation.
    mapping = {None: None, '': ''}

    # Translate every distinct value once; maintain_order keeps runs reproducible.
    unique_texts = [
        text for text in df[col].unique(maintain_order=True).to_list()
        if text not in mapping
    ]

    for i in tqdm(range(0, len(unique_texts), batch_size)):
        texts_batch = unique_texts[i: i + batch_size]
        trans_batch = translator.translate(texts_batch, dest='vi')
        # Results are paired with inputs by position, so the lengths must agree.
        assert len(trans_batch) == len(texts_batch)

        for text, trans in zip(texts_batch, trans_batch):
            mapping[text] = trans.text.lower()

    trans_texts = [mapping[text] for text in df[col].to_list()]
    assert len(trans_texts) == df.shape[0]

    # Explicit dtype so an all-null / all-empty column still yields a string column.
    df = df.with_columns(
        pl.Series(name=f"{col}_trans", values=trans_texts, dtype=pl.Utf8)
    )

    return df

def translate_cate_column(df, col, batch_size=200):
    """Replace the values of a categorical string column with Vietnamese translations.

    The column is whitespace-stripped, its distinct non-empty values are
    translated once through the module-level ``translator`` (in batches of
    ``batch_size``, lower-cased), and every cell is then replaced by its
    translation. Null and empty-string values are not translated.

    Args:
        df: polars DataFrame containing ``col``.
        col: Name of the string column holding the categories.
        batch_size: Number of categories passed to ``translator.translate``
            per call.

    Returns:
        A new DataFrame in which ``col`` holds the translated categories.
    """
    # Strip surrounding whitespace so variants of one category share a key.
    # (str.strip_chars is the non-deprecated name of str.strip.)
    df = df.with_columns(pl.col(col).str.strip_chars())

    # Null / empty values map to themselves and are never sent for translation.
    mapping = {None: None, '': ''}

    # Collect the distinct categories that actually need translating.
    categories = [
        cate for cate in df[col].unique(maintain_order=True).to_list()
        if cate not in mapping
    ]

    # Translate the categories batch by batch and build the mapping.
    for i in tqdm(range(0, len(categories), batch_size)):
        batch_cates = categories[i : i + batch_size]
        trans_cates = translator.translate(batch_cates, dest='vi')
        # Results are paired with inputs by position, so the lengths must agree.
        assert len(batch_cates) == len(trans_cates)

        for cate, trans in zip(batch_cates, trans_cates):
            mapping[cate] = trans.text.lower()

    # Map each original value to its Vietnamese translation; the explicit
    # return_dtype avoids relying on dtype inference from the first values.
    df = df.with_columns(
        pl.col(col).map_elements(lambda x: mapping[x], return_dtype=pl.Utf8)
    )

    return df

def translate_cate_list_column(df, col, batch_size=500):
    """Replace every category inside a list-of-strings column with its Vietnamese translation.

    Each list element is whitespace-stripped, the distinct non-empty
    categories across all rows are translated once through the module-level
    ``translator`` (in batches of ``batch_size``, lower-cased), and each list
    is rebuilt from the translated values. Null and empty-string elements are
    not translated.

    Args:
        df: polars DataFrame containing ``col``.
        col: Name of the list[str] column holding the categories.
        batch_size: Number of categories passed to ``translator.translate``
            per call.

    Returns:
        A new DataFrame in which ``col`` holds lists of translated categories.
    """
    # Strip surrounding whitespace of every list element.
    # (str.strip_chars is the non-deprecated name of str.strip.)
    df = df.with_columns(pl.col(col).list.eval(pl.element().str.strip_chars()))

    # Null / empty values map to themselves and are never sent for translation.
    mapping = {None: None, '': ''}

    # Flatten the lists and keep the distinct categories. explode() turns
    # empty or null lists into nulls, which the mapping filter drops.
    categories = [
        cate for cate in df[col].explode().unique(maintain_order=True).to_list()
        if cate not in mapping
    ]

    # Translate the categories batch by batch and build the mapping.
    for i in tqdm(range(0, len(categories), batch_size)):
        batch_cates = categories[i : i + batch_size]
        trans_cates = translator.translate(batch_cates, dest='vi')
        # Results are paired with inputs by position, so the lengths must agree.
        assert len(batch_cates) == len(trans_cates)

        for cate, trans in zip(batch_cates, trans_cates):
            mapping[cate] = trans.text.lower()

    # Rebuild each list from the translated values; the explicit return_dtype
    # keeps the column list[str] even when the first rows are empty lists.
    df = df.with_columns(
        pl.col(col).map_elements(
            lambda row_cates: [mapping[cate] for cate in row_cates],
            return_dtype=pl.List(pl.Utf8),
        )
    )

    return df

# Translation

In [None]:
# Project folder on the mounted Drive; both data directories hang off it, so
# the long absolute path is written only once.
_BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/Nhom_4/2. Thực Hành/Visualize Data'
# Input files are read from RAW_DIR; translated outputs are written to TRANS_DIR.
RAW_DIR = os.path.join(_BASE_DIR, 'Data')
TRANS_DIR = os.path.join(_BASE_DIR, 'Translated Data')

## 1. Course

In [None]:
# Load the raw course entities from RAW_DIR, like every other load cell in
# this notebook, instead of a path relative to the current working directory.
course_df = pl.read_ndjson(os.path.join(RAW_DIR, 'entities/course.json'))
course_df.head()

id,name,field,prerequisites,about,resource
str,str,list[str],str,str,list[struct[3]]
"""C_584313""","""《资治通鉴》导读""","[""历史学"", ""中国语言文学""]","""""","""通过老师导读，同学们可深入这…","[{[""第一课 导论与三家分晋"", ""导论"", ""导论""],""V_849"",""1.1.1""}, {[""第一课 导论与三家分晋"", ""智伯的覆亡"", ""智伯的覆亡""],""V_850"",""1.2.1""}, … {[""第十五课 隋唐霸业"", null, ""第十五课 隋唐霸业--习题""],""Ex_957"",""15.8""}]"
"""C_584329""","""微积分——极限理论与一元函数…","[""应用经济学"", ""数学"", … ""理论经济学""]","""""","""本课程是理工科的一门数学基础…","[{[""序言"", ""序言"", ""序言""],""V_1350"",""1.1.1""}, {[""第一章 实数与函数"", ""第一节 实数集的界与确界"", ""实数集的界""],""V_1351"",""2.1.1""}, … {[""第八章 级数"", null, ""第八章 级数--第六节思考与练习""],""Ex_1545"",""9.9""}]"
"""C_584381""","""新闻摄影""","[""艺术学"", ""新闻传播学""]","""""","""掌握基本的摄影技能，了解图片…","[{[""第一章 绪论"", ""第一讲 引言1"", ""引言1""],""V_1800"",""1.1.1""}, {[""第一章 绪论"", ""第二讲 引言2"", ""引言2""],""V_1801"",""1.2.1""}, … {[""大作业提交"", null, ""《大作业》提交--小节""],""Ex_1926"",""20.4""}]"
"""C_597208""","""数据挖掘：理论与算法""","[""计算机科学与技术""]","""""","""最有趣的理论+最有用的算法=…","[{[""走进数据科学：博大精深，美不胜收"", ""整装待发"", ""Video""],""V_2961"",""1.1.1""}, {[""走进数据科学：博大精深，美不胜收"", ""学而不思则罔"", ""Video""],""V_2962"",""1.3.1""}, … {[""美丽数据说：阆苑仙葩，美玉无瑕"", null, ""第十一章第一节测试题""],""Ex_3104"",""11.1""}]"
"""C_597225""","""大学计算机""",[],"""""","""大学计算机课程将以计算思维为…","[{[""第1周： 基于计算机的问题求解"", ""课程介绍"", ""开篇""],""V_4596"",""1.1.1""}, {[""第1周： 基于计算机的问题求解"", ""1.0 本章导学"", ""1.0 本章导学""],""V_4597"",""1.2.1""}, … {[""第9周：算法与程序设计"", null, ""第九周测验""],""Ex_4827"",""10.12""}]"


In [None]:
# Add name_trans / about_trans columns, then translate the entries of the
# `field` lists (that column is overwritten with the translated values).
course_df = translate_str_column(course_df, 'name')
course_df = translate_str_column(course_df, 'about')
course_df = translate_cate_list_column(course_df, 'field')

100%|██████████| 19/19 [02:51<00:00,  9.03s/it]
100%|██████████| 19/19 [06:45<00:00, 21.34s/it]
  df = df.with_columns(pl.col(col).list.eval(pl.element().str.strip()))
100%|██████████| 1/1 [00:03<00:00,  3.83s/it]


In [None]:
# Inspect the translated course frame.
course_df

id,name,field,prerequisites,about,resource,name_trans,about_trans
str,str,list[str],str,str,list[struct[3]],str,str
"""C_584313""","""《资治通鉴》导读""","[""lịch sử"", ""trường dạy tiếng trung""]","""""","""通过老师导读，同学们可深入这…","[{[""第一课 导论与三家分晋"", ""导论"", ""导论""],""V_849"",""1.1.1""}, {[""第一课 导论与三家分晋"", ""智伯的覆亡"", ""智伯的覆亡""],""V_850"",""1.2.1""}, … {[""第十五课 隋唐霸业"", null, ""第十五课 隋唐霸业--习题""],""Ex_957"",""15.8""}]","""giới thiệu về …","""thông qua sự h…"
"""C_584329""","""微积分——极限理论与一元函数…","[""kinh tế học ứng dụng"", ""toán học"", … ""kinh tế lý thuyết""]","""""","""本课程是理工科的一门数学基础…","[{[""序言"", ""序言"", ""序言""],""V_1350"",""1.1.1""}, {[""第一章 实数与函数"", ""第一节 实数集的界与确界"", ""实数集的界""],""V_1351"",""2.1.1""}, … {[""第八章 级数"", null, ""第八章 级数--第六节思考与练习""],""Ex_1545"",""9.9""}]","""giải tích - lý…","""khóa học này l…"
"""C_584381""","""新闻摄影""","[""nghệ thuật"", ""báo chí""]","""""","""掌握基本的摄影技能，了解图片…","[{[""第一章 绪论"", ""第一讲 引言1"", ""引言1""],""V_1800"",""1.1.1""}, {[""第一章 绪论"", ""第二讲 引言2"", ""引言2""],""V_1801"",""1.2.1""}, … {[""大作业提交"", null, ""《大作业》提交--小节""],""Ex_1926"",""20.4""}]","""chụp ảnh tin t…","""nắm vững các k…"
"""C_597208""","""数据挖掘：理论与算法""","[""khoa học và công nghệ máy tính""]","""""","""最有趣的理论+最有用的算法=…","[{[""走进数据科学：博大精深，美不胜收"", ""整装待发"", ""Video""],""V_2961"",""1.1.1""}, {[""走进数据科学：博大精深，美不胜收"", ""学而不思则罔"", ""Video""],""V_2962"",""1.3.1""}, … {[""美丽数据说：阆苑仙葩，美玉无瑕"", null, ""第十一章第一节测试题""],""Ex_3104"",""11.1""}]","""khai thác dữ l…","""lý thuyết thú …"
"""C_597225""","""大学计算机""",[],"""""","""大学计算机课程将以计算思维为…","[{[""第1周： 基于计算机的问题求解"", ""课程介绍"", ""开篇""],""V_4596"",""1.1.1""}, {[""第1周： 基于计算机的问题求解"", ""1.0 本章导学"", ""1.0 本章导学""],""V_4597"",""1.2.1""}, … {[""第9周：算法与程序设计"", null, ""第九周测验""],""Ex_4827"",""10.12""}]","""máy tính đại h…","""các khóa học m…"
"""C_597229""","""财务分析与决策""","[""kinh tế học ứng dụng"", ""khoa học và kỹ thuật quản lý""]","""""","""这门课程用财务语言解构企业的…","[{[""资金的运用——认识资产"", ""1.1 绪论"", ""绪论""],""V_5042"",""2.1.1""}, {[""资金的运用——认识资产"", ""1.2 认识资产负债表"", ""认识资产负债表""],""V_5043"",""2.2.1""}, … {[""期末大作业——主观题"", null, ""期末主观题""],""Ex_5144"",""12.2""}]","""phân tích tài …","""khóa học này s…"
"""C_597291""","""高级英语写作""",[],"""""","""本课程能够帮助学生掌握英语段…","[{[""Chapter One Paragraph Writing"", ""1.1	Parts of a Paragraph"", ""1.1	Parts of a Paragraph""],""V_8379"",""1.1.1""}, {[""Chapter One Paragraph Writing"", ""1.2 Four Steps in Writing"", ""1.2.1 Step 1- Begin with a point""],""V_8380"",""1.2.1""}, … {[""Chapter Five Research Paper Writing"", null, ""Chapter Five Research Paper Writing""],""Ex_8432"",""5.3""}]","""viết tiếng anh…","""khóa học này c…"
"""C_597307""","""大唐兴衰""","[""lịch sử""]","""""","""隋唐五代史是史学名著《资治通…","[{[""第一课、隋朝开基"", ""第一节 隋帝杨坚"", ""第一节 隋帝杨坚""],""V_9394"",""1.1.1""}, {[""第一课、隋朝开基"", ""第二节 杨隋代周"", ""第二节 杨隋代周""],""V_9395"",""1.2.1""}, … {[""第十二课、落日长安"", null, ""第十二课、落日长安--习题""],""Ex_9464"",""12.5""}]","""sự thăng trầm …","""lịch sử nhà tù…"
"""C_597365""","""五分钟轻松搞定职场礼仪（20…",[],"""""","""职场“礼”为先，成功的未来不…","[{[""课程介绍动画：职场“礼”为先，成功的未来不是梦"", ""课程介绍动画"", ""课程介绍动画 — 职场“礼”为先，成功的未来不是梦""],""V_15713"",""1.1.1""}, {[""第一章 塑造优雅、大方、可亲的职业形象"", ""第一节 坐姿篇"", ""本讲导学 坐姿""],""V_15714"",""2.1.1""}, … {[""第五章 应对繁杂、多变、细致的文化差异"", ""第四节 欧美篇"", ""授课视频 讲义 — 欧美各国礼仪""],""V_15800"",""6.4.1""}]","""dễ dàng nắm vữ…","""“nghi thức” đư…"
"""C_597367""","""时尚化妆造型（2018秋）""",[],"""""","""针对爱美人士讲解时尚生活、新…","[{[""第一章 化妆基础"", ""1.1 化妆品与化妆工具"", ""Video""],""V_15849"",""1.1.1""}, {[""第一章 化妆基础"", ""1.1 化妆品与化妆工具"", ""Video""],""V_15850"",""1.1.2""}, … {[""第五章 时尚造型"", ""5.8 网拍广告妆"", ""Video""],""V_15872"",""6.8.2""}]","""trang điểm và …","""đối với những …"


In [None]:
# Persist the translated course table as NDJSON under TRANS_DIR.
course_df.write_ndjson(os.path.join(TRANS_DIR, 'entities/course.json'))

## 2. Field

In [None]:
# Load the raw course-field relation file and preview the first rows.
course_field_df = pl.read_ndjson(os.path.join(RAW_DIR, 'relations/course-field.json'))
course_field_df.head(5)

course_id,course_name,field
i64,str,list[str]
584313,"""《资治通鉴》导读""","[""中国语言文学"", ""历史学""]"
681932,"""“做中学”Java程序设计""","[""计算机科学与技术""]"
674962,"""《红楼梦》的空间艺术""","[""中国语言文学""]"
682709,"""《纯粹理性批判》导论""","[""哲学""]"
682635,"""《统万城》导读""","[""历史学""]"


In [None]:
# Add course_name_trans, translate the `field` lists in place, and save the
# result under TRANS_DIR.
course_field_df = translate_str_column(course_field_df, 'course_name')
course_field_df = translate_cate_list_column(course_field_df, 'field')
course_field_df.write_ndjson(os.path.join(TRANS_DIR, 'relations/course-field.json'))

100%|██████████| 4/4 [00:29<00:00,  7.49s/it]
  df = df.with_columns(pl.col(col).list.eval(pl.element().str.strip()))
100%|██████████| 1/1 [00:03<00:00,  3.67s/it]


## 3. Concept

In [None]:
# Load the raw concept entities and preview the first rows.
concept_df = pl.read_ndjson(os.path.join(RAW_DIR, 'entities/concept.json'))
concept_df.head(5)

id,name,context
str,str,list[str]
"""K_神经部_组织学与胚胎学""","""神经部""",[]
"""K_促甲状腺激素细胞_组织学…","""促甲状腺激素细胞""","[""质和髓质两部分组成 答案：B 13．腺垂体嗜酸性细胞可分为 A．催乳激素细胞、促肾上腺皮质激素细胞和促甲状腺激素细胞 B．生长激素细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺"", ""素细胞和促甲状腺激素细胞 B．生长激素细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺激素细胞 D．生长激素细胞、催乳激素细胞 E．催乳激素细胞、促甲状腺激素细胞和促性腺激素细胞"", … ""质和髓质两部分组成 答案：B 13．腺垂体嗜酸性细胞可分为 A．催乳激素细胞、促肾上腺皮质激素细胞和促甲状腺激素细胞 B．生长激素细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺""]"
"""K_嗜色细胞_组织学与胚胎学…","""嗜色细胞""",[]
"""K_生长激素细胞_组织学与胚…","""生长激素细胞""","[""案：B 13．腺垂体嗜酸性细胞可分为 A．催乳激素细胞、促肾上腺皮质激素细胞和促甲状腺激素细胞 B．生长激素细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺激素细胞 D．生长"", ""细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺激素细胞 D．生长激素细胞、催乳激素细胞 E．催乳激素细胞、促甲状腺激素细胞和促性腺激素细胞 答案：D 14．腺垂体嗜碱性细胞"", … ""细胞、催乳激素细胞和抗利尿激素细胞 C．促肾上腺皮质激素细胞、促甲状腺激素细胞和促性腺激素细胞 D．生长激素细胞、催乳激素细胞 E．催乳激素细胞、促甲状腺激素细胞和促性腺激素细胞 答案：D 14．腺垂体嗜碱性细胞""]"
"""K_褐铁矿_材料科学与工程""","""褐铁矿""","["" 褐铁矿（Limonite）是一种常见的铁矿，常形成于铁矿床的氧化带中，多以次生矿形态存在。也经常因沉积作用"", ""相当重要的铁矿资源之一。 中药“禹余粮”(别称：余粮石、白禹余、太一禹余粮、石脑）即此矿物。 沉积形褐铁矿。"", … ""，而另一种是镍纹石。[来源请求] 大部分采矿得来的镍都来自两种矿床。第一种是砖红壤，主要矿物为含镍的褐铁矿：(Fe, Ni)O(OH)与硅镁镍矿（一种含镍的硅酸盐）：(Ni, Mg)3Si2O5(OH)4。""]"


In [None]:
# Row / column count; this determines how many translation batches are needed.
concept_df.shape

(637572, 3)

In [None]:
# Translate concept names in batches of 1000 and save the result.
# NOTE(review): the recorded run of this cell was interrupted (see the
# KeyboardInterrupt in the output, with tqdm estimating ~28 h), so the write
# below did not happen in that run.
concept_df = translate_str_column(concept_df, 'name', 1000)
concept_df.write_ndjson(os.path.join(TRANS_DIR, 'entities/concept.json'))

  0%|          | 1/638 [02:41<28:34:42, 161.51s/it]


KeyboardInterrupt: 

## 4. School

In [None]:
# Load the raw school entities and preview the first rows.
school_df = pl.read_ndjson(os.path.join(RAW_DIR, 'entities/school.json'))
school_df.head(5)

id,name,name_en,sign,about,motto
str,str,str,str,str,str
"""S_1""","""清华大学""","""Tsinghua Unive…","""thu""","""简称“清华”，由中华人民共和…","""自强不息,厚德载物"""
"""S_2""","""北京大学""","""Peking Univers…","""PKU""","""北京大学（Peking Un…","""博学、审问、慎思、明辨"""
"""S_3""","""武汉大学""","""Wuhan Universi…","""whu""","""武汉大学（Wuhan Uni…","""自强 弘毅 求是 拓新"""
"""S_4""","""苏州大学""","""Soochow Univer…","""suda""","""苏州大学（Soochow U…","""养天地正气，法古今完人"""
"""S_5""","""四川大学""","""Sichuan Univer…","""scu""","""四川大学（Sichuan U…",""""""


In [None]:
# Row / column count of the school table.
school_df.shape

(429, 6)

In [None]:
# Add about_trans / motto_trans columns and save the result under TRANS_DIR.
school_df = translate_str_column(school_df, 'about')
school_df = translate_str_column(school_df, 'motto')
school_df.write_ndjson(os.path.join(TRANS_DIR, 'entities/school.json'))

100%|██████████| 3/3 [01:43<00:00, 34.64s/it]
100%|██████████| 3/3 [00:32<00:00, 10.74s/it]


## 5. Teacher

In [None]:
# Load the raw teacher entities and preview the first rows.
teacher_df = pl.read_ndjson(os.path.join(RAW_DIR, 'entities/teacher.json'))
teacher_df.head(5)

id,name,name_en,about,job_title,org_name
str,str,str,str,str,str
"""T_1""","""刘燕妮""","""Yanni Liu""","""北大哲学系毕业，清华大学马克…","""讲师""","""清华大学"""
"""T_2""","""陈怡""","""Yi Chen""","""1945年生于重庆，1967…","""教授""","""清华大学"""
"""T_3""","""程钢""","""Gang Cheng""","""程钢，《庄子哲学导读》课程负…","""副教授""","""清华大学"""
"""T_4""","""谢维和""",,"""谢维和，博士、教授、博士生导…","""教授""","""清华大学"""
"""T_5""","""史静寰""","""Jing-huan Shi""","""史静寰，女，清华大学教育研究…","""教授""","""清华大学"""


In [None]:
# Row / column count of the teacher table.
teacher_df.shape

(17018, 6)

In [None]:
# Translate the categorical columns as well as the free-text `about` column.
# The frame displayed after this cell holds Vietnamese job_title / org_name
# values, so these calls must run for a fresh kernel to reproduce the saved file.
teacher_df = translate_cate_column(teacher_df, 'job_title')
teacher_df = translate_cate_column(teacher_df, 'org_name')
teacher_df = translate_str_column(teacher_df, 'about')
teacher_df.write_ndjson(os.path.join(TRANS_DIR, 'entities/teacher.json'))

100%|██████████| 86/86 [41:37<00:00, 29.04s/it]


In [None]:
# Inspect the translated teacher frame.
teacher_df

id,name,name_en,about,job_title,org_name,about_trans
str,str,str,str,str,str,str
"""T_1""","""刘燕妮""","""Yanni Liu""","""北大哲学系毕业，清华大学马克…","""giảng viên""","""đại học thanh …","""tốt nghiệp kho…"
"""T_2""","""陈怡""","""Yi Chen""","""1945年生于重庆，1967…","""giáo sư""","""đại học thanh …","""sinh năm 1945 …"
"""T_3""","""程钢""","""Gang Cheng""","""程钢，《庄子哲学导读》课程负…","""phó giáo sư""","""đại học thanh …","""cheng gang là …"
"""T_4""","""谢维和""",,"""谢维和，博士、教授、博士生导…","""giáo sư""","""đại học thanh …","""xie weihe, tiế…"
"""T_5""","""史静寰""","""Jing-huan Shi""","""史静寰，女，清华大学教育研究…","""giáo sư""","""đại học thanh …","""shi jinghuan, …"
"""T_6""","""王孙禺""",,"""王孙禺，汉族，教授、博士生导…","""giáo sư""","""đại học thanh …","""wang sunyu, qu…"
"""T_7""","""袁本涛""",,"""袁本涛，博士、教授、博士生导…","""giáo sư""","""đại học thanh …","""yuan bentao, t…"
"""T_8""","""林健""",,"""林健，福建福州人，英国Lan…","""giáo sư""","""đại học thanh …","""lin jian, ngườ…"
"""T_9""","""程建钢""",,"""程建钢，博士、教授、博士生导…","""giáo sư""","""đại học thanh …","""cheng jiangang…"
"""T_10""","""李越""",,"""李越，1962 年8月生人，…","""nhà nghiên cứu…","""đại học thanh …","""li yue, sinh t…"


## 6. User

In [None]:
# Load the raw user entities and preview the first rows.
user_df = pl.read_ndjson(os.path.join(RAW_DIR, 'entities/user.json'))
user_df.head(5)

id,name,gender,school,year_of_birth,course_order,enroll_time
str,str,i64,str,str,list[i64],list[str]
"""U_22""","""我""",0,"""""",,"[682129, 2294668]","[""2019-10-12 10:28:02"", ""2020-11-21 14:03:28""]"
"""U_24""","""王帅国""",1,"""清华大学""",,"[597214, 605512, … 2229905]","[""2019-05-20 16:06:48"", ""2019-05-24 19:34:43"", … ""2020-11-21 11:38:57""]"
"""U_25""","""王帅国""",0,"""清华大学""",,[1903985],"[""2020-08-07 18:59:13""]"
"""U_53""","""于歆杰""",1,"""清华大学""",,"[696679, 1704639, … 1794464]","[""2020-03-01 21:24:30"", ""2020-03-12 16:17:02"", … ""2020-06-18 18:47:15""]"
"""U_54""","""马昱春""",2,"""清华大学""",,"[682442, 682164, … 1906706]","[""2019-10-09 02:17:49"", ""2019-11-08 00:49:03"", … ""2020-07-20 11:27:28""]"


In [None]:
# 'Model Data' sits next to the raw data folder, so derive its location from
# RAW_DIR instead of repeating the absolute Drive path.
user_list_path = os.path.join(os.path.dirname(RAW_DIR), 'Model Data', 'user_list.txt')
# Space-separated file with the columns org_id and remap_id.
user_list_df = pl.read_csv(user_list_path, separator=' ')
user_list_df.head()

org_id,remap_id
str,i64
"""U_146""",0
"""U_185""",1
"""U_205""",2
"""U_217""",3
"""U_464""",4


In [None]:
# Original user ids listed in user_list.txt, as a plain Python list.
org_ids = user_list_df['org_id'].to_list()
len(org_ids)

136860

In [None]:
# Keep only the users whose id appears in user_list.txt.
filtered_user_df = user_df.filter(pl.col('id').is_in(org_ids))
filtered_user_df.head()

id,name,gender,school,year_of_birth,course_order,enroll_time
str,str,i64,str,str,list[i64],list[str]
"""U_146""","""张幸福""",1,"""""",,"[680824, 766203, … 680845]","[""2020-02-03 22:03:21"", ""2020-02-03 22:04:03"", … ""2020-04-14 20:31:43""]"
"""U_185""","""教师-陈燕秀""",2,"""贵州理工学院""",,"[696729, 707096, … 734053]","[""2019-09-18 10:13:50"", ""2019-11-27 08:06:22"", … ""2020-09-14 22:31:22""]"
"""U_205""","""尹亮""",1,"""青海大学""",,"[948435, 735362, … 948296]","[""2020-01-02 08:48:14"", ""2020-01-07 11:16:18"", … ""2020-06-17 14:06:34""]"
"""U_217""","""饲猫少年""",0,"""加利盾分校""",,"[697748, 883345, … 696685]","[""2020-02-07 11:50:27"", ""2020-02-07 11:51:18"", … ""2020-02-18 15:11:08""]"
"""U_464""","""教师-安宇""",1,"""清华大学""",,"[697069, 696700, … 1795810]","[""2019-09-19 10:05:10"", ""2019-09-19 12:21:11"", … ""2020-05-29 19:50:36""]"


In [None]:
# Add name_trans / school_trans columns for the filtered users and save them
# under TRANS_DIR as entities/khai_user.json.
filtered_user_df = translate_str_column(filtered_user_df, 'name')
filtered_user_df = translate_str_column(filtered_user_df, 'school')
filtered_user_df.write_ndjson(os.path.join(TRANS_DIR, 'entities/khai_user.json'))

 43%|████▎     | 214/500 [1:49:51<1:41:34, 21.31s/it]