## 1. Translated data

In [None]:
import requests
import polars as pl
from tqdm import tqdm
from googletrans import Translator

ModuleNotFoundError: No module named 'googletrans'

In [None]:
def _normalize_text(text):
    """Returns ``text`` as a string, mapping missing values to ``""``."""
    # The missing-value checks must run BEFORE the str() conversion;
    # otherwise None has already become the literal string "None" and the
    # checks can never match.
    if text is None:
        return ""
    # all() is also True for an empty list, so [] is blanked as well.
    if isinstance(text, list) and all(x is None for x in text):
        return ""
    return text if isinstance(text, str) else str(text)


def translate_str_column(df, col, batch_size=32):
    """
    Translates a column of a Polars DataFrame to English.

    Every value is normalised to a string first: ``None`` and lists that
    contain only ``None`` (including empty lists) become ``""``; any other
    non-string value is converted with ``str()``. Blank strings are not sent
    to the translator and stay blank. Translations are lower-cased and
    stored in a new column named ``<col>_en``.

    Args:
        df: The Polars DataFrame.
        col: The name of the column to translate.
        batch_size: The number of strings to translate in each batch.

    Returns:
        The DataFrame with the additional ``<col>_en`` column.

    Raises:
        ValueError: If the translator returns a different number of results
            than the number of strings it was given.
    """
    translator = Translator()
    texts = [_normalize_text(text) for text in df[col].to_list()]
    trans_texts = []

    for start in tqdm(range(0, len(texts), batch_size)):
        texts_batch = texts[start : start + batch_size]

        # Blank entries have nothing to translate, so only the non-empty
        # strings are sent to the translator.
        pending = [text for text in texts_batch if text]
        results = translator.translate(pending, dest="en") if pending else []
        if len(results) != len(pending):
            raise ValueError(
                f"expected {len(pending)} translations, got {len(results)}"
            )

        # Re-insert the blanks at their original positions.
        translated = iter(results)
        for text in texts_batch:
            trans_texts.append(next(translated).text.lower() if text else "")

    df = df.with_columns(pl.Series(name=col + "_en", values=trans_texts))
    return df

def translate_cate_column(df, col, batch_size=200):
    """
    Translates a categorical string column of a Polars DataFrame to English.

    Only the distinct, non-empty values are sent to the translator; the
    lower-cased translations are then mapped back onto the column, which is
    replaced in place. ``None`` and ``''`` map to themselves.

    Args:
        df: The Polars DataFrame.
        col: The name of the string column to translate.
        batch_size: The number of distinct categories translated per batch.

    Returns:
        The DataFrame with ``col`` replaced by its translation.

    Raises:
        ValueError: If the translator returns a different number of results
            than the number of categories it was given.
    """
    # The translator is created locally, as in translate_str_column; the
    # function previously relied on a name that is not defined in its scope.
    translator = Translator()

    # Strip whitespaces from texts
    # NOTE(review): newer Polars releases name this method `str.strip_chars`
    # -- confirm against the installed version.
    df = df.with_columns(pl.col(col).str.strip())

    # Distinct categories, without None and empty strings
    categories = set(df[col])
    categories.discard(None)
    categories.discard('')
    categories = list(categories)

    # Translate categories and create a mapping
    mapping = {None: None, '': ''}

    for start in tqdm(range(0, len(categories), batch_size)):
        batch_cates = categories[start : start + batch_size]
        trans_cates = translator.translate(batch_cates, dest='en')  # Translate to English
        trans_cates = [trans.text.lower() for trans in trans_cates]
        if len(batch_cates) != len(trans_cates):
            raise ValueError(
                f"expected {len(batch_cates)} translations, got {len(trans_cates)}"
            )

        mapping.update(zip(batch_cates, trans_cates))

    # Map the original categories to their English translations
    df = df.with_columns(pl.col(col).map_elements(lambda x: mapping[x]))

    return df

def translate_cate_list_column(df, col, batch_size=500):
    """
    Translates a list-of-strings column of a Polars DataFrame to English.

    The distinct, non-empty list elements are translated once; every list
    in the column is then rebuilt from the lower-cased translations and the
    column is replaced in place. ``None`` and ``''`` elements map to
    themselves.

    Args:
        df: The Polars DataFrame.
        col: The name of the list column to translate.
        batch_size: The number of distinct categories translated per batch.

    Returns:
        The DataFrame with ``col`` replaced by its translation.

    Raises:
        ValueError: If the translator returns a different number of results
            than the number of categories it was given.
    """
    # The translator is created locally, as in translate_str_column; the
    # function previously relied on a name that is not defined in its scope.
    translator = Translator()

    # Strip whitespaces from list elements
    # NOTE(review): newer Polars releases name this method `str.strip_chars`
    # -- confirm against the installed version.
    df = df.with_columns(pl.col(col).list.eval(pl.element().str.strip()))

    # Collect all distinct categories; rows that are null as a whole are
    # skipped because they cannot be iterated.
    categories = {
        cate
        for row_cates in df[col]
        if row_cates is not None
        for cate in row_cates
    }

    # Remove None and empty strings from categories
    categories.discard(None)
    categories.discard('')
    categories = list(categories)

    # Translate categories and create a mapping
    mapping = {None: None, '': ''}

    for start in tqdm(range(0, len(categories), batch_size)):
        batch_cates = categories[start : start + batch_size]
        trans_cates = translator.translate(batch_cates, dest='en')  # Translate to English
        trans_cates = [trans.text.lower() for trans in trans_cates]
        if len(batch_cates) != len(trans_cates):
            raise ValueError(
                f"expected {len(batch_cates)} translations, got {len(trans_cates)}"
            )

        mapping.update(zip(batch_cates, trans_cates))

    # Map the original category lists to English category lists
    df = df.with_columns(pl.col(col).map_elements(lambda row_cates: [mapping[cate] for cate in row_cates]))

    return df

In [None]:
# Load the raw course entities (newline-delimited JSON) from the remote URL
# into a Polars DataFrame and display it.
# `requests` is imported here but not used in this cell.
import requests

url = "https://lfs.aminer.cn/misc/moocdata/data/mooccube2/entities/course.json"
course_df = pl.read_ndjson(url)
course_df

id,name,field,prerequisites,about,resource
str,str,list[str],str,str,list[struct[3]]
"""C_584313""","""《资治通鉴》导读""","[""历史学"", ""中国语言文学""]","""""","""通过老师导读，同学们可深入这一经典文本内部，得以纵览千年历史…","[{[""第一课 导论与三家分晋"", ""导论"", ""导论""],""V_849"",""1.1.1""}, {[""第一课 导论与三家分晋"", ""智伯的覆亡"", ""智伯的覆亡""],""V_850"",""1.2.1""}, … {[""第十五课 隋唐霸业"", null, ""第十五课 隋唐霸业--习题""],""Ex_957"",""15.8""}]"
"""C_584329""","""微积分——极限理论与一元函数""","[""应用经济学"", ""数学"", … ""理论经济学""]","""""","""本课程是理工科的一门数学基础课，系统、全面地介绍了一元函数微…","[{[""序言"", ""序言"", ""序言""],""V_1350"",""1.1.1""}, {[""第一章 实数与函数"", ""第一节 实数集的界与确界"", ""实数集的界""],""V_1351"",""2.1.1""}, … {[""第八章 级数"", null, ""第八章 级数--第六节思考与练习""],""Ex_1545"",""9.9""}]"
"""C_584381""","""新闻摄影""","[""艺术学"", ""新闻传播学""]","""""","""掌握基本的摄影技能，了解图片新闻的工作方式，训练对生活的观察…","[{[""第一章 绪论"", ""第一讲 引言1"", ""引言1""],""V_1800"",""1.1.1""}, {[""第一章 绪论"", ""第二讲 引言2"", ""引言2""],""V_1801"",""1.2.1""}, … {[""大作业提交"", null, ""《大作业》提交--小节""],""Ex_1926"",""20.4""}]"
"""C_597208""","""数据挖掘：理论与算法""","[""计算机科学与技术""]","""""","""最有趣的理论+最有用的算法=不得不学的数据科学。""","[{[""走进数据科学：博大精深，美不胜收"", ""整装待发"", ""Video""],""V_2961"",""1.1.1""}, {[""走进数据科学：博大精深，美不胜收"", ""学而不思则罔"", ""Video""],""V_2962"",""1.3.1""}, … {[""美丽数据说：阆苑仙葩，美玉无瑕"", null, ""第十一章第一节测试题""],""Ex_3104"",""11.1""}]"
"""C_597225""","""大学计算机""",[],"""""","""大学计算机课程将以计算思维为导向，以计算机原理、概念为基础，…","[{[""第1周： 基于计算机的问题求解"", ""课程介绍"", ""开篇""],""V_4596"",""1.1.1""}, {[""第1周： 基于计算机的问题求解"", ""1.0 本章导学"", ""1.0 本章导学""],""V_4597"",""1.2.1""}, … {[""第9周：算法与程序设计"", null, ""第九周测验""],""Ex_4827"",""10.12""}]"
…,…,…,…,…,…
"""C_2338076""","""（疾风计划）软件工程""",[],"""""","""疾风计划2021【首期限额招募】进行中 名校名师丨专属助教丨…","[{[""第1章 初识软件工程 "", ""1.1 软件无处不在 "", ""讲课视频""],""V_8729252"",""1.1.1""}, {[""第1章 初识软件工程 "", ""1.2 软件的本质特性"", ""讲授视频""],""V_8729253"",""1.2.1""}, … {[""第15章 期末考试与总结"", null, ""第一部分：基础知识""],""Ex_8729369"",""15.2""}]"
"""C_2341259""","""（疾风计划）面向对象程序设计（C++）""",[],"""""","""疾风计划2021【首期限额招募】进行中 名校名师丨专属助教丨…","[{[""第一讲 课程简介与编程环境"", ""1.0 课程定位、教学内容"", ""课程定位与教学内容""],""V_8782384"",""1.1.1""}, {[""第一讲 课程简介与编程环境"", ""1.1 编程环境与工具"", ""程序结构与编译链接""],""V_8782385"",""1.3.1""}, … {[""期末考试"", null, ""期末考试--作业""],""Ex_8782460"",""10.2""}]"
"""C_2337996""","""（疾风计划）数据结构(下)""",[],"""""","""疾风计划2021【首期限额招募】进行中 名校名师丨专属助教丨…","[{[""第零章 "", ""选课之前"", ""宣传片""],""V_8727594"",""0.1""}, {[""第零章 "", ""OJ系统说明"", ""1-注册与登录""],""V_8727597"",""0.2.1""}, … {[""第十二章 排序"", ""本章测验"", ""本章测试""],""Ex_8727931"",""14.7.1""}]"
"""C_1945689""","""机器学习训练营""",[],"""""","""清华张敏老师带你12周掌握机器学习！8大经典算法 +7大实训…","[{[""序-开营仪式及学习课件"", null, ""1.1_初识机器学习""],""V_7211133"",""0.1""}, {[""序-开营仪式及学习课件"", null, ""【开营仪式回放】！！10.21""],""V_8124356"",""0.4""}, … {[""机器学习算法总结"", null, ""12.1_机器学习算法总结""],""V_7769687"",""12""}]"


In [None]:
# Translate the course names; the result is added as a new 'name_en' column.
course_df = translate_str_column(course_df, 'name')

In [None]:
# 'field' is a list[str] column, so translate_str_column translates the
# str() form of each list and stores it in 'field_en'.
# NOTE(review): translate_cate_list_column looks like the helper meant for
# list columns -- confirm which output format is wanted.
course_df = translate_str_column(course_df, 'field')

In [None]:
# Translate the course descriptions; the result is added as 'about_en'.
course_df = translate_str_column(course_df, 'about')

## 2. Load data

In [None]:
# Mount Google Drive at /gdrive (Google Colab only) so the files under
# /gdrive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import polars as pl
from tqdm import tqdm
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy

In [None]:
# Read the already-translated course data from Drive with Polars, then
# convert it to a pandas DataFrame, which the analysis below works on.
path = '/gdrive/MyDrive/Nhóm 1/2 Bài tập trên lớp/Bài tập thực hành 1/Translated_Data/course_translated.json'

course_df = pl.read_json(path)
course_df = course_df.to_pandas()
course_df.head()

Unnamed: 0,id,name,field,prerequisites,about,resource
0,C_584313,"introduction to ""zi zhi tong jian""","['history', 'chinese language and literature']",,"through the teacher's guidance, students can g...","[{'titles': ['第一课 导论与三家分晋', '导论', '导论'], 'reso..."
1,C_584329,calculus - limit theory and functions of one v...,"['applied economics', 'mathematics', 'physics'...",,this course is a basic mathematics course in s...,"[{'titles': ['序言', '序言', '序言'], 'resource_id':..."
2,C_584381,photojournalism,"['art', 'journalism']",,"master basic photography skills, understand ho...","[{'titles': ['第一章 绪论', '第一讲 引言1', '引言1'], 'res..."
3,C_597208,data mining: theory and algorithms,['computer science and technology'],,the most interesting theory + the most useful ...,"[{'titles': ['走进数据科学：博大精深，美不胜收', '整装待发', 'Vide..."
4,C_597225,university computer,[],,university computer courses will be guided by ...,"[{'titles': ['第1周： 基于计算机的问题求解', '课程介绍', '开篇'],..."


## 3. Khám phá dữ liệu

In [None]:
# (number of rows, number of columns) of the course table
course_df.shape

(3781, 6)

Sử dụng hàm `info()` để cung cấp một bản tóm tắt ngắn gọn của DataFrame (tên cột, kiểu dữ liệu, số lượng giá trị không bị thiếu)

In [None]:
# Summary of column names, dtypes and non-null counts.
# NOTE(review): info() is a pandas method, so this cell has to run after the
# to_pandas() conversion; the recorded AttributeError presumably comes from
# running it while course_df was still a Polars DataFrame -- confirm.
course_df.info()

AttributeError: 'DataFrame' object has no attribute 'info'

- File course.json có 3781 hàng. Trong đó chỉ có cột prerequisites chứa 2 giá trị NULL; các cột còn lại đều không chứa giá trị NULL.


### Xử lý dữ liệu Null

In [None]:
# Number of missing values in each column
course_df.isnull().sum()

Unnamed: 0,0
id,0
name,0
field,0
prerequisites,2
about,0
resource,0


- Thực hành: điền các giá trị NULL bằng chuỗi rỗng `''`.


In [None]:
# Replace every missing value with an empty string, modifying course_df
# itself (inplace=True) rather than producing a copy.
course_df.fillna('', inplace=True)

### Khảo sát cột field

In [None]:
# Display the 'field' column
course_df['field']

Unnamed: 0,field
0,"['history', 'chinese language and literature']"
1,"['applied economics', 'mathematics', 'physics'..."
2,"['art', 'journalism']"
3,['computer science and technology']
4,[]
...,...
3776,[]
3777,[]
3778,[]
3779,[]


In [None]:
# dtype of the 'field' column
course_df['field'].dtype

dtype('O')

In [None]:
# Count the rows whose 'field' value renders as an empty list.
course_df['field'].astype(str).eq('[]').sum()

3234

- Nhận xét: Có vẻ như đây là một trường không bắt buộc nên dữ liệu bị thiếu rất nhiều.


### Cột prerequisites

In [None]:
# Count the rows whose 'prerequisites' value is an empty string.
course_df['prerequisites'].astype(str).eq('').sum()

2580

Cột prerequisites cũng có kết quả tương tự; tuy nhiên, xét về mặt ý nghĩa thì đây là điều bình thường.

### Thống kê số lượng tài liệu

In [None]:
def count_element(x):
    """Return the number of items contained in ``x``."""
    n_items = len(x)
    return n_items

# Store the length of each course's 'resource' value in a new column and
# display it next to the course id and name.
course_df['number of resources'] = course_df['resource'].apply(count_element)
course_df[['id', 'name', 'number of resources']]


Unnamed: 0,id,name,number of resources
0,C_584313,"introduction to ""zi zhi tong jian""",91
1,C_584329,calculus - limit theory and functions of one v...,170
2,C_584381,photojournalism,127
3,C_597208,data mining: theory and algorithms,125
4,C_597225,university computer,165
...,...,...,...
3776,C_2338076,(wind project) software engineering,103
3777,C_2341259,(wind project) object-oriented programming (c++),76
3778,C_2337996,(wind project) data structure (part 2),324
3779,C_1945689,machine learning bootcamp,76


In [None]:
# Resource counts sorted in ascending order (original row index is kept)
count_resources = course_df['number of resources'].sort_values()
count_resources

Unnamed: 0,number of resources
3115,1
3163,1
3162,1
3161,1
3160,1
...,...
827,600
1498,752
1676,1104
3558,1200


In [None]:
# Total number of resources across all courses
count_resources.sum()

271043

In [None]:
# Write the per-course resource counts (id + count, no index column) to a
# CSV file inside the translated-data folder.
path = '/gdrive/MyDrive/Nhóm 1/2 Bài tập trên lớp/Bài tập thực hành 1/Translated_Data'
resource_counts_csv = os.path.join(path, 'course_num_resources.csv')
course_df[['id', 'number of resources']].to_csv(resource_counts_csv, index=False)

## Làm sạch dữ liệu

Xóa cột resource (số lượng tài liệu đã được lưu trong cột number of resources)

In [None]:
# Drop the 'resource' column from the table
course_df = course_df.drop(columns=['resource'])

In [None]:
# Preview the first rows after dropping the column
course_df.head()

Unnamed: 0,id,name,field,prerequisites,about,number of resources
0,C_584313,"introduction to ""zi zhi tong jian""","['history', 'chinese language and literature']",,"through the teacher's guidance, students can g...",91
1,C_584329,calculus - limit theory and functions of one v...,"['applied economics', 'mathematics', 'physics'...",,this course is a basic mathematics course in s...,170
2,C_584381,photojournalism,"['art', 'journalism']",,"master basic photography skills, understand ho...",127
3,C_597208,data mining: theory and algorithms,['computer science and technology'],,the most interesting theory + the most useful ...,125
4,C_597225,university computer,[],,university computer courses will be guided by ...,165


Loại bỏ dữ liệu nhiễu

In [None]:
import re
import pandas as pd
# Helper that removes unwanted characters from a text value
def clean_text(text):
    """
    Normalises a text value for analysis.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, runs of whitespace are collapsed to a single space, leading and
    trailing whitespace is dropped, and the result is lower-cased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Keep only letters, digits and whitespace
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Removing a symbol that sat between two spaces ("a - b") leaves a
    # doubled space; split/join collapses such runs and trims both ends.
    text = ' '.join(text.split())
    return text.lower()

# Apply the cleaning to the 'name' column (the course names)
course_df['name'] = course_df['name'].apply(clean_text)

# Inspect the cleaned data
course_df.head()

Unnamed: 0,id,name,field,prerequisites,about,number of resources
0,C_584313,introduction to zi zhi tong jian,"['history', 'chinese language and literature']",,"through the teacher's guidance, students can g...",91
1,C_584329,calculus limit theory and functions of one va...,"['applied economics', 'mathematics', 'physics'...",,this course is a basic mathematics course in s...,170
2,C_584381,photojournalism,"['art', 'journalism']",,"master basic photography skills, understand ho...",127
3,C_597208,data mining theory and algorithms,['computer science and technology'],,the most interesting theory + the most useful ...,125
4,C_597225,university computer,[],,university computer courses will be guided by ...,165
