# Drive and Frameworks Setup

In [1]:
# connect with gdrive
from google.colab import drive
drive.mount('/content/drive')
# %cd /content/drive/MyDrive

# connect with "DPI Summer Interns 2024" folder
# !ls "/content/drive/MyDrive/DPI Summer Interns 2024/Hassan/MultiLangPolicyScanner/policy_corpus"

# connect with "DPI Summer Interns 2024" folder
%cd /content/drive/MyDrive/DPI Summer Interns 2024/Hassan/MultiLangPolicyScanner/policy_corpus

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/1O7dDY06zyuGFOPs6tPr05M2GKlVoONFs/DPI Summer Interns 2024/Hassan/MultiLangPolicyScanner/policy_corpus


In [2]:
# Install necessary libraries
!pip3 install konlpy
!pip3 install hanzidentifier
!pip3 install sudachipy
!pip3 install sudachidict_full #this one takes a sec
!pip3 install preprocess



In [3]:
# Import necessary libraries

#general
import pandas as pd
import os
import re
import glob
import regex
import difflib
import nltk
import sys
from tqdm.notebook import tqdm
import preprocess

#cprofile
import cProfile
import pstats
from pstats import SortKey

#japanese
import sudachipy

#korean
from konlpy.tag import Okt

#chinese
import jieba
import hanzidentifier

# Lists of Language Files

In [4]:
# read privacy_links_df.csv into a dataframe
privacy_links_df = pd.read_csv('privacy_links_df_updated.csv')
japanese_privacy_links_df = pd.read_csv('language_policy_links/JP_privacy_links_top_1000(Corrected).csv')
# privacy_links_df['valid_language']
# print(privacy_links_df[privacy_links_df['policy_downloaded'] == True]['init_language'].value_counts())
# print(privacy_links_df[privacy_links_df['policy_downloaded'] == True]['valid_language'].value_counts())

ja_policy_list = japanese_privacy_links_df['hash'].tolist() # modify to read the ~1200 or so links
ko_policy_list = privacy_links_df[(privacy_links_df['policy_downloaded'] == True) & (privacy_links_df['valid_language'] == 'ko')]['hash'].tolist()
chinesese_policy_list = privacy_links_df[(privacy_links_df['policy_downloaded'] == True) & (privacy_links_df['valid_language'] == 'zh-cn')]['hash'].tolist()
chinesese_policy_list.extend(privacy_links_df[(privacy_links_df['policy_downloaded'] == True) & (privacy_links_df['valid_language'] == 'zh-tw')]['hash'].tolist())

print(f"Total japanese files {len(ja_policy_list)}")
print(f"Total korean files {len(ko_policy_list)}")
print(f"Total chinese files {len(chinesese_policy_list)}")

Total japanese files 1297
Total korean files 916
Total chinese files 428


# Preprocessing

## Japanese Helper Function

In [11]:
def is_kana(text : str):
    return bool(regex.search(r'\P{Script=Katakana}|\P{Script=Hiragana}', text))

def clean_japanese_text(text : str):
    text = text.strip()
    text = re.sub(r'\s+',' ',text) #collapse white space
    text = re.sub(r'[\n\r]+','\n',text) #collapse new lines
    text = re.sub(r'，|,', '、', text) #normalize commas
    text = re.sub(r'\.', '。', text) #normalize periods

    return text

In [12]:
print(is_kana("あa"))

True


In [13]:
#readablity
def japanese_readability_index(tokens : pd.DataFrame, scale : int = 1): #modified to take already tokenized text
  """Overview
  --
  Calculates readabilty for korean texts with formula:
    (total_words + total_syllables) / sentences

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  scale : scale factor to multipy readability score, used for standardizing score between langagues

  Returns
  --
  int
      kri score, if failed returns as -1
  """

  # counts number of periods to get # of sentences (rough approximation)
  try: #wrapped in try in case of no periods
    sentences = tokens["POS2"].value_counts()["句点"]
  except:
    return(-1) # if contains no periods, readability score is junk, return -1

  #remove periods and commas from word count (should be only remaining punctuation)
  mask = (tokens["POS2"] == '句点') | (tokens["POS2"] == '読点')
  words = tokens.drop(tokens[mask].index, inplace = False)['Reading'].to_list()

  total_words = len(words) #not perfect but morphs ~approx words
  total_syllables = sum(len(word) for word in words)

  # Calculate the KRI score
  jri_score = (total_words + total_syllables) / sentences

  return jri_score

In [14]:
def remove_stop_word_japanese(tokens : pd.DataFrame, stopwords : set()): #takes already tokenized text
  """Overview
  --
  Removes stopwords passed to function from tokenized text

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  stopwords : set of stopwords as string

  Returns
  --
  void
      deletes stopwords inplace on tokens
  """

  mask = tokens["Normalized"].isin(stopwords)
  tokens.drop(tokens[mask].index, inplace = True)#drop is performed in place, not returned

def remove_white_space_japanese(tokens : pd.DataFrame): #takes already tokenized text
  """Overview
  --
  Removes stopwords passed to function from tokenized text

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"

  Returns
  --
  void
      deletes white-space inplace on tokens
  """
  mask = (tokens["POS1"] == "空白")
  tokens.drop(tokens[mask].index, inplace = True)#drop is performed in place, not returned

def remove_punc_japanese(tokens : pd.DataFrame, keep_periods = True, keep_commas = True): #takes already tokenized text
  """Overview
  --
  Removes punctuation from tokenized text

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  keep_periods : if true, periods aren't removed
  keep_commas : if true, commas aren't removed

  Returns
  --
  void
      deletes punctuation inplace on tokens
  """

  #unrecognized_punc = {} #in case some puncutuation is not marked as punctuation (unused)
  mask = (tokens["POS1"] == '"補助記号"')

  #used to keep periods and commas
  if keep_periods:
    mask = mask & (tokens["POS2"] != '句点')
  if keep_commas:
    mask = mask & (tokens["POS2"] != '読点')

  #removes unrecognized punc (unused)
  #mask = mask | (tokens["normalized"].isin(unrecognized_punc))

  tokens.drop(tokens[mask].index, inplace = True)#drop is performed in place, not returned

In [15]:
def formatter_japanese(tokens : list[sudachipy.MorphemeList]):
  formatted = []
  for morph_list in tokens: #due to quirk of morpheme_list - loop over each morpheme list
    for token in morph_list: #loops over tokens - each token is a new row

      #spliting on splitmode A
      a_split = [sub_token.surface() for sub_token in token.split(sudachipy.SplitMode.A, add_single = False)] #split tuple of morphemes into list of strings
      if len(a_split) == 0:
          a_split = "*" #matches SudachiPy na value
      else:
          a_split = "-".join(a_split)

      #spliting on splitmode B
      b_split = [sub_token.surface() for sub_token in token.split(sudachipy.SplitMode.B, add_single = False)] #split tuple of morphemes into list of strings
      if len(b_split) == 0:
          b_split = "*" #matches SudachiPy na value
      else:
          b_split = "-".join(b_split)

      pos = token.part_of_speech()
      formatted.append((token.surface(),token.normalized_form(),token.reading_form(),token.dictionary_form(),pos[0],pos[1],pos[2],pos[3],pos[4],pos[5],token.is_oov(),b_split,a_split))

  return formatted


In [16]:
def safe_split(text : str, tokenizer : sudachipy.Tokenizer):
  try:
    #attemps to tokenize text
    return [tokenizer.tokenize(text)] #returns in a list since morpheme lists dont support extension/appending
  except:
    #checks if error is due to oversized input
    if(sys.getsizeof(text) > 49149):

        #try to split by paragraph
        split = re.split(r'(\n)', text)

        #if no obvious paragraphs are found, split along sentence boundaries
        if len(split) <= 1:
          split = re.split(r'(\.|。)', text)

          #if no obvious sentence boundaries are found, split between words
          if len(split) <= 1:
            split = re.split(r'(\w|,|、|，)', text)

            #if still no good split is found, throw an exception
            if len(split)  <= 1:
              raise("String is oversized and a valid split wasn't found.")

        #splits text at middle split point
        mid = len(split) // 2
        first_half = "".join(split[0:mid])
        back_half = "".join(split[mid:])

        #recursively tokenizes smaller halves
        first_half_split = safe_split(first_half, tokenizer)
        back_half_split = safe_split(back_half, tokenizer)

        #recombines and returns section as list of morpheme lists
        first_half_split.extend(back_half_split)
        return first_half_split

    else:
        raise("something is wrong, check SudachiPY exception.")

#tokenization helper function
def tokenize_japanese(text : str, tokenizer : sudachipy.Tokenizer):
    """Tokenization helper function

    Parameters
    ----------
    text : The text to tokenize
    tokenizer : The SudachiPy tokenizer to use

    Returns
    -------
    void
        Creates file at the file location specified
    """

    #use recursive safe split to split text greater than SudachiPy size limit
    tokens = safe_split(text, tokenizer)

    data = formatter_japanese(tokens)

    return pd.DataFrame(data, columns=["Surface","Normalized","Reading","Dictionary","POS1","POS2","POS3","POS4","Conj_Type","Conj","OOV","B_Split","A_Split"])

def mass_tokenize_japanese(hash_list : list, in_folder : str, out_folder : str, stopwords : set = {}, readability : bool = True):
  """Overview
  --
  Tokenizes all files listed in hash list and corresponding text file

  Parameters
  --
  hash_list : list of hashes to tokenize
  in_folder : folder to look for txt files using hash as name
  out_folder : folder to write tokenized text csv files into
  stopwords : set of stopword to remove from all texts
  readability : if true, also writes csv with each files readability score as calculated by "korean_readability_index"

  Returns
  --
  void
      writes CSV files to out_folder
  """

  #ensures out_folder exists
  if not os.path.exists(out_folder):
      os.makedirs(out_folder)

  #create dictionary - Using full for best performance
  full_dict = sudachipy.Dictionary(dict = "full")
  #create tokenizer - Spliting on highest level for NER
  tokenizer_C = full_dict.create(mode = sudachipy.SplitMode.C)

  if readability:
    r_scores = [] #list for storing readability scores

  #iterate over every file in has list
  for hash in tqdm(hash_list):
    #open and clean text
    with open(f"{in_folder}/{hash}.txt", "r") as f:
      text = f.read()
      text = clean_japanese_text(text)

      #kana check check
      if is_kana(text):
        tokens = tokenize_japanese(text, tokenizer_C) #tokenization

        #first rount of stopwording, remove superflous puncuation and white-space
        remove_punc_japanese(tokens)
        remove_white_space_japanese(tokens)

        #caluclate readabilty
        if readability:
          readability_score = japanese_readability_index(tokens)

          #if caluclating readabilty fails and return as -1, print hash
          if readability_score == -1:
            print(hash)

          r_scores.append(readability_score)

        #second rount of stopwording, remove actual stopwords
        remove_stop_word_japanese(tokens, stopwords = stopwords)

        # write to ./korean_preprocess
        tokens.to_csv(f"{out_folder}/{hash}.csv", index=False)

  #write readability with coresponding hash
  if readability:
    data = zip(hash_list,r_scores)
    r_scores_df = pd.DataFrame(data, columns = ["hash","readability"])
    r_scores_df.to_csv(f"{out_folder}/readability.csv", index=False)

## Korean Helper Functions

In [None]:
#simple language check if text contains hangul character
def is_hangul(text : str):
    return bool(regex.search(r'\p{IsHangul}', text))

#simple text cleaner to normalize text input
def clean_korean_text(text : str):
    text = text.strip()
    text = re.sub(r'[\n\r\s]+',' ',text) #new lines arent use for splitting so simply converted to whitespace
    text = re.sub(r'，|、', ',', text) #normalize commas
    text = re.sub(r'。', '.', text) #normalize periods

    return text

In [None]:
#readablity
def korean_readability_index(tokens : pd.DataFrame, scale : int = 1): #modified to take already tokenized text
  """Overview
  --
  Calculates readabilty for korean texts with formula:
    (total_words + total_syllables) / sentences

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  scale : scale factor to multipy readability score, used for standardizing score between langagues

  Returns
  --
  int
      kri score, if failed returns as -1
  """

  # counts number of periods to get # of sentences (rough approximation)
  try: #wrapped in try in case of no periods
    sentences = tokens['normalized'].value_counts()["."]
  except:
    return(-1) # if contains no periods, readability score is junk, return -1

  #remove periods and commas from word count (should be only remaining punctuation)
  mask = (tokens["normalized"] == '.') | (tokens["normalized"] == ',')
  words = tokens.drop(tokens[mask].index, inplace = False)['normalized'].to_list()

  total_words = len(words) #not perfect but morphs ~approx words
  total_syllables = sum(len(word) for word in words)

  # Calculate the KRI score
  kri_score = (total_words + total_syllables) / sentences

  return kri_score


In [None]:
def remove_stop_word_korean(tokens : pd.DataFrame, stopwords : set()): #takes already tokenized text
  """Overview
  --
  Removes stopwords passed to function from tokenized text

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  stopwords : set of stopwords as string

  Returns
  --
  void
      deletes stopwords inplace on tokens
  """

  mask = tokens["normalized"].isin(stopwords)
  tokens.drop(tokens[mask].index, inplace = True)#drop is performed in place, not returned

def remove_punc_korean(tokens : pd.DataFrame, keep_periods = True, keep_commas = True): #takes already tokenized text
  """Overview
  --
  Removes punctuation from tokenized text

  Parameters
  --
  tokens : pandas DataFrame containing text as tokenized by the function "tokenize_korean"
  keep_periods : if true, periods aren't removed
  keep_commas : if true, commas aren't removed

  Returns
  --
  void
      deletes punctuation inplace on tokens
  """

  unrecognized_punc = {"‘","・","“"} #some puncutuation is not marked as punctuation
  mask = (tokens["pos"] == 'Punctuation')

  #used to keep periods and commas
  if keep_periods:
    mask = mask & (tokens["normalized"] != '.')
  if keep_commas:
    mask = mask & (tokens["normalized"] != ',')

  #removes unrecognized punc
  mask = mask | (tokens["normalized"].isin(unrecognized_punc))

  tokens.drop(tokens[mask].index, inplace = True)#drop is performed in place, not returned

In [None]:
#preps okt output for csv
def formatter_korean(surf, norm, stem, pos):
  """Overview
  --
  function to solve problem of potentially missaligned normalized and non-normalized tokens due to minor differnece in how tokens are split

  Parameters
  --
  surf : list of surface tokens as returned by morphs, must NOT be normalized or stemmed
  norm : list of normalized tokens as returned by pos, MUST be normalized but NOT stemmed
  stem : list of surface tokens as returned by morphs, MUST be normalized and stemmed
  pos : list of surface tokens as returned by py, MUST be normalized but NOT stemmed

  Returns
  --
  list(list)
      input lists aligned and zipped in the format for creating a pandas DataFrame
  """

  #find sections of overlapping surface and notmalized tokens
  #assumes most text does match between surface and stem
  diffs = difflib.SequenceMatcher(isjunk=None, a=surf, b=norm, autojunk=False)
  matches = diffs.get_matching_blocks()
  matches.insert(0,difflib.Match(0,0,0)) #dummy match added to find differences at front of list

  #used for tracking position in lists
  a_inserts = 0
  b_inserts = 0

  #find gaps between matching blocks
  for i in range(len(matches) - 1):
    front = matches[i]
    back = matches[i+1]

    #position tracking
    front_a = front.a + front.size + a_inserts
    back_a = back.a + a_inserts
    front_b = front.b + front.size + b_inserts
    back_b = back.b + b_inserts

    a_size = back_a - front_a
    b_size = back_b - front_b

    #if surface with no norm, insert norm as na (*)
    if b_size == 0:
      for i in range(a_size):
        norm.insert(front_b, "*")
        stem.insert(front_b, "*")
        pos.insert(front_b, "*")
      b_inserts += a_size
      continue

    #if norm with no surface, insert surface as na (*)
    if a_size == 0:
      concat_surf = "*"

    #if norm and surface, copy whole surface for each noram (*)
    else:
      concat_surf = "".join(surf[front_a:back_a])

    #remove old surface tags
    del surf[front_a:back_a]

    #insert to align with norm
    for i in range(b_size):
      surf.insert(front_a, concat_surf)
    a_inserts += a_size - b_size

    #return in correct format for pandas dataframe
    return zip(surf, norm, stem, pos)

## Korean Tokenization Functions

In [None]:
def tokenize_korean(text, tokenizer): #tokenizer should be passed as argument so we only need to initiallize it once for all doucments
    """Overview
    --
    Tokenizes one text file

    Parameters
    --
    text : raw text, should be cleaned for best result
    tokenizer : tokenizer used for tokenization (in this case OKT)

    Returns
    --
    pd.Dataframe
        returns DataFrame object with tokenized text
    """

    norm,pos = map(list,zip(*(tokenizer.pos(text, norm=True))))
    surf = tokenizer.morphs(text)
    stem = tokenizer.morphs(text, norm=True, stem=True)

    #arranging data, not super elegant but fast enough
    data = formatter_korean(surf, norm, stem, pos)

    #put data into dataframe and return
    return pd.DataFrame(data, columns=['surface', 'normalized', 'dictionary' , 'pos'])

def mass_tokenize_korean(hash_list : list, in_folder : str, out_folder : str, stopwords : set = {}, readability : bool = True):
  """Overview
  --
  Tokenizes all files listed in hash list and corresponding text file

  Parameters
  --
  hash_list : list of hashes to tokenize
  in_folder : folder to look for txt files using hash as name
  out_folder : folder to write tokenized text csv files into
  stopwords : set of stopword to remove from all texts
  readability : if true, also writes csv with each files readability score as calculated by "korean_readability_index"

  Returns
  --
  void
      writes CSV files to out_folder
  """

  #ensures out_folder exists
  if not os.path.exists(out_folder):
      os.makedirs(out_folder)

  #initialize tokenizer only once
  okt = Okt()

  if readability:
    r_scores = [] #list for storing readability scores

  #iterate over every file in has list
  for hash in tqdm(hash_list):

    #open and clean text
    with open(f"{in_folder}/{hash}.txt", "r", encoding='utf-8') as f:
      text = f.read()
      text = clean_korean_text(text)

      #hangul check
      if is_hangul(text):
        tokens = tokenize_korean(text, okt) #tokenization
        remove_punc_korean(tokens) #first rount of stopwording, remove superflous puncuation

        #caluclate readabilty
        if readability:
          readability_score = korean_readability_index(tokens)

          #if caluclating readabilty fails and return as -1, print hash
          if readability_score == -1:
            print(hash)

          r_scores.append(readability_score)

        #first rount of stopwording, remove actual stopwords
        remove_stop_word_korean(tokens, stopwords = stopwords)

        # write to ./korean_preprocess
        tokens.to_csv(f"{out_folder}/{hash}.csv", index=False)

  #write readability with coresponding hash
  if readability:
    data = zip(hash_list,r_scores)
    r_scores_df = pd.DataFrame(data, columns = ["hash","readability"])
    r_scores_df.to_csv(f"{out_folder}/readability.csv", index=False)



## Chinese Helper Function

In [None]:
def is_chinese(value):
    return hanzidentifier.has_chinese(value)

def chinese_clean_text_func(text):
    # Normalize newline characters
    text = re.sub('[\n\r]+', '\n', text)

    # Replace Chinese and English commas with ideographic comma '、'
    text = re.sub('[，,]', '、', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s、]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run Functions

## Japanese

In [17]:
pr_j = cProfile.Profile()
pr_j.enable()

#get stopwords from stopwords-ko.txt
with open("stopwords/stopwords-ja.txt", "r") as f:
    stopwords = set(f.read().split("\n"))
print(len(stopwords))

mass_tokenize_japanese(ja_policy_list,"policies","corpus/japanese_preprocessed", stopwords = stopwords)

pr_j.disable()
pr_j.dump_stats('misc/stats_ja')

134


  0%|          | 0/1297 [00:00<?, ?it/s]

b5e896e584195d83b30bc22323eb6fd352241a04e2f823185c6fef885f5a81b1
d8cdc7cd6bceb9039f50d1d3a73f2926dbe2c52d22c45fe1b484a2197e3687c3
fed50609e7a07e0fa9b37cd20a4152a0f030004291ac19e652f043760ce31f0c
46547d47482af14ac299d4a690fc58fb3b79cf942d1468e060d0a0be091599ee
8e9984b6e5da21c0f6801d1ee659bd378a8710bad7d0e79e1f721b0c9d45a496
179c99a8d1102197437d5766ffd26acae311ff27f78fca46723717a983707f2b
f8b45b00b2437931453cbbae2c94bb20cb267336746035d301b4d8814151cb86
b7a7f33e28302393f1586d02f0921840be7f662a1d1f899979e15838a0cfd265
f27c7137d5041d704dc0c45c077a6f1adc4d13b1a48e07fac89686cd45bba84c
7798d2dd931b80ed5d4b661e5336e69e90f0d0b8f59bdf2b14dab26e0cfe7c53


Error: need to escape, but no escapechar set

## Korean

In [None]:
pr_k = cProfile.Profile()
pr_k.enable()

#get stopwords from stopwords-ko.txt
with open("stopwords/stopwords-ko.txt", "r") as f:
    stopwords = set(f.read().split("\n"))
print(len(stopwords))

mass_tokenize_korean(ko_policy_list,"policies","corpus/korean_preprocessed", stopwords = stopwords)

pr_k.disable()
pr_k.dump_stats('misc/stats_ko')

595


  0%|          | 0/916 [00:00<?, ?it/s]

b20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf
b20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf
4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4
4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4
5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198
5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198
44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4
44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4
015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea
015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea
8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9
8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9
b0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896
b0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896
cbbe7a32ef09a2723175d6c800bb8c54dce7b750a448ec2f401d5110cb0fa213
cbbe7a32ef09a2723175d6c80

<pstats.Stats at 0x7ac6201771c0>

In [None]:
#print stats

p = pstats.Stats('misc/stats_ja')
p.strip_dirs().sort_stats(SortKey.TIME).print_stats()

#files without periods/failed kri score:
"""
b20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf
b20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf
4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4
4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4
5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198
5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198
44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4
44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4
015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea
015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea
8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9
8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9
b0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896
b0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896
cbbe7a32ef09a2723175d6c800bb8c54dce7b750a448ec2f401d5110cb0fa213
cbbe7a32ef09a2723175d6c800bb8c54dce7b750a448ec2f401d5110cb0fa213
f8f723e8a4bf11b768b39745d74ec9c09e6dfdd644d7a93fc3221a4d82f6f185
f8f723e8a4bf11b768b39745d74ec9c09e6dfdd644d7a93fc3221a4d82f6f185
"""

Mon Jul 15 15:53:20 2024    misc/stats_ja

         153837 function calls (150296 primitive calls) in 7.775 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     5188    2.632    0.001    2.632    0.001 {method 'sub' of 're.Pattern' objects}
     1299    2.233    0.002    2.243    0.002 {built-in method io.open}
     1298    1.815    0.001    1.888    0.001 {method 'read' of '_io.TextIOWrapper' objects}
     1297    0.223    0.000    0.223    0.000 {method 'search' of '_regex.Pattern' objects}
     1298    0.179    0.000    0.179    0.000 {method '__exit__' of '_io._IOBase' objects}
        1    0.155    0.155    7.738    7.738 <ipython-input-29-dcf61e3884ff>:62(mass_tokenize_japanese)
      223    0.094    0.000    0.094    0.000 socket.py:545(send)
     1298    0.062    0.000    0.062    0.000 {built-in method _codecs.utf_8_decode}
     1297    0.033    0.000    0.081    0.000 regex.py:449(_compile)
        1    0.028    0

'\nb20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf\nb20b783a3177517a157bf29c5d89fd8a47a1416580b76ce5f252c4de615008cf\n4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4\n4420efc565f17d826611b123eb45d04954e8a23fd83fd52d1f2bc51a272f27e4\n5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198\n5597b3218ddc51e0db6308218d9a5c757cc8e2ea8e212b69cb98ed82a25cb198\n44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4\n44611a969a079b225d1b408e6a9671b9fe58bbaae1d05dc3e94941ae16627cc4\n015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea\n015df2517c29fbda70f1ba7b9d8dcf8d6cd311b40c6c31008069790ba18feeea\n8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9\n8379c9822cad1e20fd6101614c18c1d6391592cca08567222109e011d6b022e9\nb0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896\nb0761540fd23bab5f5914a8fe89377bccaae2d762a85fb568d00d4c26bc18896\ncbbe7a32ef09a2723175d6c800bb8c54dce7b750a448ec2f401d5110cb0fa213\ncbbe7a3

## Chinese

In [None]:
!pip3 install jieba zhon nltk pandas



In [None]:
import jieba
import jieba.posseg as pseg
import pandas as pd
import re
from zhon.hanzi import punctuation
from nltk.corpus import stopwords

# Ensure nltk stopwords are downloaded
import nltk
nltk.download('stopwords')

# Load Chinese stop words
chinese_stop_words = stopwords.words('chinese')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def normalize_token(token):
    # Here we simply convert to lowercase for normalization
    # Additional normalization logic can be added as needed
    return token.lower()

def dictionary_form(token):
    # This function can be expanded to convert to a standard dictionary form
    # For now, we'll assume the token itself is its dictionary form
    return token

def process_text(text):
    cleaned_text = clean_text(text)
    words = pseg.cut(cleaned_text)

    data = []

    for word, flag in words:
        if word not in chinese_stop_words:
            data.append({
                'surface': word,
                'normalized': normalize_token(word),
                'dictionary': dictionary_form(word),
                'POS': flag
            })

    df = pd.DataFrame(data)
    return df

In [None]:
# Read the Chinese text document

# read all files into list in ./raw_sorted_policies/chinese_raw
chinese_policies = []
for filename in os.listdir('./raw_sorted_policies/chinese_raw'):
    with open(f'./raw_sorted_policies/chinese_raw/{filename}', 'r', encoding='utf-8') as f:
        chinese_policies.append(f.read())

print(len(chinese_policies))

444


In [None]:
text = chinese_policies[0]
print(text)
# df = process_text(text)
# print(df)

库洛游戏儿童个人信息保护政策 库洛游戏儿童个人信息保护政策 更新日期： 202 4 年6 月6 日 欢迎您使用库洛游戏提供的服务。 “库洛游戏”是指与您签署本协议的游戏服务的提供方——广州库洛科技有限公司（注册地址：广州市天河区棠下荷光三横路7号之一107房）及/或广州库洛数界科技有限公司（注册地址：广州市天河区棠下荷光三横路7号之一110之一房），在本协议中也称“我们”。 本政策系我们在 《库洛游戏个人信息保护政策》 基础上，针对儿童个人信息的加强保护所作的进一步补充说明。 本政策中 “儿童”特指不满十四周岁的未成年人 。 有关十四周岁以上（包括 14周岁）的用户的个人信息保护规则，请您查阅《库洛游戏个人信息保护政策》 予以了解。就儿童个人信息保护事宜，如本政策与 《库洛游戏个人信息保护政策》 存在不一致之处，以本政策为准；如本政策未约定，以 《库洛游戏个人信息保护政策》 为准。本政策所使用的全部术语，除本政策另有说明外，与 《库洛游戏用户协议》 、 《库洛游戏个人信息保护政策》 中的术语具有相同涵义。 如您为儿童的家长或其他监护人（以下统称 “监护人”），请您陪同您的孩子共同仔细完整阅读并理解本政策内容，如您为儿童用户，请要求您的监护人与您共同仔细完整阅读并理解本政策内容，尤其是以加粗、 添加下划线 等方式重点提示的条款 ，以清楚知悉在儿童接受或使用库洛游戏所提供的服务时，库洛游戏对儿童用户信息收集、存储、使用、共享的方式，以及儿童用户或其监护人对儿童个人信息享有的管理权利，进而在更好地了解库洛游戏所提供的服务的前提下根据儿童及其监护人的需求，由监护人对是否同意儿童接受服务，接受或使用何种服务做出适当的选择。 儿童个人信息的收集、使用、转移、披露等处理依法均应征得儿童监护人的同意。在未取得监护人同意的情形下，儿童不应擅自接受或使用库洛游戏所提供的服务，也不得自行向我们提供任何儿童个人信息或相关授权。 请您知悉、理解并同意，监护人于线上同意本政策，或儿童用户已接受或使用库洛游戏所提供的服务，视为儿童用户及其监护人理解并认同本政策所述的全部内容。 特别提醒您的是：不满十四周岁未成年人的个人信息属于敏感个人信息。请您充分、谨慎考虑后再选择是否提供或授权。 若您有任何问题、意见或建议，请查看本政策 “联系我们”载明的方式与我们联系。 本政策将帮助您了解以下内容： · 一

In [None]:


# with open('chinese_text.txt', 'r', encoding='utf-8') as file:
#     text = file.read()

# # Process the text and get the DataFrame
# df = process_text(text)

# # Output the DataFrame
# print(df)