## 데이터 가져오기

In [None]:
from google.colab import drive

import csv
import pandas as pd

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the project files are reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; the data and vocab paths used later in the notebook are built from it.
base_path = '/content/drive/MyDrive/23-1_online/빅데이터 처리/팀플/Big Data Processing'

In [None]:
# Location of the raw sample CSV inside the project's data folder.
sample_text_data_path = f'{base_path}/data/sample_text_data.csv'

In [None]:
# Open the CSV as UTF-8 text; errors='replace' substitutes undecodable bytes instead of raising,
# and pandas then parses the already-decoded file handle.
with open(sample_text_data_path, 'r', encoding='UTF8', errors='replace') as sample_text_data_file:
  sample_text_data_df = pd.read_csv(sample_text_data_file)

## 데이터 통합 및 저장

In [None]:
# Flatten the whole DataFrame (no header row, no index column) into one plain-text string;
# this text file is what the tokenizer is trained on later in the notebook.
sample_text_data_str = sample_text_data_df.to_string(header=False, index=False)
sample_text_data_str_file_address = base_path + '/data/sample_text_data_str_file.txt'

# Write with an explicit UTF-8 encoding: the text contains non-ASCII characters, and relying on
# the platform default encoding could fail or produce a file that is decoded differently on read.
with open(sample_text_data_str_file_address, 'w', encoding='utf-8') as corpus_file:
    corpus_file.write(sample_text_data_str)

In [None]:
# Display the loaded sample so the raw text columns can be inspected before preprocessing.
sample_text_data_df

Unnamed: 0,Currently popular,Game Description
0,"At first glance, Galactic Bowling appear...",Galactic Bowling is an exaggerated and stylize...
1,One Finger Death PunchBut it's adorable!...,THE LAW!!Looks to be a showdown atop a train. ...
2,,Jolt Project:The army now has a new robotics p...
3,IntroductionWhile 2D platformer and puzzle ga...,"In bizarre lands, play as a viscous glob of wa..."
4,There’s a reason why visual novels have ...,ABOUT THE GAMEPlay as a hacker who has arrange...
5,Played this game about 10 years ago for...,Feel tired of auto-fight? Feel tired of boring...
6,"So, this was one of the Steam Next Fest ...","TD Worlds is a dynamic, highly strategical gam..."
7,,When the Roman people honored a simple warrior...
8,I came across this game on Play Store. I...,"""MazM: Jekyll and Hyde"" is a darkly entertaini..."
9,"Despite the name Rotten Edition, this ga...",Death is lonely. He has zero friends on his Fa...


## 각종 전처리

In [None]:
!pip install nltk
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [None]:
import nltk
# Download the NLTK 'stopwords' corpus; it is read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd
import sys
import os

from string import punctuation
from nltk.corpus import stopwords
import argparse
from tokenizers import BertWordPieceTokenizer
import json

In [None]:
class TextPreprocessor:
  """Tokenize and clean the text columns of a DataFrame with a WordPiece tokenizer.

  Intended call order:
  ``set_argment_parser`` -> ``train_tokenizer`` -> ``preprocess_vocab_file`` ->
  ``tokenize_data`` -> ``remove_stopwords_data`` -> ``join_tokens``.
  The cleaned frame is then available as ``preprocessed_data``.

  Lower-casing is performed by the tokenizer itself (``lowercase=True``), so there is
  no separate lower-casing step.

  NOTE(review): stop words are removed *after* WordPiece splitting, so a piece that
  happens to equal a stop word is dropped even when it is only a fragment of a longer
  word - confirm this is intended.

  Attributes:
    data_df: input DataFrame; must contain every column named in ``TEXT_COLUMNS``.
    text_data: free-form text used only by ``remove_special_char`` (empty by default).
    token_data: DataFrame of token lists, filled by ``tokenize_data``.
    preprocessed_data: DataFrame of space-joined tokens, filled by ``join_tokens``.
    parser / args: argparse parser and the parsed tokenizer hyper-parameters.
    tokenizer: the trained ``BertWordPieceTokenizer``.
    base_path: project root; the tokenizer JSON is saved under ``<base_path>/vocab``.
    text_data_path: path of the plain-text corpus the tokenizer is trained on.
    vocab_path: path of the saved tokenizer JSON (set by ``train_tokenizer``).
    vocab_txt_path: path of the one-token-per-line vocabulary export
      (set by ``preprocess_vocab_file``).
  """

  # Columns of ``data_df`` that are tokenized, filtered and re-joined.
  TEXT_COLUMNS = ('Currently popular', 'Game Description')

  def __init__(self, data_df, base_path, text_data_path):
    """Store the inputs; nothing is read, trained or written here."""
    self.data_df = data_df
    self.text_data = ''
    self.token_data = ''
    self.preprocessed_data = ''

    self.parser = ''
    self.args = ''
    self.tokenizer = ''

    self.base_path = base_path
    self.text_data_path = text_data_path
    self.vocab_path = ''
    self.vocab_txt_path = ''

    # Lazily loaded English stop-word set (see _load_stop_words).
    self._stop_words = None

  def remove_special_char(self):
    """Strip every ``string.punctuation`` character from ``self.text_data``.

    Nothing in this class assigns ``text_data`` besides ``__init__`` (empty string),
    so this is a no-op unless the caller sets ``text_data`` first.
    """
    # TODO (from the original author): depending on model performance, refine this
    # using explicit encoding ranges instead of the plain punctuation list.
    for p in punctuation:
      self.text_data = self.text_data.replace(p, '')

  def set_argment_parser(self):
    """Build the argparse parser and store the default hyper-parameters in ``self.args``.

    An explicit empty argument list is parsed, so the result does not depend on the
    command line of the notebook kernel and ``sys.argv`` is left untouched.
    """
    self.parser = argparse.ArgumentParser()

    self.parser.add_argument('--corpus_file', type=str)
    self.parser.add_argument('--vocab_size', type=int, default=30000) # eng default = 30,000 / with chinese = 21,128
    self.parser.add_argument('--limit_alphabet', type=int, default=6000) # eng default = 1,000 / with chinese = (much more)

    self.args = self.parser.parse_args([])

  # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
  set_argument_parser = set_argment_parser

  def train_tokenizer(self):
    """Train a WordPiece tokenizer on ``text_data_path`` and save it under ``<base_path>/vocab``.

    Requires ``set_argment_parser`` to have been called (reads ``self.args``).
    Sets ``self.tokenizer`` and ``self.vocab_path``.
    """
    # Original author's note: WordPiece for now; a word-level tokenizer is another
    # option; xlnet -> embedding.
    self.tokenizer = BertWordPieceTokenizer(
        clean_text = True,
        handle_chinese_chars = True,
        strip_accents = False,
        lowercase = True,  # False recommended (let's check)
        wordpieces_prefix = '##'
    )

    self.tokenizer.train(
        files = [self.text_data_path],
        limit_alphabet = self.args.limit_alphabet,
        vocab_size = self.args.vocab_size
    )

    # Use the instance's own base path rather than a notebook-level global of the same name.
    self.vocab_path = self.base_path + f'/vocab/ch-{self.args.limit_alphabet}-wpm-{self.args.vocab_size}-pretty'
    os.makedirs(os.path.dirname(self.vocab_path), exist_ok=True)
    # NOTE(review): the second positional argument is presumably `pretty` (tokenizers 0.13) - confirm.
    self.tokenizer.save(self.vocab_path, True)

  def preprocess_vocab_file(self):
    """Export the trained vocabulary as a text file with one token per line.

    Reads the tokenizer JSON at ``self.vocab_path`` and writes its
    ``["model"]["vocab"]`` keys to ``self.vocab_path + '.txt'``; that path is stored
    in ``self.vocab_txt_path``. The training corpus at ``text_data_path`` is not
    modified.
    """
    # Read first, so a missing or malformed JSON file cannot leave a half-written export.
    with open(self.vocab_path, encoding='utf-8') as json_file:
      vocab = json.load(json_file)["model"]["vocab"]

    self.vocab_txt_path = self.vocab_path + '.txt'
    with open(self.vocab_txt_path, 'w', encoding='utf-8') as vocab_txt_file:
      for item in vocab.keys():
        vocab_txt_file.write(item+'\n')

  def tokenize_str(self, s):
    """Return the tokenizer's tokens for ``s``, or ``[]`` when ``s`` is not a string (e.g. NaN)."""
    if isinstance(s, str):
      return self.tokenizer.encode(s).tokens
    return []

  def tokenize_data(self):
    """Tokenize every column in ``TEXT_COLUMNS`` of ``data_df`` into ``self.token_data``."""
    self.token_data = pd.DataFrame()
    for column in self.TEXT_COLUMNS:
      self.token_data[column] = self.data_df[column].apply(self.tokenize_str)

  def _load_stop_words(self):
    """Return the NLTK English stop-word set, loading it only on first use."""
    cached = getattr(self, '_stop_words', None)
    if cached is None:
      cached = frozenset(stopwords.words('english'))
      self._stop_words = cached
    return cached

  def remove_stopwords_tokens(self, tokens):
    """Return ``tokens`` without the entries that are English stop words (order preserved)."""
    stop_words = self._load_stop_words()
    return [token for token in tokens if token not in stop_words]

  def remove_stopwords_data(self):
    """Remove stop words from every token list in ``self.token_data`` (updates it in place)."""
    for column in self.TEXT_COLUMNS:
      self.token_data[column] = self.token_data[column].apply(self.remove_stopwords_tokens)

  def join_tokens(self):
    """Join each token list with single spaces and store the result in ``self.preprocessed_data``."""
    self.preprocessed_data = pd.DataFrame()
    for column in self.TEXT_COLUMNS:
      self.preprocessed_data[column] = self.token_data[column].apply(' '.join)


In [None]:
# Run the whole preprocessing pipeline on the sample data; the calls below must stay in this order.
tp = TextPreprocessor(sample_text_data_df, base_path, sample_text_data_str_file_address)
tp.remove_special_char()    # acts on tp.text_data, which is still the empty string set in __init__, so nothing changes here
tp.set_argment_parser()     # fills tp.args with the tokenizer hyper-parameters (defaults: no CLI flags are supplied)
tp.train_tokenizer()        # trains WordPiece on the text dump and saves the tokenizer JSON under <base_path>/vocab
tp.preprocess_vocab_file()  # exports the learned vocabulary, one token per line
tp.tokenize_data()          # tokenizes both text columns into tp.token_data
tp.remove_stopwords_data()  # drops NLTK English stop words from the token lists
tp.join_tokens()            # joins the remaining tokens back into space-separated strings
tp.preprocessed_data        # last expression of the cell: display the cleaned DataFrame

Unnamed: 0,Currently popular,Game Description
0,"first gl ##ance , galactic bowling appears whi...",galactic bowling ex ##ag ##ger ##ated sty ##li...
1,one fin ##ger death pu ##n ##ch ##b ##ut ' ad ...,law ! ! looks show ##d ##own ##op train . last...
2,,jolt project : army new ro ##bo ##t ##ics proj...
3,introd ##uction ##while 2 ##d platformer puzzl...,"bizarre land ##s , play vis ##cou ##s gl ##o #..."
4,’ reason visual novels click - - get - - - nex...,gameplay hacker arr ##ange ##d de ##al gang ##...
5,played game 10 years ago couple weeks good tim...,feel tired auto - fight ? feel tired boring nu...
6,", one steam next fest demo ##s , really lookin...","td worlds dynamic , highly strateg ##ical game..."
7,,roman people hon ##ored simple war ##ri ##or v...
8,came across game play st ##ore . ran phone eas...,""" mazm : jekyll hyde "" dark ##ly entertaining ..."
9,"despite name rotten edition , game rotten game...",death lo ##n ##el ##y . z ##ero friends fa ##c...


In [None]:
# Persist the cleaned text columns next to the raw sample; the DataFrame index is not written.
sample_preprocessed_data_address = f'{base_path}/data/sample_preprocessed_data.csv'
tp.preprocessed_data.to_csv(path_or_buf=sample_preprocessed_data_address, index=False)