<a href="https://colab.research.google.com/github/Tiabet/Project_Market/blob/master/text_preprocessing_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import unicodedata
import pandas as pd
from hanspell import spell_checker
import re
import sys

def normalize_unicode(text):
    """Return *text* converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so compatibility variants (e.g. full-width letters,
    ligatures) are folded to their plain equivalents.
    """
    normalized = unicodedata.normalize('NFKC', text)
    return normalized

def correct_spelling(text):
    """Return the spell-checked version of *text*.

    Passes *text* to ``spell_checker.check`` (from ``hanspell``) and
    returns the ``checked`` attribute of the result object.
    """
    return spell_checker.check(text).checked

def apply_regex(text):
    """Strip noise characters from *text* with a fixed sequence of regexes.

    The substitutions run in this order:

    1. Remove runs of standalone Hangul compatibility jamo and ASCII digits.
    2. Remove the literal two-character sequence ``ᄒᄒ``.
    3. Replace each listed punctuation/symbol character with one space.
    4. Remove ASCII letters.
    5. Collapse whitespace runs (and single tabs) to one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    # Every pattern is a raw string so regex escapes such as ``\?`` and
    # ``\[`` are not treated as (invalid) Python string escapes.
    text = re.sub(r'[ㄱ-ㅎㅏ-ㅣ0-9]+', '', text)
    # NOTE(review): these look like the conjoining-jamo form of "ㅎㅎ",
    # presumably what NFKC normalization leaves behind -- confirm.
    text = re.sub(r'ᄒᄒ', '', text)
    # One space per matched character; runs are collapsed by the last step.
    text = re.sub(r'[-=+,#/\?:^.@*\"※~ㆍ!』‘|\[\]`\'…》\”\“\’·]', ' ', text)
    # A longer run of letters is consumed in chunks of two (then one), so
    # this removes every ASCII letter, not only one- or two-letter words.
    text = re.sub(r'[a-zA-Z]{1,2}', '', text)
    text = re.sub(r'\s{2,}|\t', ' ', text)
    return text

def preprocess_file(input_filename, output_filename):
    """Clean the ``review`` column of a TSV file and write the result.

    Reads *input_filename* as tab-separated data, runs each value of the
    ``review`` column through ``normalize_unicode``, ``correct_spelling``
    and ``apply_regex`` (in that order), and writes the whole frame back
    out as TSV without the index column.

    Args:
        input_filename: Path of the tab-separated input file; it must
            contain a ``review`` column.
        output_filename: Path the preprocessed TSV is written to.

    Raises:
        KeyError: If the input has no ``review`` column.
    """
    # The input is a TSV file, hence sep='\t'.
    df = pd.read_csv(input_filename, sep='\t')

    # Plain sequential application of the step functions. The previous
    # code built a `Pipeline` that was never imported, was missing a comma
    # between two steps, and listed the spell-check step twice; the spell
    # check now runs once.
    steps = (normalize_unicode, correct_spelling, apply_regex)

    reviews = df['review']
    for step in steps:
        reviews = reviews.apply(step)

    df['review'] = reviews
    # sep='\t' so the output is written as TSV as well.
    df.to_csv(output_filename, sep='\t', index=False)

if __name__ == "__main__":
    # Expect exactly two arguments after the script name: input and output.
    if len(sys.argv) == 3:
        input_filename, output_filename = sys.argv[1:]
        preprocess_file(input_filename, output_filename)
    else:
        print("Usage: python preprocessing_script.py input.tsv output_preprocessed.tsv")
