In [None]:
# !pip install git+https://github.com/haven-jeon/PyKoSpacing.git 

In [None]:
import csv
import random
import pandas as pd

# Draw a random sample of review rows from a CSV file.
def random_lines_from_csv(filename, num_lines):
    """Return ``num_lines`` randomly chosen data rows from a CSV file.

    The first row of the file is consumed as a header and is never part of
    the sample.

    Args:
        filename: Path of the UTF-8 encoded CSV file to read.
        num_lines: Number of rows to draw (without replacement).

    Returns:
        A list of rows (each row is a list of strings) on success.
        On failure nothing is raised: a string starting with ``"Error: "``
        is returned instead, so callers must check the type of the result.
    """
    try:
        with open(filename, 'r', newline='', encoding='utf-8') as handle:
            rows = csv.reader(handle)
            next(rows, None)  # consume the header row, if the file has one
            data_rows = [row for row in rows]
            if len(data_rows) < num_lines:
                return "Error: 파일에 지정된 줄 수보다 추출할 줄 수가 더 많습니다."
            return random.sample(data_rows, num_lines)
    except FileNotFoundError:
        return f"Error: 파일을 찾을 수 없습니다: {filename}"
    except Exception as exc:
        # Any other failure is reported the same way, as text.
        return f"Error: {exc}"

# Example usage
filename = 'all_reviews.csv'  # path of the source CSV file
num_lines_to_extract = 50  # number of rows to sample

extracted_lines = random_lines_from_csv(filename, num_lines_to_extract)

if isinstance(extracted_lines, list):
    # Only a list is a real sample. Build and save the frame inside this
    # branch: random_lines_from_csv reports failures as a plain string, and
    # passing that string to pd.DataFrame would raise before the message
    # could be shown.
    # NOTE(review): the rows carry no header and the file is written with
    # header=False, while a later cell reads random_reviews.csv by the column
    # name 'text' — confirm the file ends up with the expected header.
    new_df = pd.DataFrame(extracted_lines)
    new_df.to_csv('random_reviews.csv', encoding='utf-8-sig', header=False, index=False)

    for line in extracted_lines:
        print(line)
else:
    # Show the error text returned instead of a sample.
    print(extracted_lines)

In [None]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import time
import os
from pykospacing import Spacing

# passportKey query parameter sent to the speller endpoint.
# Set the NAVER_PASSPORT_KEY environment variable to supply a key without
# editing the notebook; the literal below is only the fallback so existing
# runs behave as before.
PASSPORT_KEY = os.environ.get("NAVER_PASSPORT_KEY", "75a85e164818d0ac70522d024aba10f1392a1a44")

# Endpoint queried by spell_check_naver.
BASE_URL = "https://m.search.naver.com/p/csearch/ocontent/util/SpellerProxy"
# Single shared PyKoSpacing instance.
spacing = Spacing()

def remove_tags(text: str) -> str:
    """Strip markup from ``text`` and return only its text content.

    The input is wrapped in a ``<content>`` root element, every literal
    ``<br>`` is removed, and the result is parsed as XML. The text of all
    nodes is then concatenated in document order. Parse errors raised by
    ``ET.fromstring`` propagate to the caller.
    """
    wrapped = f'<content>{text}</content>'
    without_breaks = wrapped.replace('<br>', '')
    root = ET.fromstring(without_breaks)
    return ''.join(root.itertext())

def spell_check_naver(text: str, timeout: float = 10.0) -> tuple:
    """Send ``text`` to the speller endpoint and return the corrected text.

    Args:
        text: Text to check.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely on a stalled connection, which
            would block a whole batch run.

    Returns:
        ``(corrected_text, errata_count)`` on success. On any failure
        (network error, timeout, HTTP error status, unexpected payload,
        unparsable markup) the error is printed and ``(text, -1)`` is
        returned so that callers can keep going.
    """
    params = {"passportKey": PASSPORT_KEY, "q": text, "color_blindness": "0"}
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": "https://search.naver.com/"
    }
    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        # Report HTTP failures by status instead of as a JSON decoding error.
        response.raise_for_status()
        data = response.json()['message']['result']
        corrected = remove_tags(data.get('html', ''))
        return corrected, data.get('errata_count', 0)
    except Exception as e:
        # Deliberate best effort: one bad request must not abort the run.
        print(f"❌ 오류: {e}")
        return text, -1

def process_in_batches(
    input_path: str,
    batch_size: int = 100,
    text_column: str = "review_content",
    start_idx: int = 37595,
    output_path: str = "output_partial.csv",
    delay: float = 1.5,
):
    """Spell-check a CSV column batch by batch, appending results to disk.

    Each batch is written to ``output_path`` as soon as it is finished, so an
    interrupted run keeps everything saved so far. ``output_path`` is only
    ever appended to, never truncated; remove it yourself before a fresh run
    to avoid duplicated rows.

    Args:
        input_path: UTF-8 CSV file to read.
        batch_size: Number of rows processed between two saves.
        text_column: Name of the column holding the text to check.
        start_idx: Row position at which processing starts. The default is
            the resume position that used to be hard-coded in this function;
            pass ``0`` to process the file from the beginning.
        output_path: CSV file that receives the per-batch results.
        delay: Seconds to sleep after every request.

    After the last batch, ``output_path`` is read back and written again as
    ``output_final.csv``.
    """
    df = pd.read_csv(input_path, encoding='utf-8')
    total = len(df)

    for start in range(start_idx, total, batch_size):
        end = min(start + batch_size, total)
        batch_df = df.iloc[start:end].copy()

        corrected_list = []
        error_count_list = []

        for text in batch_df[text_column]:
            corrected, error_count = spell_check_naver(str(text))
            corrected_list.append(corrected)
            error_count_list.append(error_count)

            time.sleep(delay)

        batch_df['교정된 내용'] = corrected_list  # corrected text
        batch_df['맞춤법 오류수'] = error_count_list  # error count (-1 when the request failed)

        # Write the header only when the file is being created.
        batch_df.to_csv(output_path, mode='a', index=False, header=not os.path.exists(output_path), encoding='utf-8-sig')
        print(f"✅ {start}~{end} 저장 완료")

    # Save the accumulated partial results as the final file.
    final_df = pd.read_csv(output_path, encoding='utf-8-sig')
    final_df.to_csv("output_final.csv", index=False, encoding='utf-8-sig')
    print("🎉 최종 저장 완료: output_final.csv")


In [6]:
# Start batch spell-checking of y_duple_reviews.csv, leaving every other
# argument of process_in_batches at its default.
process_in_batches(input_path="y_duple_reviews.csv")

KeyboardInterrupt: 

In [None]:
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from pykospacing import Spacing

# Key and address used when querying the Naver spell checker.
# Set the NAVER_PASSPORT_KEY environment variable to supply a key without
# editing the notebook; the literal below is only the fallback so existing
# runs behave as before.
PASSPORT_KEY = os.environ.get("NAVER_PASSPORT_KEY", "dbc0dc1e7ae1070bacecf1a3865c72b5d121abc2")
BASE_URL = "https://m.search.naver.com/p/csearch/ocontent/util/SpellerProxy"

# Single shared PyKoSpacing instance.
spacing = Spacing()

def remove_tags(text: str) -> str:
    """Return the plain text of ``text`` with all markup removed.

    Literal ``<br>`` tags are dropped first; what remains is parsed as the
    body of a ``<content>`` XML element and the text of every node is joined
    in document order. Parse errors from ``ET.fromstring`` are not caught.

    Note: an identical function of the same name is defined in an earlier
    cell of this notebook; this definition replaces it once the cell runs.
    """
    document = '<content>{}</content>'.format(text).replace('<br>', '')
    pieces = list(ET.fromstring(document).itertext())
    return ''.join(pieces)

def spell_check_naver(text: str, timeout: float = 10.0) -> tuple:
    """Send ``text`` to the speller endpoint and return the corrected text.

    Args:
        text: Text to check.
        timeout: Seconds to wait for the HTTP response. Without a timeout,
            ``requests`` waits indefinitely on a stalled connection, which
            would block the whole processing loop.

    Returns:
        ``(corrected_text, errata_count)`` on success. On any failure
        (network error, timeout, HTTP error status, unexpected payload,
        unparsable markup) the error is printed and ``(text, -1)`` is
        returned so that callers can keep going.
    """
    params = {
        "passportKey": PASSPORT_KEY,
        "q": text,
        "color_blindness": "0"
    }
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Referer": "https://search.naver.com/"
    }
    try:
        response = requests.get(BASE_URL, params=params, headers=headers, timeout=timeout)
        # Report HTTP failures by status instead of as a JSON decoding error.
        response.raise_for_status()
        data = response.json()['message']['result']
        html = data.get('html', '')
        corrected = remove_tags(html)
        return corrected, data.get('errata_count', 0)
    except Exception as e:
        # Deliberate best effort: one bad request must not abort the run.
        print(f"❌ 오류: {e}")
        return text, -1

def extract_diff_words(original: str, corrected: str) -> str:
    """Describe the word-level differences between two texts.

    Both texts are split on whitespace and compared position by position.
    Each differing pair is rendered as ``"old→new"``. Words with no partner
    because one text is longer are rendered as ``"old→"`` (only in
    ``original``) or ``"→new"`` (only in ``corrected``).

    The comparison is purely positional: if the correction changes the word
    count, later pairs may be reported as different because they are shifted.

    Args:
        original: Text before correction.
        corrected: Text after correction.

    Returns:
        The differences joined by ``", "``; an empty string if there are none.
    """
    ori_words = original.split()
    cor_words = corrected.split()
    shared = min(len(ori_words), len(cor_words))

    diffs = [f"{o}→{c}" for o, c in zip(ori_words, cor_words) if o != c]

    # At most one of these slices is non-empty. Zipping them against each
    # other (as was done before) always yielded nothing, so surplus words
    # were silently dropped from the report.
    diffs.extend(f"{o}→" for o in ori_words[shared:])
    diffs.extend(f"→{c}" for c in cor_words[shared:])

    return ", ".join(diffs)

def process_and_add_columns(input_path: str, output_path: str, text_column: str = "review", delay: float = 0.8):
    """Spell-check one CSV column and save the file with result columns added.

    Args:
        input_path: UTF-8 CSV file to read.
        output_path: Destination CSV file (written as UTF-8 with BOM).
        text_column: Name of the column holding the text to process.
        delay: Seconds to sleep after every spell-check request.

    Columns added, one value per input row:
        spacing_fixed: ``spacing`` applied to the original text.
        완성: ``spacing`` applied to the spell-checked text.
        corrected_review: text returned by ``spell_check_naver``.
        spell_error_count: error count from ``spell_check_naver`` (-1 when
            the request failed).
        spell_diff: word differences from ``extract_diff_words``; empty
            unless the error count is positive.
    """
    df = pd.read_csv(input_path, encoding='utf-8')

    spacing_fixed_list = []
    completed_list = []
    corrected_list = []
    error_count_list = []
    diff_list = []

    for raw_text in df[text_column]:
        text = str(raw_text)

        # 1. Spacing-only correction of the original text.
        text_spaced = spacing(text)

        # 2. Naver spell check.
        corrected, error_count = spell_check_naver(text)

        # 3. Extract the differences.
        # The previous code read text_spaced here before assigning it, which
        # raised UnboundLocalError on the first row with errors and compared
        # against the previous row afterwards.
        # NOTE(review): comparing the spacing-fixed original with the
        # corrected text is assumed to be the intent — confirm.
        diff = extract_diff_words(text_spaced, corrected) if error_count > 0 else ""

        spacing_fixed_list.append(text_spaced)
        # Collected per row; a single scalar used to be broadcast to all rows.
        completed_list.append(spacing(corrected))
        corrected_list.append(corrected)
        error_count_list.append(error_count)
        diff_list.append(diff)

        time.sleep(delay)

    df['spacing_fixed'] = spacing_fixed_list
    df['완성'] = completed_list
    df['corrected_review'] = corrected_list
    df['spell_error_count'] = error_count_list
    df['spell_diff'] = diff_list

    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\n✅ 결과 저장 완료: {output_path}")


In [None]:
# Module-level result lists, each reset to a separate empty list.
corrected_list, error_count_list, diff_list = [], [], []

In [None]:
# Process the sampled reviews, reading the text from the 'text' column.
inputPath = 'random_reviews.csv'
outputfile = 'result_random.reviews.csv'
process_and_add_columns(
    input_path=inputPath,
    output_path=outputfile,
    text_column='text',
)

time.sleep(0.8)
# df.to_csv('random_.csv', encoding='utf-8-sig', header=True)

In [None]:
# Process dict_review_texts.csv using the default text column of
# process_and_add_columns.
input_file = "dict_review_texts.csv"
output_file = "리뷰_띄어쓰기_맞춤법_교정.csv"

process_and_add_columns(input_path=input_file, output_path=output_file)