In [81]:
import sqlite3
import pandas as pd

# 데이터베이스 파일 경로
db_path = r"C:\Users\Playdata\Desktop\만개 전처리\재료업데이트만개총데이터터.db"

# 데이터베이스 연결
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# 데이터베이스 내 테이블 목록 확인
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

# 각 테이블의 컬럼 구조 확인
table_structures = {}
for table in tables:
    table_name = table[0]
    cursor.execute(f"PRAGMA table_info({table_name});")
    columns = cursor.fetchall()
    table_structures[table_name] = [(col[1], col[2]) for col in columns]

# 테이블 구조 출력
table_structures



{'recipes': [('id', 'INTEGER'),
  ('name', 'TEXT'),
  ('intro', 'TEXT'),
  ('info', 'TEXT'),
  ('photo', 'TEXT'),
  ('recipe', 'TEXT'),
  ('views', 'TEXT'),
  ('video', 'TEXT'),
  ('category', 'TEXT'),
  ('date', 'TEXT'),
  ('ingredients', 'TEXT')]}

In [82]:
# recipes 테이블의 데이터 로드
df_recipes = pd.read_sql_query("SELECT * FROM recipes;", conn)

# NULL 값 개수 확인
null_counts = df_recipes.isnull().sum()

# 값이 없는 항목 (빈 문자열 포함) 개수 확인
empty_counts = (df_recipes == "").sum()

# NULL, NaN, 빈 값 개수 합산
total_missing_counts = null_counts + empty_counts

# 결과 출력
total_missing_counts

id                  0
name                0
intro           13164
info            11079
photo               0
recipe          13220
views               0
video          230307
category            0
date                0
ingredients      9327
dtype: int64

In [83]:
# 전체 데이터 개수 확인

df_recipes.shape[0]

230927

In [84]:
# 'ingredients' 또는 'recipe'가 비어 있는 행 삭제

df_cleaned = df_recipes[(df_recipes['ingredients'] != "") & (df_recipes['recipe'] != "")]

# 삭제 후 전체 데이터 개수 확인
df_cleaned.shape[0]

216756

In [85]:
import re

# name column 전처리

# 남길 문자: 대괄호([])와 퍼센트(%)만 제외하고 모두 제거
remove_chars_extended = r'[^a-zA-Z0-9가-힣\s\[\]%]+'

# 'name' 컬럼에서 지정한 특수문자 제거
df_cleaned['name'] = df_cleaned['name'].apply(lambda x: re.sub(remove_chars_extended, '', x))
df_cleaned['intro'] = df_cleaned['intro'].apply(lambda x: re.sub(remove_chars_extended, '', x))

# 일본어, 한자, 공백 문자(Zero-width space, BOM 포함) 제거
remove_chars_jp_hanja_space = r'[\u3000-\u303F\u3040-\u30FF\u31F0-\u31FF\u3200-\u32FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\u200B\uFEFF]+'

# 'name' 컬럼에서 해당 문자 제거
df_cleaned['name'] = df_cleaned['name'].apply(lambda x: re.sub(remove_chars_jp_hanja_space, '', x))
df_cleaned['intro'] = df_cleaned['intro'].apply(lambda x: re.sub(remove_chars_jp_hanja_space, '', x))

# Non-Breaking Space (`\xa0`) 제거
df_cleaned['name'] = df_cleaned['name'].apply(lambda x: x.replace("\xa0", " "))
df_cleaned['intro'] = df_cleaned['intro'].apply(lambda x: x.replace("\xa0", " "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['name'] = df_cleaned['name'].apply(lambda x: re.sub(remove_chars_extended, '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['intro'] = df_cleaned['intro'].apply(lambda x: re.sub(remove_chars_extended, '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['nam

In [86]:
# 중복된 단어를 정리하는 함수 정의
def remove_duplicate_words(text):
    words = text.split()  # 공백을 기준으로 단어 분리
    seen = set()
    unique_words = [word for word in words if not (word in seen or seen.add(word))]  # 중복 제거
    return " ".join(unique_words)  # 다시 공백을 포함하여 합치기

# 'name' 컬럼에 적용하여 중복 단어 제거
df_cleaned['name'] = df_cleaned['name'].apply(remove_duplicate_words)
df_cleaned['intro'] = df_cleaned['intro'].apply(remove_duplicate_words)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['name'] = df_cleaned['name'].apply(remove_duplicate_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['intro'] = df_cleaned['intro'].apply(remove_duplicate_words)


In [87]:
import string

# 남길 문자: %, (, ), *, +, -, /, :, cm, 中, 大, 小, x, ½, ~, =
allowed_chars_ingredients_extended = set("[],.%()*+/-:cm中大小x½~=" + string.ascii_letters + string.digits + string.whitespace + "".join(chr(i) for i in range(0xAC00, 0xD7A4)))  # 한글 포함

# 제거할 문자 필터링
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(
    lambda x: "".join([char for char in x if char in allowed_chars_ingredients_extended])
)

# '*'를 'x'로 변환하고 '-'를 '~'로 변환
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: x.replace('*', 'x').replace('-', '~'))

# 대괄호([]) 뒤에 오는 쉼표 제거
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: re.sub(r'\],', ']', x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: x.replace('*', 'x').replace('-', '~'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_clea

In [88]:
# ' 구매' 제거
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: x.replace(' 구매', ''))

# 대괄호([]) 뒤에 오는 쉼표 제거
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: re.sub(r'\],', ']', x))

# '[재료]' 제거 및 연속된 쉼표 정리
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: re.sub(r'\[재료\]', '', x))
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: re.sub(r',+', ',', x).strip(','))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: x.replace(' 구매', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(lambda x: re.sub(r'\],', ']', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredi

In [89]:
# 재료명과 단위 사이의 쉼표를 '-'로 변환하는 함수
def replace_comma_with_dash(text):
    return re.sub(r'(\S)\s*,\s*(\d+[\w%]*)', r'\1-\2', text)

# 'ingredients' 컬럼에 적용
df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(replace_comma_with_dash)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['ingredients'] = df_cleaned['ingredients'].apply(replace_comma_with_dash)


- RECIPE 칼럼

In [90]:
# 'recipe' 컬럼에서 등장하는 한자 찾기
hanja_pattern = re.compile(r'[\u4E00-\u9FFF]+')  # 한자 범위

# recipe 컬럼에서 한자가 포함된 부분 추출
hanja_found = set()
for text in df_recipes['recipe'].dropna():
    hanja_found.update(hanja_pattern.findall(text))

# 한자 목록 출력
hanja_found

{'一',
 '一锅汤',
 '三',
 '上',
 '下',
 '中',
 '中濃',
 '中華',
 '中辛',
 '丼',
 '乃',
 '乗',
 '井',
 '京酱',
 '京酱肉丝',
 '人',
 '伏',
 '会影响慕斯口感',
 '倾斜打蛋盆',
 '元宵',
 '元宵餠',
 '克纯净水浸泡片刻',
 '六分发',
 '冷',
 '冷藏待用',
 '分发后冷藏备用',
 '切',
 '前',
 '十',
 '半分',
 '可以看到打蛋器头上的奶油尖尖朝下',
 '右',
 '吃得软的可以打到四五分',
 '吉利丁粉加',
 '否则可能油水分离',
 '味味味',
 '喜欢蛋糕的',
 '四骨',
 '地瓜球',
 '外',
 '多',
 '夜',
 '大',
 '大戸屋',
 '大雪',
 '奶油可缓慢流动',
 '奶花',
 '好',
 '完成',
 '容易打发',
 '将饼干碎与融化后的黄油拌匀倒入模具压实',
 '将饼干碎换掉直接铺蛋糕即可',
 '小',
 '小雪',
 '川',
 '左',
 '巧克力最佳溶解温度',
 '巧克力隔水加热至融化',
 '巻',
 '巽',
 '必',
 '忍',
 '情',
 '打发奶油',
 '提起打蛋器',
 '斜',
 '时间打长了奶油太硬',
 '晩時之歎',
 '曰',
 '最高不可超过',
 '有',
 '東坡肉',
 '梅',
 '模具内铺一张底径大小的油纸',
 '死',
 '殺靑',
 '氣',
 '水卵器',
 '泡发后隔水加热至融化',
 '浅漬',
 '海鮮中華丼',
 '淡奶油先冷藏几个小时',
 '淡奶油加糖粉打至',
 '滷肉飯',
 '無',
 '牛尾蒸方',
 '生',
 '生春巻',
 '用',
 '甲',
 '石決明',
 '福',
 '第一排右',
 '第一排左',
 '第二排右',
 '第二排左',
 '筋',
 '節氣',
 '米',
 '總',
 '美',
 '美味',
 '芒種',
 '花椒粉',
 '茶葉蛋',
 '蛋糕底',
 '蛋黄中加入牛奶搅拌均匀后隔水加热并不断搅拌至略浓稠',
 '角',
 '豚丼',
 '赤',
 '足絲',
 '込',
 '适合做慕斯',
 '野菜',
 '量',
 '隔水加热融化',
 '雀',
 '雀巢淡奶油打发的奶油比较

In [None]:
def add_period_to_sentences(text):
    sentences = text.split('\n')  # 여러 줄이 있을 경우 개별적으로 처리
    updated_sentences = []
    
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and not sentence.endswith('.'):
            sentence += '.'
        updated_sentences.append(sentence)

    return '\n'.join(updated_sentences)

df_cleaned['recipe'] = df_cleaned['recipe'].apply(add_period_to_sentences)

In [91]:
import pandas as pd
import re

# NaN 값이 있으면 빈 문자열로 변환
df_cleaned['recipe'] = df_cleaned['recipe'].fillna('')

# 1. 불필요한 특수문자 제거 (이모지, 기호 등)
remove_chars = r'[【】▷▾▲☆★‼！☞✔✅❗❤❣⚡✌\ufeff\xa0]'
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(remove_chars, '', x))

# 2. 한글 자음/모음만 단독으로 존재하는 경우 제거 (예: ᄆ, ᅡ, ᆨ)
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(r'\b[ㄱ-ㅎㅏ-ㅣ]+\b', '', x))

# 3. 연속된 기호 정리 (예: `...` -> `.`)
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(r'([.!?])\1+', r'\1', x))

# 4. 한자 정리 (小, 中, 大만 남기고 나머지 한자 삭제)
def remove_unwanted_hanja(text):
    if pd.isnull(text):
        return text
    return re.sub(r'[\u4E00-\u9FFF&&[^\s小中大]]', '', text)  # 한자 범위에서 小中大만 남김

df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_unwanted_hanja)

# 5. 연속된 공백 제거
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# 6. 문장 정리
# - 여러 개의 줄바꿈(\n)을 하나로 통일
# - 문장 부호(.,!?) 뒤에 공백 추가 (일관된 문장 구조 유지)
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(r'(\n\s*)+', '\n', x))
df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(r'([.,!?])([^\s])', r'\1 \2', x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: re.sub(remove_chars, '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(lambda x: r

In [92]:
# 특수 문자 제거
def remove_unwanted_special_chars(text):
    if text is None:
        return text
    return ''.join(char for char in text if char not in {'!', '"', '#', '&', "'", ':', ';', '?', '@', '\\', '^', '`', '|', '}', '¡', '°', '´', '·', '˂', '˃', '˘', '˙', '˚', '˵', '˶', '̀', '́', '̆', '̈', '̑', '̣', '̵', '̶', '͈', '͜', '͡', '–', '‘', '’', '“', '”', '•', '…', '‸', '※', '‼', '↑', '→', '↓', '↘', '∀', '∇', '∧', '∨', '≦', '≧', '⊙', '⌓', '⌣', '⌯', '╹', '■', '▣', '▪', '▲', '△', '▶', '▷', '►', '▼', '▾', '◀', '◆', '◈', '○', '◍', '◎', '●', '◡', '◼', '★', '☆', '☌', '☞', '☺', '♠', '♡', '♣', '♤', '♥', '♧', '♨', '♩', '♪', '♬', '⚡', '✅', '✌', '✔', '✨', '✿', '❌', '❗', '❛', '❣', '❤', '⭐', '。', '〈', '〉', '《', '》', '「', '」', '『', '』', '【', '】', '〜', '〰', '・', '㉠', '㉡', '㉢', '㉣', '㉤', '㉥', '㉦', '︎', '️', '︶', '\ufeff', '！', '（', '）', '＊', '，', '｀', '～', '｡', '･'})

df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_unwanted_special_chars)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_unwanted_special_chars)


In [93]:
# 제어 문자 제거 함수
def remove_control_chars(text):
    if text is None:
        return text
    return re.sub(r'[\x00-\x1F\x7F]', '', text)  # 제어 문자 범위 제거

# recipe 칼럼에서 제어 문자 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_control_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_control_chars)


In [94]:
# '*' 문자를 'x'로 변환하는 함수
def replace_asterisk(text):
    if text is None:
        return text
    return text.replace('*', 'x')

# recipe 칼럼에서 '*' 변환 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(replace_asterisk)

def remove_unwanted_x(text):
    if text is None:
        return text
    # 'x'가 숫자 앞에 오는 경우 (공백 포함) 를 제외하고 삭제
    return re.sub(r'\bx(?!\s?\d)', '', text)

# recipe 칼럼에서 'x' 변환 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_unwanted_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(replace_asterisk)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_unwanted_x)


In [95]:
# 이모지 제거

def remove_emojis(text):
    emoji_pattern = re.compile(
        "[\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF"
        "\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub('', text)

# recipe 칼럼에서 'x' 변환 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_emojis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_emojis)


In [96]:
def process_text(text):
    # 1. 모든 '-'를 '~'로 변경
    text = text.replace('-', '~')

    # 2. '~' 뒤에 숫자가 오거나, 공백 후 숫자가 오는 경우를 제외하고 삭제
    processed_text = re.sub(r"~(?!\s?\d)", "", text)

    return processed_text

df_cleaned['recipe'] = df_cleaned['recipe'].apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(process_text)


In [97]:
# 특정한 형식의 괄호만 제거하는 함수 정의
def clean_specific_recipe_text(text):
    # '()' 및 '(x)' 만 삭제
    cleaned_text = re.sub(r"\(\s*\)", "", text)  # 빈 괄호 또는 공백만 있는 괄호 삭제
    cleaned_text = re.sub(r"\(x\)", "", cleaned_text)  # '(x)' 삭제
    return cleaned_text.strip()

# 'recipe' 칼럼 정리 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(clean_specific_recipe_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(clean_specific_recipe_text)


In [98]:
# 자음만 있거나 모음만 있는 경우를 제거하는 함수 정의
def remove_consonant_vowel_only(text):
    # 한글 자음과 모음 유니코드 범위
    consonants = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ"
    vowels = "ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅘㅙㅚㅛㅜㅝㅞㅟㅠㅡㅢㅣ"
    
    # 정규식 패턴: 공백 또는 문장 경계에서 자음만 있는 경우 또는 모음만 있는 경우
    pattern = rf"\s*[{consonants}]+\s*|\s*[{vowels}]+\s*"

    # 패턴에 해당하는 문자 삭제
    cleaned_text = re.sub(pattern, " ", text)
    return cleaned_text.strip()

# 'recipe' 칼럼 정리 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_consonant_vowel_only)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_consonant_vowel_only)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(add_period_to_sentences)


In [100]:
# 'xx' 이상의 연속된 'x'가 포함된 단어 제거 함수 정의
def remove_multiple_x_words(text):
    # 'xx' 이상의 연속된 'x'가 포함된 단어 삭제
    cleaned_text = re.sub(r"\b\w*x{2,}\w*\b", "", text)
    return cleaned_text.strip()

# 'recipe' 칼럼에서 'xx' 이상의 연속된 'x'가 포함된 단어 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_multiple_x_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_multiple_x_words)


In [101]:
# '_', '><', '>_<' 문자 제거 함수 정의
def remove_special_chars(text):
    # 특정 문자들('_', '><', '>_<')을 제거
    cleaned_text = re.sub(r"_|><|>_<", "", text)
    return cleaned_text.strip()

# 'recipe' 칼럼에서 해당 문자 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_special_chars)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_special_chars)


In [102]:
# '. ' 또는 ' .' 형태의 공백을 제거하고 '.'을 문장과 붙이는 함수 정의
def fix_period_spacing(text):
    # '. ' 또는 ' .'을 '.'으로 변경하여 문장과 붙임
    cleaned_text = re.sub(r"\s*\.\s*", ".", text)
    return cleaned_text.strip()

# 'recipe' 칼럼에 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(fix_period_spacing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(fix_period_spacing)


In [103]:
# 다양한 형식의 URL 제거 함수 수정
def remove_all_urls_v3(text):
    # URL 패턴 (http, https, www 포함, 공백 및 변형 포함)
    url_pattern = r"\b(?:https?[:/]+|http[:/]+|www\.)\S+\b"
    
    # 해당 패턴을 삭제
    cleaned_text = re.sub(url_pattern, "", text)
    return cleaned_text.strip()

# 'recipe' 칼럼에서 URL 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_urls_v3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_urls_v3)


In [104]:
# 'http', 'https', 'www' 포함한 URL 제거 함수 수정
def remove_all_http_variants(text):
    # URL 패턴 (http, https, www 포함하여 공백 및 변형도 포함)
    url_pattern = r"\b(?:https?|http|www)[\s\S]*?\b"
    
    # 해당 패턴을 삭제
    cleaned_text = re.sub(url_pattern, "", text, flags=re.IGNORECASE)
    return cleaned_text.strip()

# 'recipe' 칼럼에서 URL 및 'http' 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_http_variants)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_http_variants)


In [105]:
# 'http', 'https', 'www' 앞에 공백을 추가한 후 제거하는 함수 수정
def remove_all_http_variants_with_space(text):
    # 'http', 'https', 'www' 앞에 공백 추가
    text = re.sub(r"(https?|http|www)", r" \1", text, flags=re.IGNORECASE)

    # URL 패턴 (http, https, www 포함하여 공백 및 변형도 포함)
    url_pattern = r"\b(?:https?|http|www)[\s\S]*?\b"
    
    # 해당 패턴을 삭제
    cleaned_text = re.sub(url_pattern, "", text, flags=re.IGNORECASE)
    return cleaned_text.strip()

# 'recipe' 칼럼에서 URL 및 'http' 제거 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_http_variants_with_space)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(remove_all_http_variants_with_space)


In [106]:
# 특수번호 문자(①, ②, ...)를 '1.', '2.' 형식으로 변환하는 함수 정의
def replace_special_numbers(text):
    # 특수번호 문자 매핑
    num_map = {
        "①": "1.", "②": "2.", "③": "3.", "④": "4.", "⑤": "5.",
        "⑥": "6.", "⑦": "7.", "⑧": "8.", "⑨": "9.", "⑩": "10."
    }
    
    # 매핑된 값으로 변환
    for key, value in num_map.items():
        text = text.replace(key, value)
    
    return text.strip()

# 'recipe' 칼럼에서 특수번호 변환 적용
df_cleaned['recipe'] = df_cleaned['recipe'].apply(replace_special_numbers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['recipe'] = df_cleaned['recipe'].apply(replace_special_numbers)


In [107]:
# 새로운 SQLite 데이터베이스 파일 경로
new_db_path = r"C:\Users\Playdata\Desktop\만개 전처리\SKN06-FINAL-6Team\임베딩전최종1.db"

# 새로운 데이터베이스에 저장
new_conn = sqlite3.connect(new_db_path)
df_cleaned.to_sql("recipes", new_conn, if_exists="replace", index=False)
conn.close()
new_conn.close()

# 저장된 파일 경로 반환
new_db_path

'C:\\Users\\Playdata\\Desktop\\만개 전처리\\SKN06-FINAL-6Team\\임베딩전최종1.db'