In [1]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of GPUs:", torch.cuda.device_count())

if torch.cuda.is_available():
    print("Current device:", torch.cuda.current_device())
    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

PyTorch version: 2.5.1+cu118
CUDA available: True
CUDA version: 11.8
Number of GPUs: 1
Current device: 0
Device name: NVIDIA GeForce RTX 4050 Laptop GPU


##### Chunking (청킹)
- LLM에서 Chunking이란 긴 텍스트 데이터를 더 작은 단위로 나누는 과정을 의미한다.
- chunking의 필요성  
    1. 입력 데이터의 크기에 대한 제한
    2. 처리 시간 단축

- chunk_size: 한 단락
- chunk_overlap: 두 단락 사이의 겹치는 부분

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

text = "Content filtering is a process involving the use of software or hardware to screen and/or restrict access to objectionable email,\
webpages, executables and other suspicious items. Companies often use content-based filtering, also known as information filtering, as part of their internet firewalls.\
A common security measure, content filtering helps companies execute corporate policies on the use of information systems -- \
for example, the filtering and blocking of employee access to social media platforms.\
Additionally, parents often use web filtering to screen and/or exclude content their children have access to from a home computer.\
Filtering software can screen content for anything that is objectionable or criminal, including online porn, hate sites, illegal content and social media.\
However, one drawback of content filtering programs is that it is easy to unintentionally block access to content that should not be blocked."

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
chunk_size = 26     # 길이
chunk_overlap = 4       # 겹치는 부분

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [4]:
print(c_splitter.split_text(text))

['Content filtering is a process involving the use of software or hardware to screen and/or restrict access to objectionable email,webpages, executables and other suspicious items. Companies often use content-based filtering, also known as information filtering, as part of their internet firewalls.A common security measure, content filtering helps companies execute corporate policies on the use of information systems -- for example, the filtering and blocking of employee access to social media platforms.Additionally, parents often use web filtering to screen and/or exclude content their children have access to from a home computer.Filtering software can screen content for anything that is objectionable or criminal, including online porn, hate sites, illegal content and social media.However, one drawback of content filtering programs is that it is easy to unintentionally block access to content that should not be blocked.']


In [5]:
print(r_splitter.split_text(text))

['Content filtering is a', 'a process involving the', 'the use of software or', 'or hardware to screen', 'and/or restrict access to', 'to objectionable', 'email,webpages,', 'executables and other', 'suspicious items.', 'Companies often use', 'use content-based', 'filtering, also known as', 'as information filtering,', 'as part of their internet', 'firewalls.A common', 'security measure, content', 'filtering helps companies', 'execute corporate', 'policies on the use of', 'of information systems --', '-- for example, the', 'the filtering and', 'and blocking of employee', 'access to social media', 'platforms.Additionally,', 'parents often use web', 'web filtering to screen', 'and/or exclude content', 'their children have', 'access to from a home', 'computer.Filtering', 'software can screen', 'content for anything that', 'is objectionable or', 'or criminal, including', 'online porn, hate sites,', 'illegal content and', 'and social media.However,', 'one drawback of content', 'filtering pro

In [6]:
chunk_size = 1
chunk_overlap = 0

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

r_splitter.split_text(text)

['C',
 'o',
 'n',
 't',
 'e',
 'n',
 't',
 ' ',
 'f',
 'i',
 'l',
 't',
 'e',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'i',
 's',
 ' ',
 'a',
 ' ',
 'p',
 'r',
 'o',
 'c',
 'e',
 's',
 's',
 ' ',
 'i',
 'n',
 'v',
 'o',
 'l',
 'v',
 'i',
 'n',
 'g',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'u',
 's',
 'e',
 ' ',
 'o',
 'f',
 ' ',
 's',
 'o',
 'f',
 't',
 'w',
 'a',
 'r',
 'e',
 ' ',
 'o',
 'r',
 ' ',
 'h',
 'a',
 'r',
 'd',
 'w',
 'a',
 'r',
 'e',
 ' ',
 't',
 'o',
 ' ',
 's',
 'c',
 'r',
 'e',
 'e',
 'n',
 ' ',
 'a',
 'n',
 'd',
 '/',
 'o',
 'r',
 ' ',
 'r',
 'e',
 's',
 't',
 'r',
 'i',
 'c',
 't',
 ' ',
 'a',
 'c',
 'c',
 'e',
 's',
 's',
 ' ',
 't',
 'o',
 ' ',
 'o',
 'b',
 'j',
 'e',
 'c',
 't',
 'i',
 'o',
 'n',
 'a',
 'b',
 'l',
 'e',
 ' ',
 'e',
 'm',
 'a',
 'i',
 'l',
 ',',
 'w',
 'e',
 'b',
 'p',
 'a',
 'g',
 'e',
 's',
 ',',
 ' ',
 'e',
 'x',
 'e',
 'c',
 'u',
 't',
 'a',
 'b',
 'l',
 'e',
 's',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'o',
 't',
 'h',
 'e',
 'r',
 ' ',
 's',
 'u',
 's',
 'p',
 'i',
 'c'

In [7]:
chunk_size = 2
chunk_overlap = 1

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

r_splitter.split_text(text)

['Co',
 'on',
 'nt',
 'te',
 'en',
 'nt',
 'f',
 'fi',
 'il',
 'lt',
 'te',
 'er',
 'ri',
 'in',
 'ng',
 'i',
 'is',
 'a',
 'p',
 'pr',
 'ro',
 'oc',
 'ce',
 'es',
 'ss',
 'i',
 'in',
 'nv',
 'vo',
 'ol',
 'lv',
 'vi',
 'in',
 'ng',
 't',
 'th',
 'he',
 'u',
 'us',
 'se',
 'o',
 'of',
 's',
 'so',
 'of',
 'ft',
 'tw',
 'wa',
 'ar',
 're',
 'o',
 'or',
 'h',
 'ha',
 'ar',
 'rd',
 'dw',
 'wa',
 'ar',
 're',
 't',
 'to',
 's',
 'sc',
 'cr',
 're',
 'ee',
 'en',
 'a',
 'an',
 'nd',
 'd/',
 '/o',
 'or',
 'r',
 're',
 'es',
 'st',
 'tr',
 'ri',
 'ic',
 'ct',
 'a',
 'ac',
 'cc',
 'ce',
 'es',
 'ss',
 't',
 'to',
 'o',
 'ob',
 'bj',
 'je',
 'ec',
 'ct',
 'ti',
 'io',
 'on',
 'na',
 'ab',
 'bl',
 'le',
 'e',
 'em',
 'ma',
 'ai',
 'il',
 'l,',
 ',w',
 'we',
 'eb',
 'bp',
 'pa',
 'ag',
 'ge',
 'es',
 's,',
 'e',
 'ex',
 'xe',
 'ec',
 'cu',
 'ut',
 'ta',
 'ab',
 'bl',
 'le',
 'es',
 'a',
 'an',
 'nd',
 'o',
 'ot',
 'th',
 'he',
 'er',
 's',
 'su',
 'us',
 'sp',
 'pi',
 'ic',
 'ci',
 'io',
 'ou',
 

##### chunking 유형
1. CharacterTextSplitter()  
    - 단순하게, 주어진 텍스트를 문자 단위로 분할한다.  
2. RecursiveCharacterTextSplitter()
    - 문서를 재귀적으로 분할한다.
    - 가장 먼저 \n\n (큰 덩어리)로 표시된 2줄 띄어쓰기 부분을 먼저 나눈다.  
    이후, 한 줄 띄어쓰기로 표시된 문장을 청크로 분리한다.  
    마지막으로 Character로 분리하기 때문에 더 세밀하게 chunking이 된다.
3. TokenTextSplitter()
    - 문자를 기준으로 텍스트를 분할하는 것이 아닌, 토큰 수를 기준으로 분할하는 방식.
    - 토큰은 일반적으로 최대 4자로 구성된다. 

##### chunking 고려사항
1. 문맥 손실
2. 청크 경계 처리: 청크를 나누는 과정에서 문장이 잘리지 않도록 주의한다.
- 해결방안
    - 청크 경계가 자연스러운 언어 단위(예: 문장, 문단)에 맞춰지도록 조절하고, 필요한 경우 청크 간에 일부 중복을 허용한다.
    - 적절한 chunk_size와 chunk_overlap값 찾기
    - 적절한 splitter 방법 찾기 (데이터와 서비스에 따라 방법이 다를 수 있다.)

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
chunk_size = 20     # 길이
chunk_overlap = 4       # 겹치는 부분

r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [9]:
# Recusive Splitter
text1 = '청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요?'
r_splitter.split_text(text1)

['청킹을 위한 예제 데이터입니다.', '어떻게 쪼개지는지 알아볼까요?']

In [10]:
# Character Splitter
text2 = '청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요? 결과를 확인해봅시다.'
c_splitter.split_text(text2)

['청킹을 위한 예제 데이터입니다. 어떻게 쪼개지는지 알아볼까요? 결과를 확인해봅시다.']

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

some_text = "Content filtering is a process involving the use of software or hardware to screen and/or restrict access to objectionable email,\
webpages, executables and other suspicious items. Companies often use content-based filtering, also known as information filtering, as part of their internet firewalls.\
A common security measure, content filtering helps companies execute corporate policies on the use of information systems -- \n\n \
for example, the filtering and blocking of employee access to social media platforms.\
Additionally, parents often use web filtering to screen and/or exclude content their children have access to from a home computer.\n\n \
Filtering software can screen content for anything that is objectionable or criminal, including online porn, hate sites, illegal content and social media.\
However, one drawback of content filtering programs is that it is easy to unintentionally block access to content that should not be blocked."

In [16]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator= ' '
)

c_splitter.split_text(some_text)

['Content filtering is a process involving the use of software or hardware to screen and/or restrict access to objectionable email,webpages, executables and other suspicious items. Companies often use content-based filtering, also known as information filtering, as part of their internet firewalls.A common security measure, content filtering helps companies execute corporate policies on the use of information systems -- \n\n for example, the',
 'filtering and blocking of employee access to social media platforms.Additionally, parents often use web filtering to screen and/or exclude content their children have access to from a home computer.\n\n Filtering software can screen content for anything that is objectionable or criminal, including online porn, hate sites, illegal content and social media.However, one drawback of content filtering programs is that it is easy to unintentionally block',
 'access to content that should not be blocked.']

In [19]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separators = ["\n\n", "\n", " ", ""]
)

r_splitter.split_text(some_text)

['Content filtering is a process involving the use of software or hardware to screen and/or restrict access to objectionable email,webpages, executables and other suspicious items. Companies often use content-based filtering, also known as information filtering, as part of their internet firewalls.A common security measure, content filtering helps companies execute corporate policies on the use of information systems --',
 'for example, the filtering and blocking of employee access to social media platforms.Additionally, parents often use web filtering to screen and/or exclude content their children have access to from a home computer.',
 'Filtering software can screen content for anything that is objectionable or criminal, including online porn, hate sites, illegal content and social media.However, one drawback of content filtering programs is that it is easy to unintentionally block access to content that should not be blocked.']

In [23]:
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

text1 = "Border Collies are the typical breed most people think of when asked to name an intelligent dog breed. They are commonly used in dog sports, agility, as well as the more traditional herding and stockwork that they were originally bred to do."

text_splitter.split_text(text1)

['Border',
 ' Coll',
 'ies',
 ' are',
 ' the',
 ' typical',
 ' breed',
 ' most',
 ' people',
 ' think',
 ' of',
 ' when',
 ' asked',
 ' to',
 ' name',
 ' an',
 ' intelligent',
 ' dog',
 ' breed',
 '.',
 ' They',
 ' are',
 ' commonly',
 ' used',
 ' in',
 ' dog',
 ' sports',
 ',',
 ' agility',
 ',',
 ' as',
 ' well',
 ' as',
 ' the',
 ' more',
 ' traditional',
 ' her',
 'ding',
 ' and',
 ' stock',
 'work',
 ' that',
 ' they',
 ' were',
 ' originally',
 ' bred',
 ' to',
 ' do',
 '.']

In [24]:
from langchain.text_splitter import CharacterTextSplitter
chunk_size = 20
chunk_overlap = 5

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    model_name="gpt-3.5-turbo"
)

In [35]:
from langchain.document_loaders import PyPDFLoader

# PDF 가져오기
loaders = [
    PyPDFLoader("data/gpt3(Language Models are Few-Shot Learners).pdf")
]

docs = []
for loader in loaders:
    docs.extend(loader.load())

In [36]:
doc = splitter.split_documents(docs)

In [37]:
# 첫 번째 문서의 분할된 청크의 크기와 문서의 내용을 각각 출력
print(len(doc[0].page_content))
doc[0]

2425


Document(metadata={'source': 'data/gpt3(Language Models are Few-Shot Learners).pdf', 'page': 0}, page_content='Language Models are Few-Shot Learners\nTom B. Brown∗ Benjamin Mann∗ Nick Ryder∗ Melanie Subbiah∗\nJared Kaplan† Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry\nAmanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan\nRewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter\nChristopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray\nBenjamin Chess Jack Clark Christopher Berner\nSam McCandlish Alec Radford Ilya Sutskever Dario Amodei\nOpenAI\nAbstract\nRecent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training\non a large corpus of text followed by ﬁne-tuning on a speciﬁc task. While typically task-agnostic\nin architecture, this method still requires task-speciﬁc ﬁne-tuning datasets of thousands or tens of\nthousands of examples. By contrast, humans can generally perform a new lan