<a href="https://colab.research.google.com/github/Soyeon250/Data-Analysis-with-Open-Source/blob/main/%EC%98%A4%ED%94%88%EC%86%8C%EC%8A%A4_%EB%8D%B0%EC%9D%B4%ED%84%B0_%EB%B6%84%EC%84%9D_14%EA%B0%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 14강 비정형 데이터 분석 : 패션 사진 데이터 활용

### 목표

- 비정형 데이터를 인공지능 모델로 분석하여 실무에서 활용 가능한 보고서 형태로 가공

- 패션 트렌드라는 구체적인 주제를 통해, 비정형 데이터 분석의 실질적인 활용 방안을 경험하고자 함


### 분석 프로세스 개요

1. 데이터 수집
  - requests를 이용한 RSS 데이터 수집
  - lxml을 이용한 XML 파싱
  - 이미지 데이터 추출
2. VLM을 이용한 이미지 분석
  - 프롬프트를 이용한 이미지 필터링
  - 프롬프트를 이용한 스타일 분석
3. LLM을 이용한 키워드 분석 및 보고서 작성
  - 텍스트 전처리
  - 색상 및 스타일 키워드 추출
  - 워드 클라우드 분석
  - 보고서 작성

# 주의 : 런타임 GPU 로 설정 필요

In [1]:
# 4bit VLM 처리를 위한 bitsandbytes 설치
# LLM 처리를 위한 VLLM 설치 (오래걸리는 작업(>5분)이므로 미리 실행!)
!pip install bitsandbytes==0.45.3 vllm==0.7.3 transformers==4.48.2
# 필요 시 세션 재시작



In [4]:
# 한글 처리를 위한 matplotlib 설정 (1)

!sudo apt-get install -y fonts-nanum
!sudo fc-cache –fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


- 런타임 -> 세션 다시 시작

In [1]:
# 한글 처리를 위한 matplotlib 설정 (2)

import matplotlib.pyplot as plt
plt.rc('font', family='NanumBarunGothic')

# 1. 데이터 수집 및 전처리

## 14-1 RSS 피드에서 이미지 URL 추출

In [2]:
import requests  # RSS 정보 가져옴
from lxml import etree  # XML 정보를 트리 형태로 변환
from lxml.html import fromstring  # HTML 정보를 트리 형태로 변환
import pandas as pd

def extract_unique_images(rss_url):
    ## 주어진 RSS 피드 URL에서 고유한 이미지 URL들을 추출하는 함수 정의
    try:
        ## requests 라이브러리를 사용하여 RSS 피드 URL로부터 내용을 가져옴
        response = requests.get(rss_url)
        ## 가져온 XML 응답 내용을 lxml의 etree.fromstring으로 파싱하여 XML 트리 root를 생성
        root = etree.fromstring(response.content)
        image_urls = set()

        ## XML 트리에서 모든 'item' 태그를 XPath를 사용하여 순회
        for item in root.xpath('//item'):
            description = item.find('description')
            if description is not None and description.text:
                ## description의 텍스트 내용을 lxml.html.fromstring으로 파싱하여 HTML 트리를 생성
                html_tree = fromstring(description.text)
                ## HTML 트리에서 첫 번째 <img> 태그의 'src' 속성 값을 XPath를 사용하여 추출
                img_url = html_tree.xpath('string(//img/@src)')
                if img_url:
                    image_urls.add(img_url)

        return list(image_urls)

    except Exception as e:
        ## 오류 발생 시 오류 메시지를 출력하고 빈 리스트를 반환
        print(f"Error occurred: {e}")
        return []

rss_url = "https://glltn.com/feed/"
## extract_unique_images 함수를 호출하여 고유한 이미지 URL들을 추출
unique_images = extract_unique_images(rss_url)

## 추출된 이미지 URL 리스트를 사용하여 'image'라는 열을 가진 pandas DataFrame을 생성
df = pd.DataFrame(unique_images, columns=["image"])

In [3]:
df

Unnamed: 0,image
0,https://glltn.com/wp-content/blogs.dir/1/files...
1,https://glltn.com/wp-content/blogs.dir/1/files...
2,https://glltn.com/wp-content/blogs.dir/1/files...
3,https://glltn.com/wp-content/blogs.dir/1/files...
4,https://glltn.com/wp-content/blogs.dir/1/files...
5,https://glltn.com/wp-content/blogs.dir/1/files...
6,https://glltn.com/wp-content/blogs.dir/1/files...
7,https://glltn.com/wp-content/blogs.dir/1/files...
8,https://glltn.com/wp-content/blogs.dir/1/files...
9,https://glltn.com/wp-content/blogs.dir/1/files...


## 14-2 수집 데이터 확인

In [4]:
from IPython.display import display, HTML

def path_to_image_html(path):
    ## 이미지 경로를 HTML img 태그로 변환하는 함수
    return f'<img src="{path}" width="300" />'

## DataFrame의 스타일을 설정하여 이미지 너비를 300px로 지정
df.style.set_table_styles([{'selector': 'img', 'props': 'width: 300px;'}])

## DataFrame을 HTML로 변환하여 출력. 이미지 열은 path_to_image_html 함수로 포맷팅
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,


## 2. VLM을 이용한 이미지 분석

## 14-3 VLM 모델 로드

In [5]:
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

## 'openbmb/MiniCPM-V-2_6-int4' 모델을 사전 훈련된 가중치와 함께 로드
## trust_remote_code=True는 허브에서 사용자 정의 코드를 실행할 수 있도록 허용
model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
## 로드된 모델에 해당하는 토크나이저를 로드
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-V-2_6-int4', trust_remote_code=True)
## 모델을 평가 모드로 설정 (드롭아웃 등 훈련 시에만 필요한 기능 비활성화)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_minicpm.py: 0.00B [00:00, ?B/s]

modeling_navit_siglip.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- configuration_minicpm.py
- modeling_navit_siglip.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_minicpmv.py: 0.00B [00:00, ?B/s]

resampler.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- modeling_minicpmv.py
- resampler.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenization_minicpmv_fast.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- tokenization_minicpmv_fast.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

MiniCPMV(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151666, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=

![](https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg)

## 14-4 이미지 질문 응답 예시

In [6]:
import io
from transformers import set_seed

## 재현성을 위해 시드(seed)를 42로 설정
set_seed(42)
## 예시 이미지 URL 정의
image_url = 'https://farm3.staticflickr.com/2677/4434956914_6e95a22940_z.jpg'
## requests로 이미지 다운로드 후 PIL Image 객체로 열고 RGB 형식으로 변환
image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
## 이미지에 대한 질문 정의
question = 'how many cats in the photo?'
## 모델 입력 형식에 맞춰 메시지 구성 (이미지와 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 이미지와 질문에 대한 응답 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 출력
print(result)

preprocessor_config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

processing_minicpmv.py: 0.00B [00:00, ?B/s]

image_processing_minicpmv.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/openbmb/MiniCPM-V-2_6-int4:
- processing_minicpmv.py
- image_processing_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


1


In [7]:
set_seed(42)
## 이미지에 대한 질문을 업데이트. 책 표지의 고양이도 포함하도록 요청
question = 'how many cats in the photo? including the books cover.'
## 모델 입력 형식에 맞춰 메시지 구성 (이전에 로드된 이미지와 업데이트된 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 업데이트된 질문에 대한 응답 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 출력
print(result)

1


In [8]:
set_seed(42)
## 이미지에 대한 질문을 'describe the photo'로 설정하여 이미지 내용을 설명하도록 요청
question = 'describe the photo'
## 모델 입력 형식에 맞춰 메시지 구성 (이전에 로드된 이미지와 설명 요청 질문 포함)
msgs = [{'role': 'user', 'content': [image, question]}]
## 모델의 chat 함수를 호출하여 이미지에 대한 설명을 생성
result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
## 모델의 응답 (이미지 설명) 출력
print(result)

The photo shows a book titled "why dogs are better than cats" with an image of a cat on the cover, resting its head on a dog's back. To the right side of the frame is a real-life cat, standing and observing the camera, which has a similar striped pattern as the one depicted on the book cover. The setting appears to be indoors, possibly on a wooden surface or floor.


The photo shows a book titled "why dogs are better than cats" with an image of a cat on the cover, resting its head on a dog's back. To the right side of the frame is a real-life cat, standing and observing the camera, which has a similar striped pattern as the one depicted on the book cover. The setting appears to be indoors, possibly on a wooden surface or floor.


## 14-5 의류 이미지 여부 판단

In [10]:
def is_picture_of_clothing(image_url):
    ## 이미지 URL이 의류 사진인지 판단하는 함수
    # 의류가 포함된 사진인지 확인하는 질문 작성 (영어로)
    question = 'Is this a picture of clothing? MUST say yes or no.'
    image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, temperature=0.1)
    print(result)
    ## 응답에 'yes'가 포함되어 있는지 확인하여 True/False 반환
    return 'yes' in result.lower()

## DataFrame의 'image' 열에 함수를 적용하여 'is_clothing' 열에 결과 저장
df['is_clothing'] = df['image'].apply(is_picture_of_clothing)

Yes, this image is of clothing. It features a person wearing a jacket and pants, which are the main subjects of the photograph. The focus on the attire suggests that it could be used for fashion-related purposes such as showcasing a particular style or brand of clothing.
Yes.
No, this image is not of clothing. It is a photograph from a book cover that features a person climbing a streetlight pole in an urban environment. The focus is on the artwork and photography rather than any fashion or clothing items.
Yes, this image is of clothing. The photograph captures a person wearing overalls, which are a type of garment typically associated with workwear or casual attire. Overalls consist of a bib and back apron fastened onto a pair of trousers by buttons or snaps and are designed to protect the wearer's clothes from dirt and wear while performing manual labor or other activities that may involve getting dirty or wet. In this context, the overalls serve as both a functional piece of clothin

## 14-6 의류 판단 결과 시각화

In [11]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing
0,,True
1,,True
2,,False
3,,True
4,,True
5,,False
6,,True
7,,True
8,,True
9,,False


## 14-7 의류 이미지 필터링

In [12]:
## 'is_clothing' 열의 값이 True인 행들만 필터링하여 DataFrame을 업데이트
df = df[df['is_clothing']]

In [13]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing
0,,True
1,,True
3,,True
4,,True
6,,True
7,,True
8,,True
10,,True
11,,True


## 14-8 의류 스타일 분석

In [14]:
def describe_style(image_url):
    ## 주어진 이미지 URL의 의류 스타일을 분석하는 함수
    question = 'Analyze the style of the clothes. Please let me explain the colors and trend changes.'
    image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    msgs = [{'role': 'user', 'content': [image, question]}]
    ## 모델의 chat 함수를 호출하여 이미지에 대한 스타일 분석 응답 생성
    result = model.chat(image=None, msgs=msgs, tokenizer=tokenizer)
    return result

## 필터링된 DataFrame의 'image' 열에 describe_style 함수를 적용
## 결과는 'style'이라는 새로운 열에 저장
df['style'] = df['image'].apply(describe_style)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['style'] = df['image'].apply(describe_style)


In [15]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

Unnamed: 0,image,is_clothing,style
0,,True,"The clothing style depicted in the image leans towards a casual, streetwear aesthetic. The olive green jacket is reminiscent of military or utility wear, which has seen a resurgence in popularity as part of contemporary fashion trends. This type of jacket is often associated with durability and practicality, but it's also been adopted by urban fashion circles for its rugged yet stylish appearance.\n\nThe brown hoodie underneath adds to the laid-back vibe, suggesting comfort and ease of movement, which are key elements in streetwear culture. Hoodies have long been a staple in casual fashion due to their versatility and ability to be layered with other garments.\n\nThe rust-colored pants contribute to an earthy color palette that is both fashionable and grounded. Earth tones like this one are often cyclical in fashion, gaining prominence during certain seasons or years before falling out of favor again. They tend to evoke a sense of naturalism and simplicity.\n\nOverall, the combination of these items reflects a trend-conscious approach to dressing that values both function and style, aligning with modern streetwear trends that prioritize comfort, individual expression, and a nod to classic, timeless pieces."
1,,True,"The style of the clothes in the image leans towards a casual, urban aesthetic. The high-necked jacket is reminiscent of streetwear, often associated with comfort and functionality while also making a fashion statement. Its dark color and simple design suggest versatility and ease of pairing with various outfits.\n\nThe checkered pattern of the inner shirt adds a layer of visual interest without being overly bold or distracting, which aligns with contemporary trends that favor subtle patterns over loud prints. This kind of layered look is popular for its ability to create depth and texture in an outfit, as well as providing additional warmth.\n\nOverall, the clothing choices reflect a modern, understated style that is both fashionable and practical, likely appealing to individuals who value comfort and individual expression through their attire."
3,,True,"The style of the clothes in the photograph is distinctly casual and functional, characteristic of workwear or utility clothing. The denim overalls are a classic piece that has seen numerous iterations throughout fashion history. Originally designed for practical use, such as farming or manual labor, they have become a staple in various subcultures and fashion movements.\n\nIn terms of color, the image is monochromatic, presenting the overalls in shades of grey and black due to the grayscale filter applied. This choice emphasizes texture and form over color, which is often used in photography to focus on the subject's features and the material quality of the clothing.\n\nRegarding trend changes, denim overalls have experienced several revivals in fashion trends. They were particularly popular in the 1970s and 1980s before experiencing another resurgence in recent years. Their enduring popularity can be attributed to their versatility, comfort, and timeless appeal. Overalls like these are often associated with a laid-back, utilitarian aesthetic that appeals to those looking for durable, practical garments that also make a statement about individuality and personal style."
4,,True,"The style of the clothes worn by the individual in the image can be characterized as utilitarian and minimalist. The color palette is dominated by muted, earthy tones such as olive green and beige, which are often associated with military or workwear aesthetics. These colors suggest functionality and durability, as they tend to hide dirt and wear well.\n\nThe cut of the jacket is loose and relaxed, with a dropped shoulder that adds to the laid-back feel. This style choice contributes to a modern, casual look that contrasts with the structured, tailored silhouettes that have been more prevalent in recent fashion trends. Instead, there's an emphasis on comfort and practicality, which is reflected in the visible stitching details and the overall construction of the garment.\n\nThe absence of embellishments and the focus on simple lines and functional design elements indicate a trend towards simplicity and understated elegance. This aligns with broader fashion movements that favor minimalism over excess, focusing on quality materials and timeless designs rather than seasonal trends.\n\nIn summary, the clothing style depicted in the image reflects a contemporary take on utilitarian fashion, emphasizing comfort, practicality, and a subdued aesthetic through its use of color and form."
6,,True,"The style of the clothes in the image is indicative of a casual and possibly vintage-inspired aesthetic. The use of suede, a material known for its soft texture and durability, suggests a preference for comfort combined with a retro look. Suede has seen revivals in fashion over the years, often associated with 1970s and 1980s styles but experiencing a resurgence in modern times as part of a broader trend towards more tactile and less polished materials.\n\nThe color palette of the shoes is neutral, featuring shades of brown and beige, which are versatile and can easily be paired with various outfits. This choice of colors aligns with a minimalist trend that favors simplicity and understated elegance. The lack of bright or flashy colors points to a timeless piece that could fit into multiple seasons and styles, making it a practical addition to any wardrobe.\n\nIn terms of trend changes, the shoes reflect a shift away from highly structured, synthetic materials commonly seen in earlier decades, moving instead toward softer, more natural-looking fabrics like suede. This reflects a larger trend in fashion where there's an emphasis on comfort, sustainability, and a return to classic designs with a modern twist."
7,,True,"The style of the clothes in the image suggests a preference for comfort and warmth, which is characteristic of winter or cold-weather fashion. The teddy bear coat and beanie are both knitwear items that have become popular due to their cozy texture and ability to provide insulation against the cold. These types of garments often evoke a sense of nostalgia and are reminiscent of childhood comforts.\n\nIn terms of color trends, brown has been a consistent choice in fashion for its versatility and natural appeal. It's a color that can easily blend with other shades, making it a practical option for layering during colder months. The choice of dark brown for this ensemble gives it an earthy tone, which is currently trending in minimalist and sustainable fashion circles where there is a shift towards more natural and subdued colors.\n\nThe overall look is understated yet functional, reflecting a modern trend toward clothing that serves as both a fashion statement and a practical solution for staying warm. This balance between aesthetics and utility is a hallmark of contemporary fashion design, particularly in seasonal collections aimed at providing warmth without sacrificing style."
8,,True,"The style of the clothes in the image leans towards a minimalist and utilitarian aesthetic, which is characterized by simplicity, functionality, and an absence of ornamental details. The color palette is monochromatic, with shades of beige and tan dominating the outfit, suggesting a preference for earthy tones that are often associated with natural materials and outdoor wear.\n\nThis choice of colors and style could indicate a trend towards eco-conscious fashion, where the emphasis is on sustainability and durability rather than fast fashion trends. The oversized nature of the jacket and pants suggests a comfort-oriented approach to clothing, aligning with contemporary fashion movements that prioritize ease of movement and a relaxed silhouette over form-fitting or restrictive designs.\n\nIn terms of trend changes, this ensemble reflects a shift away from highly saturated colors and towards more muted, neutral tones that can be versatile and easily mixed and matched. Additionally, the inclusion of layered garments like the scarf and the belted waistband adds depth and texture to the look, showcasing a trend that values layering as a means of achieving both warmth and style.\n\nOverall, the clothing in the image seems to represent a modern, sustainable, and comfortable take on contemporary fashion trends."
10,,True,"The style of the boots depicted in the image suggests a classic and possibly rugged aesthetic, often associated with outdoor or workwear. The dark brown color is versatile and can be considered timeless, as it does not easily go out of fashion. This particular shade might appeal to those looking for durable and practical footwear that can withstand various conditions.\n\nIn terms of trend changes, the design elements such as the rounded toe, the stitching pattern, and the use of leather are indicative of styles that have seen resurgence over the years. These features are commonly found in both vintage-inspired looks and contemporary casual fashion. The boots could potentially be part of a trend towards more sustainable and long-lasting clothing choices, as they appear to be made from high-quality materials that suggest durability.\n\nOverall, while the specific trendiness of these boots may vary depending on current fashion cycles, their classic design suggests they are likely to remain popular among individuals who value quality and functionality in their footwear."
11,,True,"The style of the clothes in the image leans towards casual and streetwear fashion. The use of denim, a material known for its durability and versatility, is a staple in many wardrobes due to its timeless appeal. Denim jackets are often associated with a laid-back, approachable look that can be dressed up or down depending on how they're worn.\n\nThe color palette of blue jeans and a light brown t-shirt suggests a preference for neutral and earthy tones, which are popular choices for their ability to create a calm and grounded appearance. These colors also tend to pair well together, offering a harmonious blend that is easy on the eyes.\n\nStreetwear influences are evident through the relaxed fit of the clothing, the layering technique, and the choice of accessories like the bucket hat. This type of attire has gained significant popularity over the years as it reflects a more informal and comfortable aesthetic, often inspired by urban culture and youth subcultures.\n\nIn terms of trend changes, there's a noticeable shift away from highly patterned or flashy designs towards simpler, classic pieces that emphasize comfort and functionality. The combination of denim with a basic t-shirt illustrates this trend, showcasing how everyday materials and simple garments can come together to form stylish and functional outfits."


# 3. LLM을 이용한 키워드 분석 및 보고서 작성

## 14-9 언어 모델(LLM) 로드

In [16]:
from vllm import LLM, SamplingParams

## vLLM 라이브러리를 사용하여 'LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct' 모델을 로드
## gpu_memory_utilization은 GPU 메모리 사용 비율을 0.5로 설정
## max_model_len은 모델이 처리할 수 있는 최대 토큰 길이를 10000으로 설정
llm = LLM(model='LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct', gpu_memory_utilization=0.5, max_model_len=10000)

INFO 11-12 01:54:59 __init__.py:207] Automatically detected platform cuda.


config.json: 0.00B [00:00, ?B/s]

INFO 11-12 01:55:00 config.py:2444] Downcasting torch.float32 to torch.float16.
INFO 11-12 01:55:23 config.py:549] This model supports multiple tasks: {'reward', 'score', 'classify', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 11-12 01:55:23 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.3) with config: model='LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct', speculative_config=None, tokenizer='LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

INFO 11-12 01:55:25 cuda.py:178] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 11-12 01:55:25 cuda.py:226] Using XFormers backend.
INFO 11-12 01:55:26 model_runner.py:1110] Starting to load model LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct...
INFO 11-12 01:55:27 weight_utils.py:254] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

INFO 11-12 01:57:29 weight_utils.py:270] Time spent downloading weights for LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct: 122.000424 seconds


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 11-12 01:58:14 model_runner.py:1115] Loading model weights took 4.5146 GB
INFO 11-12 01:58:17 worker.py:267] Memory profiling takes 2.57 seconds
INFO 11-12 01:58:17 worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.50) = 7.37GiB
INFO 11-12 01:58:17 worker.py:267] model weights take 4.51GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 0.98GiB; the rest of the memory reserved for KV Cache is 1.86GiB.
INFO 11-12 01:58:17 executor_base.py:111] # cuda blocks: 1625, # CPU blocks: 3495
INFO 11-12 01:58:17 executor_base.py:116] Maximum concurrency for 10000 tokens per request: 2.60x
INFO 11-12 01:58:24 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utili

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:51<00:00,  1.47s/it]

INFO 11-12 01:59:16 model_runner.py:1562] Graph capturing finished in 51 secs, took 0.21 GiB
INFO 11-12 01:59:16 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 61.84 seconds





## 14-10 색상 정보 추출

In [17]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_color(style):
  ## 주어진 스타일 설명 텍스트에서 색상을 한글로 추출하는 함수
  prompt = [
      {
          "role": "system",
          "content": "You are EXAONE model from LG AI Research, a helpful assistant."
      },
      {
          "role": "user",
          "content": f"다음의 글에서 색상을 한글로 추출해주세요. 색상 외에 다른 정보는 적지 말아주세요.\n{style}" # vlm이 작성한 글에서 색상 정보 추출, 한글로 번역하면서
      }
  ]
  ## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
  sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)
  ## LLM 모델을 사용하여 프롬프트에 대한 응답 생성
  result = llm.chat(prompt, sampling_params)[0].outputs[0].text
  print(result)
  return result

## DataFrame의 'style' 열에 extract_color 함수를 적용
## 결과는 'color'라는 새로운 열에 저장
df['color'] = df['style'].apply(extract_color)

INFO 11-12 01:59:24 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.77s/it, est. speed input: 165.08 toks/s, output: 13.00 toks/s]


- 올리브 그린 (Olive Green)
- 브라운 (Brown)
- 버건디 (Rust)


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s, est. speed input: 274.60 toks/s, output: 11.54 toks/s]


**색상:**
- 어두운 색상


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s, est. speed input: 771.93 toks/s, output: 13.64 toks/s]


회색, 검정색


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s, est. speed input: 652.70 toks/s, output: 26.19 toks/s]


**색상:**
- 올리브 그린
- 베이지


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.65it/s, est. speed input: 1432.91 toks/s, output: 23.34 toks/s]


갈색
베이지


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  4.91it/s, est. speed input: 1374.23 toks/s, output: 19.84 toks/s]


색상: 갈색


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.90it/s, est. speed input: 565.31 toks/s, output: 21.01 toks/s]


**색상:**
- 베이지
- 탄


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s, est. speed input: 627.50 toks/s, output: 17.16 toks/s]


**색상:** 어두운 갈색


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.99it/s, est. speed input: 642.88 toks/s, output: 33.94 toks/s]

**색상:**
- 파란색 (Blue)
- 갈색 (Brown)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['color'] = df['style'].apply(extract_color)


## 14-11 스타일 키워드 추출

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

def extract_color(style):
  ## 주어진 스타일 설명 텍스트에서 스타일 키워드를 한글로 추출하는 함수
  prompt = [
      {
          "role": "system",
          "content": "You are EXAONE model from LG AI Research, a helpful assistant."
      },
      {
          "role": "user",
          "content": f"" # vlm이 작성한 글에서 스타일 키워드 추출, 한글로 번역하면서
      }
  ]
  ## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
  sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=1024)
  ## LLM 모델을 사용하여 프롬프트에 대한 응답 생성
  result = llm.chat(prompt, sampling_params)[0].outputs[0].text
  print(result)
  return result

## DataFrame의 'style' 열에 extract_color 함수를 적용 (함수 이름은 이전과 동일하지만 기능 변경)
## 결과는 'keyword'라는 새로운 열에 저장
df['keyword'] = df['style'].apply(extract_color)

In [None]:
display(HTML(df.to_html(escape=False, formatters=dict(**{'image': path_to_image_html}))))

## 14-12 텍스트 데이터 정제

In [None]:
import re

def clean_text(text):
    ## 텍스트에서 특수 문자 및 HTML 태그를 제거하고 소문자로 변환하는 함수
    if isinstance(text, str):
       ## 영문, 숫자, 한글, 공백을 제외한 모든 문자 제거
       text = re.sub(r'[^a-zA-Z0-9가-힣\s]', '', text)
       ## HTML 태그 제거
       text = re.sub(r'<[^>]*>', '', text)
       ## 텍스트를 소문자로 변환
       text = text.lower()
       return text
    else:
        return ""

## 'color' 열의 텍스트 데이터 정제
df['color'] = df['color'].apply(clean_text)
## 'keyword' 열의 텍스트 데이터 정제
df['keyword'] = df['keyword'].apply(clean_text)

## 14-13 워드 클라우드 생성 및 시각화

In [None]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def get_word_count(df):
    ## DataFrame의 'color'와 'keyword' 열에서 단어 빈도를 계산하는 함수
    if not df.empty:
        ## 'color' 열의 모든 단어를 리스트로 합침
        all_nouns = df['color'].apply(str.split).sum()
        ## 'keyword' 열의 모든 단어를 추가
        all_nouns += df['keyword'].apply(str.split).sum()
        ## '색상' 단어를 제외한 모든 단어를 필터링
        all_nouns = [word for word in all_nouns if word not in ['색상']]
        ## 단어 빈도를 Counter 객체로 반환
        return Counter(all_nouns)
    return Counter() ## DataFrame이 비어있으면 빈 Counter 반환

def create_wordcloud(word_count):
    ## 단어 빈도수를 기반으로 워드 클라우드를 생성하고 시각화하는 함수
    if not word_count: ## 단어 빈도가 없으면 워드클라우드 생성하지 않음
        print("No words to generate word cloud.")
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='viridis',
        font_path='/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf' ## 한글 폰트 경로 지정
        ).generate_from_frequencies(word_count)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off") ## 축 표시 제거
    plt.show() ## 워드 클라우드 출력

## DataFrame에서 단어 빈도 계산
word_count = get_word_count(df)
## 계산된 단어 빈도로 워드 클라우드 생성 및 시각화
create_wordcloud(word_count)

## 14-14 트렌드 분석 보고서 생성 프롬프트 구성 및 실행

## 14-15 분석 보고서 시각화

In [None]:
from vllm import SamplingParams ## SamplingParams 임포트가 필요

## 시스템 메시지로 시작하는 프롬프트 리스트 초기화
prompt = [
    {
        "role": "system",
        "content": "You are EXAONE model from LG AI Research, a helpful assistant."
    }
]
## DataFrame의 각 행을 순회하며 '스타일 노트'와 '이미지 URL'을 사용자 메시지로 추가
for row in df.itertuples():
  prompt.append({"role": "user", "content": f""})
## 마지막으로, 종합적인 트렌드 분석 보고서 작성을 요청하는 사용자 메시지 추가
## 보고서 제목, 내용의 전문성, 마크다운 형식, 예시 이미지 포함을 지시
prompt.append({"role": "user", "content": ""})

## 샘플링 파라미터 설정 (온도, top_p, 최대 토큰 수)
sampling_params = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=4096)
## LLM 모델을 사용하여 구성된 프롬프트에 대한 응답 생성
result = llm.chat(prompt, sampling_params)[0].outputs[0].text

In [None]:
from IPython.display import display, Markdown

## LLM으로부터 생성된 결과(Markdown 형식의 보고서)를 Jupyter 환경에 표시
display(Markdown(result))