<a href="https://colab.research.google.com/github/Rotifunk/TIL/blob/main/%EB%8D%B0%EC%9D%B4%EC%BD%98_%ED%95%9C%EC%86%94%EB%8D%B0%EC%BD%94_%EC%8B%9C%EC%A6%8C2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 필요한 라이브러리 설치
!pip install transformers torch accelerate
!pip install transformers
!pip install huggingface_hub
!pip install -U sentence-transformers

In [3]:
import csv
import os

import torch
from huggingface_hub import login
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Log in to the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that no secret
# has to be pasted into the notebook. When the variable is unset or empty,
# `login` receives None and asks for the token interactively instead.
login(token=os.environ.get("HF_TOKEN") or None)

# Pick the compute device first, because the weight dtype depends on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the Gemma 2B model and tokenizer.
# Half precision is only used on GPU; on CPU the weights are loaded as
# float32 because float16 inference there is unsupported or very slow,
# depending on the installed torch version.
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    device_map="auto",
)

def generate_answer(question, max_new_tokens=150):
    """Generate an answer to an interior-finishing question with the loaded model.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        question: Question text inserted into the Korean prompt template.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The text generated after the prompt, with special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = f"당신은 실내 마감재 전문가입니다. 다음 질문에 대해 간결하고 정확하게 답변해주세요.\n\n질문: {question}\n\n답변:"

    # Move the inputs to the device that actually holds the model, instead of
    # relying on a separately computed global that could disagree with the
    # placement chosen by device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
        )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Slice the prompt off by token count rather than splitting
    # the decoded text on "답변:": that marker can also occur inside the
    # question or be produced again by the model, in which case the split
    # would return the wrong fragment.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def process_csv(input_file, output_file):
    """Answer every question in a CSV file and write question/answer pairs.

    The first row of ``input_file`` is treated as a header and skipped. The
    question is read from the second column of each remaining row. The output
    file starts with the header row ``질문,답변`` followed by one row per
    answered question.

    Args:
        input_file: Path of the UTF-8 CSV file containing the questions.
        output_file: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        IndexError: If a non-empty data row has fewer than two columns.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Skip the input header. The default keeps an empty input file from
        # raising StopIteration; the output then contains only its header.
        next(reader, None)

        # Write the output header.
        writer.writerow(['질문', '답변'])

        # Show progress with tqdm.
        for row in tqdm(reader, desc="처리 중"):
            if not row:
                # csv.reader yields [] for blank lines; they carry no
                # question, so skip them instead of failing on row[1].
                continue
            question = row[1]  # the question is in the second column
            answer = generate_answer(question)
            writer.writerow([question, answer])

# Example run: answer the questions in the input CSV and save the results.
input_file, output_file = './test.csv', './answers_gemma.csv'  # input path, output path

process_csv(input_file=input_file, output_file=output_file)
print(f"처리가 완료되었습니다. 결과가 {output_file}에 저장되었습니다.")


처리 중: 130it [09:44,  4.49s/it]

처리가 완료되었습니다. 결과가 ./answers_gemma.csv에 저장되었습니다.





In [4]:
import pandas as pd

# Load the generated answers back from disk and display them.
df = pd.read_csv(filepath_or_buffer=output_file)
df

Unnamed: 0,질문,답변
0,"방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용...","방청 페인트의 종류에는 마감재료, 마감재료, 마감재료, 마감재료, 마감재료, 마감재..."
1,도배지에 녹은 자국이 발생하는 주된 원인과 그 해결 방법은 무엇인가요?,주요 원인\n\n1. <strong>제약</strong>: 녹은 자국은 흔히 도배지...
2,"큐블럭의 단점을 알려주세요. 또한, 압출법 단열판을 사용하는 것의 장점은 무엇인가요?",큐블럭은 전열판을 만들 때 사용됩니다. 큐
3,"철골구조를 사용하는 고층 건물에서, 단열 효과를 높이기 위한 시공 방법은 무엇이 있...","일반적으로 시공 중 단열재를 설치하는 것을 최우선으로 하여, 단열재를 설치하는 단계..."
4,도배지의 완전한 건조를 위해 몇 주 동안 기다려야 하나요?,도배지의 완전한 건조를 위해 몇 주 동안 기다려야 하나요? \n\n질문: 도배지의 ...
...,...,...
125,분말 소화기를 사용할 때 주의해야 할 사항은 무엇인가요? 그리고 아파트 도배 평수를...,"먼저, 다음과 같이 몇 가지 주의 사항이 있습니다.\n\n* <em>제조사의 지시에..."
126,"압출법 보온판의 가장 큰 장점은 무엇인가요?""",압출법 보온판의 가장 큰 장점
127,평지붕의 누수 문제를 방지하기 위해 수성 벽체용 탄성 방수 도료를 사용하는 것이 어...,그것은 몇 가지 장점이 있습니다.\n\n1) 탄성 방수 도료는 건조한 공간에서 쉽게...
128,석고수정이 발생하는 가장 큰 원인은 무엇인가요? 그리고 이를 해결하는 방법에 대해 ...,"<h2><b>석고수정에 대해 알아보자</b></h2>\n\n석고수정은 석고가 팽창,..."


In [24]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors
# (distiluse-base-multilingual-cased-v1).
# NOTE(review): this rebinds the global name `model`, which generate_answer
# uses for the Gemma model, so generate_answer cannot be called again after
# this cell without reloading Gemma.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Load the generated answers and the submission template.
answers_gemma_path = './answers_gemma.csv'
sample_submission_path = './sample_submission.csv'

answers_gemma = pd.read_csv(answers_gemma_path)
sample_submission = pd.read_csv(sample_submission_path)

# Replace missing answers with an empty string, then make every entry a str.
# The fill has to come first: astype(str) turns NaN into the literal text
# "nan", after which fillna('') has nothing left to replace.
answers_gemma['답변'] = answers_gemma['답변'].fillna('').astype(str)

# Helper that turns a piece of text into its embedding.
def get_embedding(text):
    """Return what the module-level ``model`` produces when encoding *text*."""
    vector = model.encode(text)
    return vector

# Encode every answer; each cell of the new column holds one embedding.
answers_gemma['embedding'] = answers_gemma['답변'].apply(get_embedding)

# Spread the embeddings into one column per dimension, row-aligned with the
# submission template, and force exactly 512 columns (missing ones are filled
# with 0, extra ones are dropped).
embedding_df = pd.DataFrame(
    answers_gemma['embedding'].to_list(),
    index=sample_submission.index,
).reindex(columns=range(512), fill_value=0)

# Copy each dimension into the matching vec_<n> column of the template.
for i, column in embedding_df.items():
    sample_submission[f'vec_{i}'] = column

# Write the filled-in template to a new CSV file.
output_path = './sample_submission_with_embeddings.csv'
sample_submission.to_csv(output_path, index=False)

print("Embeddings have been successfully saved to:", output_path)




Embeddings have been successfully saved to: ./sample_submission_with_embeddings.csv


In [25]:
# Display the submission DataFrame as this cell's output.
sample_submission

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.079321,0.068053,0.020293,0.021505,0.005051,-0.014718,0.061530,0.014531,0.026965,...,-0.030836,0.038706,-0.004742,-0.046678,-0.005876,0.047446,0.042425,-0.048727,-0.030252,0.055350
1,TEST_001,-0.060107,0.024410,0.008490,-0.008382,0.074769,0.008238,0.001181,-0.012014,-0.026650,...,-0.036318,-0.056504,0.038329,-0.048600,-0.033402,0.065517,0.029698,-0.056300,-0.042538,0.005731
2,TEST_002,-0.002061,-0.067414,-0.068355,-0.011449,-0.040189,-0.024203,-0.068569,0.016500,0.065660,...,-0.004257,-0.013580,0.112996,-0.015217,-0.012596,-0.046703,-0.047361,-0.029998,-0.010817,0.017150
3,TEST_003,0.008677,-0.005795,0.005848,0.022254,0.088467,-0.061575,-0.021868,0.038544,0.012135,...,-0.007061,-0.040608,0.041248,-0.028485,-0.029952,0.026069,-0.019170,-0.034508,-0.024013,0.039110
4,TEST_004,-0.009273,0.001615,0.012168,-0.038637,-0.008714,-0.017826,0.060809,0.042479,0.012519,...,0.051666,0.012146,0.031064,-0.002861,-0.021027,0.019970,0.025510,0.089493,-0.024055,0.007762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TEST_125,0.053426,0.013519,0.010067,-0.034738,0.023609,0.006125,-0.012804,0.044008,-0.076089,...,0.007227,-0.000020,0.015464,-0.097411,0.006401,-0.022876,-0.004396,-0.095824,-0.016352,-0.003955
126,TEST_126,0.023095,-0.051845,-0.078374,0.025385,0.086496,0.029462,0.003776,-0.025483,0.039096,...,-0.031984,-0.044880,0.028870,0.004495,0.049502,0.035716,-0.005076,-0.017957,0.046998,0.070672
127,TEST_127,-0.000146,-0.032152,0.036834,-0.009601,0.104419,-0.008617,-0.015729,-0.021183,0.010937,...,-0.071502,-0.024932,-0.042808,-0.043214,-0.016006,0.038726,0.037223,0.030717,0.004307,0.029325
128,TEST_128,0.054075,0.025163,-0.080647,-0.008204,0.097215,-0.044240,-0.022276,0.016180,0.000192,...,-0.005381,0.006300,-0.037534,-0.065639,0.028270,0.023625,0.017757,0.018945,0.004071,-0.052806
