# 광고평가 서술형 응답을 대분류/소분류로 분류하면서 긍정/부정 토픽(=명사)을 추출하자
### Engineering 예외처리 추가
1. Q15 응답이 (앞뒤 공백 제거 후) 3글자 이하인 경우 분류하지 않고 'Not enough data'로 대체


In [1]:
import sys
import os
import anthropic
import time
import json
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from tqdm import tqdm

# Load variables from a .env file, then read the Anthropic API key from the environment.
load_dotenv()
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

# Module-level API client, created once with the key read above.
client = anthropic.Anthropic(api_key=anthropic_api_key)

# Daily request cap (example value), request counter, and run start timestamp.
MAX_DAILY_REQUESTS = 5000
request_count = 0
start_time = datetime.now()




In [2]:
def open_file(filename):
    """Return the text content of a file in the current working directory.

    The file is decoded as UTF-8 (a leading byte-order mark, if present, is
    dropped) so the result does not depend on the platform's default encoding.

    Args:
        filename: File name or path relative to the current working directory.

    Returns:
        The file content as a string, or None when the file is missing,
        cannot be read, or is not valid UTF-8. A message is printed in each
        of those failure cases.
    """
    file_path = os.path.join(os.getcwd(), filename)
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found in the current directory.")
        return None
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError is not an OSError, so it must be listed
        # explicitly to keep the "return None on unreadable file" contract.
        print(f"Error: Unable to read file '{filename}'.")
        return None


def analyze_responses(responses):
    """Classify one survey response and extract its topics via the Anthropic API.

    The classification criteria are read from ``class.txt`` and sent as a
    system block marked with ``cache_control: ephemeral``; the response text
    is embedded in the user prompt.

    Args:
        responses: The free-text survey response to classify.

    Returns:
        The ``content`` attribute of the API message, or None when
        ``class.txt`` cannot be read.

    Raises:
        SystemExit: When the API reports a rate-limit error with status 429.
        anthropic.InternalServerError: When the server error persists after
            the retry attempts are exhausted.
        Exception: Any other error from the request is printed and re-raised.
    """
    # Load the classification criteria that go into the cached system block.
    class_content = open_file("class.txt")

    if class_content is None:
        print("!!!!!Unable to proceed due to cach file reading error.")
        # Nothing to classify against; give up on this response.
        return None

    prompt = f"""<context>You need to classify the survey response data of outdoor clothing ads into minor and major classifications, and extract positive and negative keywords for ad evaluation analysis from the response text. </context>

    <response text>
    {responses}
    </response text>

    <instruction>
    1. The response text classification is classified by reference to the <Classification> as a whole. 
    2. <Classification> consists of [Major Classification],[Minor Classification],[Reason for Classification]
    3. when classifying response text, classify the major classification first, and then classify the minor classification corresponding to the major classification.
    4. Extract positive and negative topics from the response text for ad evaluation analysis.
    5. Positive and negative topics should be extracted by noun.
    6. If there are multiple extracted keywords, they are separated by |.
    6. The output should be a CSV in the following format
        Major Classification, Minor Classification, Positive topics, Negative topics
    7. All output languages should be in your language
    8. Please print out the only result without the title
    8. Please Create accurate and reliable results for insightful ad survey analysis
     Note: Due to the CSV format, commas (,) should only be used as field separators, and if there are commas in the content, please change them to spaces
    </instruction>

    """

    # Retry only server-side errors. Rate-limit errors are deliberately not
    # retried because the handler below terminates the program on a 429.
    # reraise=True surfaces the original exception once attempts run out.
    @retry(
        stop=stop_after_attempt(5),
        wait=wait_exponential(multiplier=1, min=4, max=60),
        retry=retry_if_exception_type(anthropic.InternalServerError),
        reraise=True,
    )
    def _create_message():
        return client.beta.prompt_caching.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=4000,
            temperature=0.4,
            system=[
                {
                    "type": "text",
                    "text": "From now on, you are a video ad evaluation survey analytics expert, and you need to analyse the narrative ad evaluations that have been answered. The main thing you need to do is to classify the response text and extract positive and negative keywords."
                },
                {
                    "type": "text",
                    "text": "The classification criteria you should use when working with classifications. <Classification>" + class_content + "</Classification>",
                    "cache_control": {"type": "ephemeral"}
                }
            ],
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )

    try:
        message = _create_message()
        return message.content
    except anthropic.RateLimitError as e:
        print(f"요청 제한 초과: {e}")
        if e.status_code == 429:
            print("429 에러 발생. 프로그램을 종료합니다.")
            sys.exit(1)
        raise
    except anthropic.InternalServerError as e:
        print(f"내부 서버 오류 발생: {e}")
        raise
    except Exception as e:
        print(f"예상치 못한 오류 발생: {e}")
        raise


In [3]:
def read_csv(file_path):
    """Load a UTF-8 (BOM-tolerant) CSV file into a DataFrame.

    Quote characters are not treated specially (``quoting=3``), and malformed
    rows produce a warning instead of an error (``on_bad_lines='warn'``).
    If the first attempt raises, the error is printed and the file is read
    again with the Python engine and automatic separator detection.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed pandas DataFrame.
    """
    base_options = {
        'encoding': 'utf-8-sig',
        'quoting': 3,
        'on_bad_lines': 'warn',
    }
    try:
        frame = pd.read_csv(file_path, **base_options)
    except Exception as e:
        print(f"CSV 파일 읽기 중 오류 발생: {e}")
        # Second attempt with more lenient parsing options.
        frame = pd.read_csv(file_path, sep=None, engine='python', **base_options)
    return frame

def process_response(analysis):
    """Parse the model's one-line CSV answer into classification fields.

    This is pure string parsing: it performs no API calls, so no retry
    policy is attached to it.

    Args:
        analysis: Either a non-empty list whose first element carries the
            answer (its ``text`` attribute is used when present, otherwise
            its string form), or any other value, which is converted with
            ``str``.

    Returns:
        A dict with the keys 'Major Classification', 'Minor Classification',
        'Positive topics' and 'Negative topics' (values stripped of
        surrounding whitespace), or None when the text does not split into
        exactly four comma-separated fields.
    """
    if isinstance(analysis, list) and analysis:
        first_block = analysis[0]
        text = first_block.text if hasattr(first_block, 'text') else str(first_block)
    else:
        text = str(analysis)

    column_names = (
        'Major Classification',
        'Minor Classification',
        'Positive topics',
        'Negative topics',
    )
    fields = [field.strip() for field in text.split(',')]
    if len(fields) != len(column_names):
        return None
    return dict(zip(column_names, fields))

def process_batch(batch, start_index):
    """Label every row of a batch and return one result dict per row.

    Each result carries the row's original columns plus the four label
    columns. Rows whose Q15 answer is missing or has three or fewer
    characters after stripping are labelled 'Not enough data' without
    calling the API. Other rows are sent to ``analyze_responses`` and the
    answer is parsed with ``process_response``; an unparsable answer yields
    empty labels and an exception yields 'Error' labels.

    Args:
        batch: DataFrame slice containing the columns ID, wave, IDX, Q1, Q2,
            Q2_R, ad_type, Likert and Q15.
        start_index: Zero-based position of the batch's first row in the
            full data set; used only for the row number in error messages.

    Returns:
        A list of dicts, one per row, in row order.
    """
    passthrough_columns = (
        'ID', 'wave', 'IDX', 'Q1', 'Q2', 'Q2_R', 'ad_type', 'Likert', 'Q15',
    )
    results = []

    # Use the position within the batch rather than the DataFrame label:
    # the label is already absolute, so adding it to start_index would
    # double-count the offset in the reported row number.
    for offset, (_, row) in enumerate(batch.iterrows()):
        result = {column: row[column] for column in passthrough_columns}

        raw_answer = row['Q15']
        # A missing answer arrives as NaN (a float); treat it as empty text
        # instead of letting .strip() raise and mislabel the row as 'Error'.
        answer = '' if pd.isna(raw_answer) else str(raw_answer)
        api_called = False

        if len(answer.strip()) <= 3:
            labels = {
                'Major Classification': 'Not enough data',
                'Minor Classification': 'Not enough data',
                'Positive topics': '',
                'Negative topics': '',
            }
        else:
            api_called = True
            try:
                parsed = process_response(analyze_responses(answer))
                labels = parsed or {
                    'Major Classification': '',
                    'Minor Classification': '',
                    'Positive topics': '',
                    'Negative topics': '',
                }
            except Exception as e:
                print(f"행 {start_index + offset + 1} 처리 중 오류 발생: {e}")
                labels = {
                    'Major Classification': 'Error',
                    'Minor Classification': 'Error',
                    'Positive topics': '',
                    'Negative topics': '',
                }

        result.update(labels)
        results.append(result)

        # Pause 2 seconds between API requests; rows that made no request
        # do not need the delay.
        if api_called:
            time.sleep(2)

    return results

def save_checkpoint(current_index, all_results):
    """Persist progress to ``checkpoint.json`` in the working directory.

    The data is written to a temporary file first and then moved over the
    checkpoint, so an interruption during the write cannot leave a
    truncated ``checkpoint.json`` behind.

    Args:
        current_index: Row index at which processing should resume.
        all_results: List of result dicts accumulated so far; must be
            JSON-serializable.
    """
    checkpoint = {
        'current_index': current_index,
        'results': all_results
    }
    temp_path = 'checkpoint.json.tmp'
    with open(temp_path, 'w', encoding='utf-8') as f:
        json.dump(checkpoint, f, ensure_ascii=False, indent=2)
    # os.replace overwrites the destination in a single rename step.
    os.replace(temp_path, 'checkpoint.json')

def load_checkpoint():
    """Read ``checkpoint.json`` from the working directory.

    Returns:
        The decoded JSON content, or None when the file does not exist.
    """
    try:
        checkpoint_file = open('checkpoint.json', 'r', encoding='utf-8')
    except FileNotFoundError:
        return None
    with checkpoint_file:
        return json.load(checkpoint_file)

def main():
    """Label the input CSV batch by batch, checkpointing after every batch.

    Reads the input file, resumes from ``checkpoint.json`` when one exists,
    and for each batch of rows appends the labelled results, rewrites the
    output CSV with everything collected so far, and saves a checkpoint
    pointing at the next batch.
    """
    # Re-run file for rows that were missing from an earlier run.
    input_file = 'data_ad_total_remain.csv'
    output_file = "[labeling]" + input_file
    batch_size = 50

    df = read_csv(input_file)

    # Shape of the loaded data.
    print(df.shape)

    # Shape of the data that will actually be processed (any row
    # subsetting of df would go between the two prints).
    print(df.shape)

    checkpoint = load_checkpoint()
    if not checkpoint:
        start_index, all_results = 0, []
    else:
        start_index = checkpoint['current_index']
        all_results = checkpoint['results']
        print(f"체크포인트에서 재개: 인덱스 {start_index}")

    row_count = len(df)
    # Ceiling division: number of batches still to process.
    total_batches = (row_count - start_index + batch_size - 1) // batch_size

    with tqdm(total=total_batches, desc="Overall Progress") as pbar:
        for batch_start in range(start_index, row_count, batch_size):
            batch_end = batch_start + batch_size
            all_results.extend(
                process_batch(df.iloc[batch_start:batch_end], batch_start)
            )

            # Save the interim results and the checkpoint for the next batch.
            pd.DataFrame(all_results).to_csv(
                output_file, index=False, encoding='utf-8-sig'
            )
            save_checkpoint(batch_end, all_results)
            print(f"중간 결과 및 체크포인트가 저장되었습니다.")

            pbar.update(1)
            time.sleep(5)  # Pause 5 seconds between batches.

    print(f"분석이 완료되었습니다. 최종 결과가 {output_file}에 저장되었습니다.")


if __name__ == "__main__":
    main()



Skipping line 20: expected 11 fields, saw 14
Skipping line 27: expected 11 fields, saw 12
Skipping line 34: expected 11 fields, saw 12
Skipping line 35: expected 11 fields, saw 12
Skipping line 36: expected 11 fields, saw 12
Skipping line 41: expected 11 fields, saw 12
Skipping line 44: expected 11 fields, saw 14
Skipping line 46: expected 11 fields, saw 14
Skipping line 50: expected 11 fields, saw 13
Skipping line 53: expected 11 fields, saw 12
Skipping line 54: expected 11 fields, saw 12
Skipping line 61: expected 11 fields, saw 12
Skipping line 62: expected 11 fields, saw 13
Skipping line 66: expected 11 fields, saw 12
Skipping line 70: expected 11 fields, saw 14
Skipping line 71: expected 11 fields, saw 12
Skipping line 74: expected 11 fields, saw 12
Skipping line 78: expected 11 fields, saw 12
Skipping line 81: expected 11 fields, saw 15
Skipping line 82: expected 11 fields, saw 12
Skipping line 87: expected 11 fields, saw 12
Skipping line 88: expected 11 fields, saw 12
Skipping l

(5009, 11)
(5009, 11)


Overall Progress:   1%|          | 1/101 [03:21<5:35:43, 201.43s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   2%|▏         | 2/101 [06:48<5:37:21, 204.46s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   3%|▎         | 3/101 [10:24<5:43:13, 210.14s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   4%|▍         | 4/101 [13:45<5:33:28, 206.27s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   5%|▍         | 5/101 [17:11<5:30:06, 206.32s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   6%|▌         | 6/101 [20:12<5:12:47, 197.56s/it]

중간 결과 및 체크포인트가 저장되었습니다.


Overall Progress:   6%|▌         | 6/101 [22:56<6:03:15, 229.42s/it]

요청 제한 초과: Error code: 429 - {'type': 'error', 'error': {'type': 'rate_limit_error', 'message': 'Number of request tokens has exceeded your daily rate limit (https://docs.anthropic.com/en/api/rate-limits); see the response headers for current usage. Please reduce the prompt length or the maximum tokens requested, or try again later. You may also contact sales at https://www.anthropic.com/contact-sales to discuss your options for a rate limit increase.'}}
429 에러 발생. 프로그램을 종료합니다.





AttributeError: 'tuple' object has no attribute 'tb_frame'