# 커뮤니티 캐릭터 네임 크롤링

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import csv
import pandas as pd
import requests
import time
import concurrent.futures
from datetime import datetime, timedelta
import json

In [1]:
def fetch_authors_from_pages(start_page, end_page):
    authors = []
    base_url = "https://df.nexon.com/community/dnfboard/list?category=0&page="
    
    for page in range(start_page, end_page + 1):
        url = base_url + str(page)
        
        # HTTP 요청
        response = requests.get(url)
        response.encoding = response.apparent_encoding  # 인코딩 설정
        
        # HTML 파싱
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 게시글 리스트 추출
        posts = soup.find_all('a', class_='dnf_charac_name_tag')
        
        # 작성자 리스트 추출
        for post in posts:
            author = post.get('data-characname').strip()
            authors.append(author)
        
        # 보안 문제 방지를 위해 요청 간 랜덤 딜레이 추가 (5초에서 10초)
        time.sleep(random.uniform(5, 10))
    
    return authors

# 500페이지씩 쪼개서 크롤링 (1페이지부터 2000페이지까지)
chunk_size = 500
start_total_pages = 1
end_total_pages = 2000

all_author_lists = []
for start_page in range(start_total_pages, end_total_pages + 1, chunk_size):
    end_page = min(start_page + chunk_size - 1, end_total_pages)
    authors_chunk = fetch_authors_from_pages(start_page, end_page)
    all_author_lists.append(authors_chunk)
    
    # CSV 파일로 저장
    csv_filename = f'authors_{start_page}_to_{end_page}.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Author'])
        for author in authors_chunk:
            writer.writerow([author])

    print(f'Data successfully written to {csv_filename}')

# 모든 리스트를 하나로 합치기
all_authors = [author for authors_chunk in all_author_lists for author in authors_chunk]

# 중복 제거 및 알파벳 순 정렬
unique_authors = sorted(set(all_authors))

# 전체 데이터를 하나의 CSV 파일로 저장
csv_filename = 'authors_all_501_to_2000.csv'
with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Author'])
    for author in unique_authors:
        writer.writerow([author])

print(f'All data successfully written to {csv_filename}')

Data successfully written to authors_501_to_1000.csv
Data successfully written to authors_1001_to_1500.csv
Data successfully written to authors_1501_to_2000.csv
All data successfully written to authors_all_501_to_2000.csv


# API 활용 데이터 수집

## 캐릭터 검색

In [37]:
# CSV 파일 읽기
file_path = './authors_1_to_2000.csv'
df = pd.read_csv(file_path)

In [41]:
# API 키 및 기본 URL 설정
api_key = '8VDYE9JHX7Qyp7m7kGWsjsKQ50TShRt9'
base_url = 'https://api.neople.co.kr/df/servers/all/characters'

# 함수: 캐릭터 정보 가져오기
def fetch_character_info(character_name):
    params = {
        'characterName': character_name,
        'limit': 200,  # 최대값으로 설정
        'wordType': 'match',
        'apikey': api_key
    }
    
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            return response.json()
    except Exception as e:
        return None

# CSV 파일에서 모든 캐릭터 이름 추출
character_names = df['Author'].tolist()
num_parts = 10  # 분할할 부분 수
chunk_size = len(character_names) // num_parts

# 함수: 데이터 수집 및 저장
def collect_and_save_data(part_index, chunk_character_names):
    character_infos = []
    
    for name in chunk_character_names:
        info = fetch_character_info(name)
        if info:
            character_infos.append(info)
        
        # 요청 간 랜덤 딜레이 추가 (1초에서 2초)
        time.sleep(random.uniform(1, 2))
    
    # 결과를 데이터프레임으로 변환
    info_list = []
    for info in character_infos:
        if 'rows' in info:
            for character in info['rows']:
                info_list.append(character)
    
    character_df = pd.DataFrame(info_list)

    # 데이터프레임 저장 (한글 깨짐 방지)
    output_file_path = f'./character_info_part_{part_index + 1}.csv'
    character_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')
    print(f'Data successfully written to {output_file_path}')

# 병렬 처리로 데이터 수집
with concurrent.futures.ThreadPoolExecutor(max_workers=num_parts) as executor:
    futures = []
    for i in range(num_parts):
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size if i < num_parts - 1 else len(character_names)
        chunk_character_names = character_names[start_idx:end_idx]
        futures.append(executor.submit(collect_and_save_data, i, chunk_character_names))
    
    # 결과 기다리기
    concurrent.futures.wait(futures)

# 모든 CSV 파일을 하나로 합치는 과정
all_dfs = []
for i in range(num_parts):
    part_file_path = f'./character_info_part_{i + 1}.csv'
    part_df = pd.read_csv(part_file_path, encoding='utf-8-sig')
    all_dfs.append(part_df)

# 모든 데이터프레임을 하나로 합치기
final_df = pd.concat(all_dfs, ignore_index=True)

# 최종 데이터프레임 저장 (한글 깨짐 방지)
final_output_file_path = './character_info_all.csv'
final_df.to_csv(final_output_file_path, index=False, encoding='utf-8-sig')

print(f'All data successfully written to {final_output_file_path}')

Data successfully written to ./character_info_part_6.csv
Data successfully written to ./character_info_part_3.csv
Data successfully written to ./character_info_part_5.csv
Data successfully written to ./character_info_part_7.csv
Data successfully written to ./character_info_part_9.csv
Data successfully written to ./character_info_part_1.csv
Data successfully written to ./character_info_part_2.csv
Data successfully written to ./character_info_part_4.csv
Data successfully written to ./character_info_part_10.csv
Data successfully written to ./character_info_part_8.csv
All data successfully written to ./character_info_all.csv


## 타임 라인 검색

In [59]:
# CSV 파일 로드
character_info_df = pd.read_csv('./character_info_all.csv')

# 기본 설정
# 날짜 24/01/01 ~ 24/03/31
base_url = "https://api.neople.co.kr/df/servers/{}/characters/{}/timeline"
start_date = "2024-01-01 00:00"
end_date = "2024-03-31 23:59"
limit = 100
codes = "101,103,104,201"
api_key = "8VDYE9JHX7Qyp7m7kGWsjsKQ50TShRt9"

In [60]:
# 결과를 저장할 리스트
all_data = []

# 각 캐릭터에 대해 데이터 수집
for index, row in character_info_df.iterrows():
    server_id = row['serverId']
    character_id = row['characterId']
    
    url = f"{base_url.format(server_id, character_id)}?limit={limit}&code={codes}&startDate={start_date}&endDate={end_date}&apikey={api_key}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if 'timeline' in data and 'rows' in data['timeline']:
            for event in data['timeline']['rows']:
                event['serverId'] = server_id
                event['characterId'] = character_id
                all_data.append(event)
    else:
        print(f"Failed to retrieve data for character {character_id} on server {server_id}")
    
    time.sleep(0.1)  # API 호출 간 짧은 대기 시간 추가

# 수집한 데이터를 데이터프레임으로 변환
all_data_df = pd.DataFrame(all_data)

Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 3f08b2b8736c05380a86ade134bfb85a on server siroco
Failed to retrieve data for character 3f08b2b8736c05380a86ade134bfb85a on server siroco
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e

In [61]:
all_data_df.to_csv('./timeline_data_240101_240331.csv', index=False, encoding='utf-8-sig')

In [62]:
# 기본 설정
# 날짜 24/04/01 ~ 24/05/26
base_url = "https://api.neople.co.kr/df/servers/{}/characters/{}/timeline"
start_date = "2024-04-01 00:00"
end_date = "2024-05-26 23:59"
limit = 100
codes = "101,103,104,201"
api_key = "8VDYE9JHX7Qyp7m7kGWsjsKQ50TShRt9"

In [63]:
# 결과를 저장할 리스트
all_data = []

# 각 캐릭터에 대해 데이터 수집
for index, row in character_info_df.iterrows():
    server_id = row['serverId']
    character_id = row['characterId']
    
    url = f"{base_url.format(server_id, character_id)}?limit={limit}&code={codes}&startDate={start_date}&endDate={end_date}&apikey={api_key}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if 'timeline' in data and 'rows' in data['timeline']:
            for event in data['timeline']['rows']:
                event['serverId'] = server_id
                event['characterId'] = character_id
                all_data.append(event)
    else:
        print(f"Failed to retrieve data for character {character_id} on server {server_id}")
    
    time.sleep(0.1)  # API 호출 간 짧은 대기 시간 추가

# 수집한 데이터를 데이터프레임으로 변환
all_data_df = pd.DataFrame(all_data)

Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character d58c7d81c52a8e29eb5a9d4408af9e0d on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 3f08b2b8736c05380a86ade134bfb85a on server siroco
Failed to retrieve data for character 3f08b2b8736c05380a86ade134bfb85a on server siroco
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e4914f542d9dde3af8d34 on server cain
Failed to retrieve data for character 5c737d7af02e

In [64]:
all_data_df.to_csv('./timeline_data_240401_240525.csv', index=False, encoding='utf-8-sig')

## 캐릭터 & 타임라인 데이터 병합 및 전처리

### 데이터 병합

In [195]:
df1 = pd.read_csv('./timeline_data_240101_240331.csv')
df2 = pd.read_csv('./timeline_data_240401_240525.csv')
df3 = pd.read_csv('./character_info_all.csv')

In [196]:
# 타임라인 데이터 240101 ~ 240525로 병합
timeline_df = pd.concat([df1, df2], axis=0, ignore_index=True)

# 캐릭터 중복 값 제거
df3 = df3.drop_duplicates(['characterId', 'serverId'], keep='first').reset_index(drop=True)
timeline_df = timeline_df.drop_duplicates(['code', 'name', 'date', 'data', 'serverId', 'characterId'], keep='first').reset_index(drop=True)

# 캐릭터, 타임라인 데이터 병합
all_df = pd.merge(timeline_df, df3, on=['serverId', 'characterId'], how='left')

In [197]:
all_df.head(2)

Unnamed: 0,code,name,date,data,serverId,characterId,characterName,level,jobId,jobGrowId,jobName,jobGrowName,fame
0,201,레이드,2024-03-30 15:09,"{'raidName': '기계 혁명', 'raidPartyName': '일반 2인쩔...",cain,f111e939e5dadfc359dca0fe479567ae,민꾱이,110,944b9aab492c15a8474f96947ceeb9e4,c9b492038ee3ca8d27d7004cf58d59f3,거너(여),眞 스핏파이어,46324.0
1,201,레이드,2024-03-24 18:03,"{'raidName': '기계 혁명', 'raidPartyName': '고정 일반 ...",cain,f111e939e5dadfc359dca0fe479567ae,민꾱이,110,944b9aab492c15a8474f96947ceeb9e4,c9b492038ee3ca8d27d7004cf58d59f3,거너(여),眞 스핏파이어,46324.0


### json 데이터 전처리

In [198]:
# JSON데이터와 중복 컬럼 사전 명칭 변경
all_df = all_df.rename(columns = {'level' : 'latest_level', 'jobGrowId' : 'latest_jobGrowId', 'jobGrowName' : 'latest_jobGrowName'})

# code별로 데이터 분리
all_df_101 = all_df[all_df['code'] == 101].reset_index(drop=True)
all_df_103 = all_df[all_df['code'] == 103].reset_index(drop=True)
all_df_104 = all_df[all_df['code'] == 104].reset_index(drop=True)
all_df_201 = all_df[all_df['code'] == 201].reset_index(drop=True)

# 필요없는 컬럼, 중복 컬럼 제거
all_df_101.drop(columns='data', inplace=True)
all_df_103.drop(columns=['jobId','jobName'], inplace=True)

In [199]:
# 문자열을 안전하게 사전으로 변환하는 함수 정의
def safe_eval(data_str):
    if isinstance(data_str, dict):
        return data_str
    try:
        # json.loads를 사용하여 문자열을 파싱
        return json.loads(data_str.replace("'", "\""))
    except (ValueError, json.JSONDecodeError):
        return None

# data 열의 문자열을 사전으로 변환
all_df_103['data'] = all_df_103['data'].apply(safe_eval)
all_df_104['data'] = all_df_104['data'].apply(safe_eval)

# 유효한 사전 데이터만 남기기
all_df_103 = all_df_103[all_df_103['data'].notnull()]
all_df_104 = all_df_104[all_df_104['data'].notnull()]

# data 열을 개별 열로 분리
all_df_103_normalized = pd.json_normalize(all_df_103['data'])
all_df_104_normalized = pd.json_normalize(all_df_104['data'])

# 원본 데이터프레임의 인덱스를 재설정하고, data 열을 제거한 후 분리된 데이터와 병합
all_df_103.reset_index(drop=True, inplace=True)
all_df_104.reset_index(drop=True, inplace=True)

all_df_103 = pd.concat([all_df_103.drop(columns=['data']), all_df_103_normalized], axis=1)
all_df_104 = pd.concat([all_df_104.drop(columns=['data']), all_df_104_normalized], axis=1)

In [200]:
# 필요없는 열 삭제
all_df_101 = all_df_101.drop(columns=['jobId', 'latest_jobGrowId'])
all_df_103 = all_df_103.drop(columns=['jobId', 'latest_jobGrowId', 'jobGrowId']).rename(columns={'jobGrowName' : 'jobGrowName_103'})
all_df_104 = all_df_104.drop(columns=['jobId','latest_jobGrowId','level'])
all_df_201 = all_df_201.drop(columns=['data','jobId','latest_jobGrowId'])

# 모든 데이터 병합
all_df_final = pd.concat([all_df_101,all_df_103,all_df_104,all_df_201]).reset_index(drop=True)

In [203]:
all_df_final.head()

Unnamed: 0,code,name,date,serverId,characterId,characterName,latest_level,jobName,latest_jobGrowName,fame,jobGrowName_103
0,101,캐릭터 생성,2024-03-24 23:03,diregie,f22246ec6f2a32c535b4f25914cf6ebf,착호갑순이,110,아처,眞 비질란테,59293.0,
1,101,캐릭터 생성,2024-02-14 09:59,cain,da494159908bfd4bda9c5c82aedc3927,사골곰탕완샷,64,아처,드리머,724.0,
2,101,캐릭터 생성,2024-03-16 22:12,cain,114a92c82af16d3dd8d247acc6ca0986,메가코프,110,아처,眞 비질란테,27599.0,
3,101,캐릭터 생성,2024-02-10 17:38,cain,3f17f04a5a3266901218c1630b8d94aa,푸른향신료,110,마법사(여),眞 마도학자,61167.0,
4,101,캐릭터 생성,2024-02-24 17:39,cain,100216a61a84ca605a77779f4e0b0adf,배고픈류뷰,110,귀검사(여),眞 블레이드,52188.0,


In [201]:
all_df_final.to_csv('./all_df_final.csv', index=False, encoding='utf-8-sig')