### song_name, artist 중복제거 데이터프레임 생성 코드

In [1]:
import json
import os
import pandas as pd

def read_json_from_directory(directory, file_name):
    # 파일 전체 경로 생성
    file_path = os.path.join(directory, file_name)

    try:
        # JSON 파일 읽기
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        song_names = data['song_info']['song_name']
        artists = data['song_info']['artist']

        # 데이터프레임 생성
        df = pd.DataFrame({'song_name': song_names, 'artist': artists})

        return df  # 데이터프레임 반환
    except Exception as e:
        print(f"Error reading {file_name}: {e}")
        return None  # 오류 발생 시 None 반환

def count_files_in_directory(directory):
    # 지정된 디렉토리의 모든 파일과 폴더 목록을 가져옴
    items = os.listdir(directory)

    # 파일만 세기 위해 디렉토리가 아닌 항목의 수를 센다
    file_count = sum(1 for item in items if os.path.isfile(os.path.join(directory, item)))

    return file_count

# 빈 데이터프레임 생성
result_df = pd.DataFrame(columns=['song_name', 'artist'])

# 함수 사용 예시
directory = r"D:\data\json\2180_re_1"

for i in range(count_files_in_directory(directory)):
    df = read_json_from_directory(directory, f'playlist{i}.json')
    if df is not None:
        result_df = pd.concat([result_df, df])
result_df = result_df.drop_duplicates().reset_index(drop=True)
# 중복이 제거된 데이터프레임 출력
print(result_df)

Error reading playlist17.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist17.json'
Error reading playlist430.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist430.json'
Error reading playlist1177.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist1177.json'
Error reading playlist1270.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist1270.json'
Error reading playlist1334.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist1334.json'
Error reading playlist1373.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist1373.json'
Error reading playlist1935.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist1935.json'
Error reading playlist2163.json: [Errno 2] No such file or directory: 'D:\\data\\json\\2180_re_1\\playlist2163.json'
                            song_name             artist
0         Sum

In [2]:
result_df.to_csv(r'D:\data\csv\result_data.csv', index=False)


### 전처리 코드(장르 일원화, 연도 정규화, 외국어 가사 처리)

In [105]:
import re
korean = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
directory = "C:\\Users\\gimge\\OneDrive\\문서\\dic_json\\unge2180"
def Pretreatmen(read_directory, save_directory):
    for i in range(count_files_in_directory(read_directory)):
        try:
            if os.path.getsize(file_path) == 0:
                print(f"Skipping empty file: {file_path}")
                continue
            with open(read_directory + f'\\playlist{i}.json', 'r', encoding='utf-8') as file:
                data = json.load(file)
            for index, g in enumerate(data['song_info']['genre']):
                if g == 'J-POP':
                    data['song_info']['lyric'][index] = re.sub(korean, '', data['song_info']['lyric'][index])
            data['song_info']['genre'] = [item.split(', ')[-1] for item in data['song_info']['genre']]
            data['song_info']['release_date'] = [f"{int(date.split('.')[0]) - 2000}.{date.split('.')[1]}.{date.split('.')[2]}" if date != '-' and len(date.split('.')) == 3 else date for date in data['song_info']['release_date']]
            with open(save_directory + f'\\playlist{i}.json', 'w', encoding='utf-8') as file:
                json.dump(data, file, ensure_ascii=False, indent=4)
        except Exception as e:
            print(f"Error processing file {i}: {e}. Skipping file.")
            continue      

In [106]:
Pretreatmen(directory, 'C:\\Users\\gimge\\OneDrive\\문서\\dic_json\\unge_re')

Error processing file 17: list indices must be integers or slices, not str. Skipping file.
Error processing file 430: list indices must be integers or slices, not str. Skipping file.
Error processing file 1177: list indices must be integers or slices, not str. Skipping file.
Error processing file 1270: list indices must be integers or slices, not str. Skipping file.
Error processing file 1334: list indices must be integers or slices, not str. Skipping file.
Error processing file 1373: list indices must be integers or slices, not str. Skipping file.
Error processing file 1935: list indices must be integers or slices, not str. Skipping file.
Error processing file 2163: list indices must be integers or slices, not str. Skipping file.


Japanese found in file: a11111111.json
Japanese found in file: playlist1004.json
Japanese found in file: playlist1010.json
Japanese found in file: playlist1018.json
Japanese found in file: playlist1020.json
Japanese found in file: playlist1038.json
Japanese found in file: playlist1055.json
Japanese found in file: playlist1067.json
Japanese found in file: playlist1083.json
Japanese found in file: playlist1099.json
Japanese found in file: playlist111.json
Japanese found in file: playlist1118.json
Japanese found in file: playlist1122.json
Japanese found in file: playlist1129.json
Japanese found in file: playlist1138.json
Japanese found in file: playlist1153.json
Japanese found in file: playlist1154.json
Japanese found in file: playlist1157.json
Japanese found in file: playlist1163.json
Japanese found in file: playlist1173.json
Japanese found in file: playlist1176.json


TypeError: list indices must be integers or slices, not str