In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Switch the working directory to the project folder; later cells read files
# with paths relative to it (e.g. 'coding/...', 'song_lyrics/...').
%cd /content/gdrive/MyDrive/bit_conference/

/content/gdrive/.shortcut-targets-by-id/1YDrmXvwQeDTF3AVegVo_-qlULY2-1-qE/bit_conference


### 데이터프레임 병합

In [None]:
# Standard data processing libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Load the tcc_ceds_music song table; the bare name on the last line displays it.
df_tcc = pd.read_csv('song_lyrics/tcc_ceds_music.csv')
df_tcc

In [None]:
# Load the cleaned English lyrics table and keep only rows with year <= 2019
# and at least 20,000 views.
df_en_cleaned_large = pd.read_csv('coding/df_en_cleaned_large.csv')
# .copy() detaches the filtered rows from the unfiltered frame, so the later
# cell that adds the clean_* columns does not raise SettingWithCopyWarning.
df_en_cleaned_large = df_en_cleaned_large[
    (df_en_cleaned_large['year'] <= 2019) & (df_en_cleaned_large['views'] >= 20000)
].copy()
df_en_cleaned_large

In [None]:
# 필요한 라이브러리 다시 불러오기 (세션 리셋됨)
import pandas as pd
import re
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# 정리 함수 정의
def clean_text(text):
    """Normalize a title or artist name so that two spellings can be compared.

    The value is lowercased, parenthesized segments are removed, every
    character other than a-z, 0-9 and whitespace is dropped, and runs of
    whitespace are collapsed to a single space.

    Args:
        text: Raw value. May be a string, a missing value (None/NaN), or
            another scalar such as a number.

    Returns:
        The normalized string, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    # Coerce first: a purely numeric value may arrive as an int/float, which
    # has no .lower().
    text = str(text).lower()
    text = re.sub(r'\([^)]*\)', '', text)  # remove parenthesized content
    text = re.sub(r'[^a-z0-9\s]', '', text)  # keep only a-z, 0-9 and whitespace
    # Removing a segment from the middle of a name leaves a doubled space,
    # which would break exact comparisons; collapse whitespace runs and trim.
    return " ".join(text.split())

# Add normalized ("clean_*") versions of the title and artist columns to both tables.
for frame, raw_col, clean_col in (
    (df_tcc, 'track_name', 'clean_track_name'),
    (df_tcc, 'artist_name', 'clean_artist_name'),
    (df_en_cleaned_large, 'title', 'clean_title'),
    (df_en_cleaned_large, 'artist', 'clean_artist'),
):
    frame[clean_col] = frame[raw_col].apply(clean_text)

# 병렬 매칭 함수 정의
def match_rows(row_tcc, df_en=None):
    """Pair one df_tcc row with every candidate row whose title and artist both match.

    Two cleaned strings match when either one is a substring of the other.
    Empty cleaned strings never match: "" is a substring of every string, so
    an empty key would otherwise pair the row with the entire candidate table.

    Args:
        row_tcc: Row (Series) providing 'clean_track_name' and
            'clean_artist_name'.
        df_en: Candidate frame providing 'clean_title' and 'clean_artist'.
            Defaults to the module-level ``df_en_cleaned_large``.

    Returns:
        A list with one dict per match. Each dict merges the fields of
        ``row_tcc`` with those of the candidate row; on a key collision the
        candidate's value wins.
    """
    if df_en is None:
        df_en = df_en_cleaned_large

    track = row_tcc['clean_track_name']
    artist = row_tcc['clean_artist_name']
    if not track or not artist:
        return []

    # Convert the left-hand row once instead of once per match.
    tcc_fields = row_tcc.to_dict()

    # Scan only the two key columns; a full candidate row is materialized
    # only when it actually matches.
    titles_en = df_en['clean_title'].tolist()
    artists_en = df_en['clean_artist'].tolist()

    matched = []
    for pos, (title_en, artist_en) in enumerate(zip(titles_en, artists_en)):
        if not title_en or not artist_en:
            continue
        title_ok = track in title_en or title_en in track
        artist_ok = artist in artist_en or artist_en in artist
        if title_ok and artist_ok:
            matched.append({**tcc_fields, **df_en.iloc[pos].to_dict()})
    return matched

# Run the matching in parallel on all CPU cores but one (at least one worker).
num_workers = max(1, cpu_count() - 1)
print("num_workers 수 :", num_workers)
tcc_rows = [row for _, row in df_tcc.iterrows()]
with Pool(num_workers) as pool:
    progress = tqdm(pool.imap(match_rows, tcc_rows),
                    total=len(df_tcc), desc="Processing in Parallel")
    results = list(progress)

# Flatten the per-row match lists into a single table.
merged_rows = []
for row_matches in results:
    merged_rows.extend(row_matches)
df_merged_parallel = pd.DataFrame(merged_rows)


num_workers 수 : 95


Processing in Parallel: 100%|██████████| 28372/28372 [28:50<00:00, 16.40it/s]


In [None]:
# Write the merged matches to a CSV file on Drive, omitting the index column.
file_path = '/content/gdrive/MyDrive/bit_conference/coding/df_merged_parallel.csv'
df_merged_parallel.to_csv(file_path, index=False)
print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to /content/gdrive/MyDrive/bit_conference/coding/df_merged_parallel.csv


In [None]:
# Read the merged matches from 'coding/df_merged_parallel.csv' (relative to the
# working directory) and display the resulting frame.
df_merged_parallel = pd.read_csv('coding/df_merged_parallel.csv')
df_merged_parallel

In [None]:
# Keep only the pairs whose cleaned title AND cleaned artist are exactly equal.
same_title = df_merged_parallel['clean_track_name'] == df_merged_parallel['clean_title']
same_artist = df_merged_parallel['clean_artist_name'] == df_merged_parallel['clean_artist']

columns_to_keep = [
    "artist_name", "track_name", "release_date", "genre", "lyrics",
    "dating", "violence", "world/life", "night/time", "shake the audience",
    "family/gospel", "romantic", "communication", "obscene", "music",
    "movement/places", "light/visual perceptions", "family/spiritual",
    "like/girls", "sadness", "feelings",
    "danceability", "loudness", "acousticness", "instrumentalness",
    "valence", "energy", "topic", "age", "views", "lyrics_clean",
]

# Select the wanted columns and rename the two lyric columns in one pass.
df_filtered = (
    df_merged_parallel
    .loc[same_title & same_artist, columns_to_keep]
    .rename(columns={"lyrics": "short_lyrics", "lyrics_clean": "long_lyrics"})
)
df_filtered

# Write the filtered table to Drive, omitting the index column.
file_path = '/content/gdrive/MyDrive/bit_conference/coding/df_filtered.csv'
df_filtered.to_csv(file_path, index=False)
print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to /content/gdrive/MyDrive/bit_conference/coding/df_filtered.csv


### 가사 요약

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder so that the relative
# 'coding/...' paths used below resolve.
%cd /content/gdrive/MyDrive/bit_conference/

/content/gdrive/.shortcut-targets-by-id/1YDrmXvwQeDTF3AVegVo_-qlULY2-1-qE/bit_conference


In [None]:
!pip install transformers accelerate huggingface_hub

In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding the access token in
# the notebook: the token is taken from the HF_TOKEN environment variable when
# it is set; when it is not (token=None), login() prompts for it interactively.
login(token=os.environ.get("HF_TOKEN"))

import torch
from transformers import pipeline
from tqdm import tqdm
import pandas as pd

# Model setup: build a text-generation pipeline for the instruction-tuned model,
# loaded in bfloat16 with automatic device placement.
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [None]:
# Load the filtered song table from 'coding/df_filtered.csv' (relative to the
# working directory).
df_filtered = pd.read_csv('coding/df_filtered.csv')

In [None]:
from concurrent.futures import ThreadPoolExecutor

# 병렬 처리 함수
def summarize_lyrics(lyrics):
    """Summarize the first half of a song's lyrics using the module-level ``pipe``.

    Args:
        lyrics: Full lyrics text. Non-string values (for example NaN from an
            empty cell) and blank text are accepted and yield "".

    Returns:
        The ``content`` of the last message in the generated conversation, or
        "" when there is no text to summarize.
    """
    # A missing cell arrives as a float NaN, which has no len(); treat any
    # non-string as "nothing to summarize".
    if not isinstance(lyrics, str):
        return ""

    # Only the first half of the text (by character count) is sent to the model.
    input_text = lyrics[: len(lyrics) // 2]
    if not input_text.strip():
        # Do not ask the model to summarize empty input.
        return ""

    messages = [
        {"role": "system", "content": "Summarize the given song lyrics into two concise sentences."},
        {"role": "user", "content": input_text},
    ]

    output = pipe(messages, max_new_tokens=100)
    # generated_text holds the conversation; its last message is the model's reply.
    summary = output[0]["generated_text"][-1]['content']
    return summary

# Summarize every lyric using two worker threads, with a progress bar.
lyrics_column = df_filtered["long_lyrics"]
with ThreadPoolExecutor(max_workers=2) as executor:
    summary_iter = executor.map(summarize_lyrics, lyrics_column)
    summaries = list(tqdm(summary_iter, total=len(df_filtered), desc="Summarizing Lyrics"))

# Store the summaries in a new column and display the frame.
df_filtered["lyrics_summary"] = summaries
df_filtered

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 1/4312 [00:03<4:20:24,  3.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 2/4312 [00:04<2:33:53,  2.14s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 3/4312 [00:06<2:32:32,  2.12s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 4/4312 [00:08<2:10:25,  1.82s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 5/4312 [00:09<1:50:51,  1.54s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Summarizing Lyrics:   0%|          | 6/4312 [00:11<2:10:02,  1.81s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generati

Unnamed: 0,artist_name,track_name,release_date,genre,short_lyrics,dating,violence,world/life,night/time,shake the audience,...,loudness,acousticness,instrumentalness,valence,energy,topic,age,views,long_lyrics,lyrics_summary
0,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,0.035537,0.096777,0.443435,0.001284,0.001284,...,0.647540,0.954819,0.000002,0.325021,0.263240,world/life,1.000000,24942,believe for every drop of rain that falls flow...,"The lyrics convey a message of hope and faith,..."
1,andy williams,it's the most wonderful time of the year,1953,pop,wonderful time year kid jingle bell tell good ...,0.001253,0.060214,0.001253,0.323077,0.001253,...,0.655642,0.778112,0.000000,0.718673,0.554541,night/time,0.957143,27015,its the most wonderful time of the year with t...,Here is a 2-sentence summary of the song lyric...
2,bobby vinton,blue velvet,1954,pop,blue wear blue bluer night softer satin light ...,0.001462,0.001462,0.001462,0.001462,0.001462,...,0.642694,0.873494,0.000002,0.330173,0.258235,music,0.942857,26447,she wore blue velvet bluer than velvet was the...,The song appears to be a romantic and poetic t...
3,andy williams,(where do i begin) love story,1957,pop,begin tell story great sweet story older simpl...,0.001012,0.001012,0.355473,0.084521,0.001012,...,0.636156,0.632530,0.000717,0.289984,0.453436,world/life,0.900000,54831,where do begin to tell the story of how great ...,The song's narrator is drawn to a person who b...
4,simon & garfunkel,mrs. robinson,1959,pop,love know bless heaven hold place pray like kn...,0.000940,0.000940,0.000940,0.000940,0.000940,...,0.531882,0.783132,0.000021,0.812448,0.426409,sadness,0.871429,311410,dee dee dee dee dee dee dee dee dee dee dee de...,"Unfortunately, I don't see any meaningful lyri..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4307,q-tip,life is better,2019,hip hop,life fill come fall away play bangin gonna ban...,0.001645,0.001645,0.440262,0.001645,0.001645,...,0.729098,0.364457,0.000000,0.689819,0.444427,world/life,0.014286,24323,lifes filled with gray but now it comes clean ...,Here is a two-sentence summary of the song lyr...
4308,future,tony montana,2019,hip hop,fuckin cockroaches motherfuckin freebandz want...,0.064614,0.067606,0.002193,0.002193,0.002193,...,0.752583,0.018172,0.000000,0.245672,0.705697,obscene,0.014286,179219,cockroaches muthafuckin freebandz want me to b...,Here is a 2-sentence summary of the song lyric...
4309,rakim,when i b on tha mic,2019,hip hop,internationally know hardcore real niggas inte...,0.001754,0.001754,0.001754,0.001754,0.001754,...,0.726226,0.045581,0.000000,0.839242,0.484468,obscene,0.014286,118179,hardcore real ill am internationally known whe...,Here is a two-sentence summary of the song lyr...
4310,nipsey hussle,hussle in the house,2019,hip hop,look comin straight slauson crazy motherfucker...,0.001096,0.001096,0.001096,0.001096,0.033829,...,0.851755,0.014156,0.000000,0.735161,0.913911,obscene,0.014286,45580,look am comin straight off of slauson crazy na...,Here is a concise summary of the song lyrics i...


In [None]:
# 특정 패턴 제거 함수 (대소문자 구분 없이 "here" 확인)
def clean_summary(text):
    """Drop a leading "Here is ..." preamble from a generated summary.

    When the text starts with "here" (case-insensitive) and contains a blank
    line, only what follows the first blank line is kept. Any other text is
    returned unchanged.

    Args:
        text: The summary. Non-string values (e.g. None or NaN for a missing
            summary) are returned as-is instead of raising.

    Returns:
        The summary without its preamble, or ``text`` itself when no preamble
        is detected.
    """
    if not isinstance(text, str):
        return text
    if text.lower().startswith("here") and "\n\n" in text:
        # maxsplit=1 keeps any later blank lines inside the summary intact.
        return text.split("\n\n", 1)[-1]
    return text

# Clean the summaries into a separate frame (df_summarized) so that
# df_filtered keeps the raw model output.
cleaned_summaries = df_filtered["lyrics_summary"].apply(clean_summary)
df_summarized = df_filtered.copy()
df_summarized["lyrics_summary"] = cleaned_summaries


In [None]:
# Write the table with cleaned summaries to a CSV file on Drive, omitting the index column.
file_path = '/content/gdrive/MyDrive/bit_conference/coding/df_summarized.csv'
df_summarized.to_csv(file_path, index=False)
print(f"DataFrame successfully saved to {file_path}")

DataFrame successfully saved to /content/gdrive/MyDrive/bit_conference/coding/df_summarized.csv
