# 🎼 lyrics_scoring

## 🧱 SetUp

### 🚀 installation

In [1]:
# Clone the helper repository that ships the Mecab-ko install script for Colab.
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 138 (delta 26), reused 22 (delta 8), pack-reused 91 (from 1)[K
Receiving objects: 100% (138/138), 1.72 MiB | 8.92 MiB/s, done.
Resolving deltas: 100% (65/65), done.


In [2]:
# Move into the cloned repository so the install script can be run by its relative name.
%cd Mecab-ko-for-Google-Colab/

/content/Mecab-ko-for-Google-Colab


In [3]:
# Run the repository's install script (its log shows konlpy and mecab-ko being installed).
!bash install_mecab-ko_on_colab_light_220429.sh

Installing konlpy.....
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.8/493.8 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2024-11-27 05:11:55--  https://bitbucket.org/eunjeon/mecab-ko/downloads/me

### 📚 Library

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')

# Tokenizer
from konlpy.tag import Mecab, Komoran

### 🗂️ Google Drive Mount

In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths read later live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 📦 Data

In [6]:
# Load the vocabulary list and the lyrics dataset from Google Drive.
vocab_data = pd.read_csv("/content/drive/MyDrive/ComInKo/data/dictionary.csv")
lyrics_data = pd.read_csv("/content/drive/MyDrive/ComInKo/data/data_R.csv")
# Keep only rows that actually have lyrics: first drop missing values,
# then drop entries that are empty once surrounding whitespace is stripped.
lyrics_data = lyrics_data[lyrics_data["lyrics"].notnull()]
lyrics_data = lyrics_data[lyrics_data["lyrics"].str.strip() != ""]

## 🔎 PreProcessing & Tokenizer

In [7]:
# Create the Mecab and Komoran analyzer instances that are passed to the
# functions defined below.
mecab = Mecab()
komoran = Komoran()

# 한글 여부 확인 함수
def is_hangul(text):
    """Return True when ``text`` contains at least one character in the range 가-힣."""
    hangul_match = re.search("[가-힣]", text)
    return hangul_match is not None

# 영어 여부 확인 함수
def is_english(text):
    """Return True when ``text`` contains at least one ASCII letter (a-z or A-Z)."""
    latin_match = re.search("[a-zA-Z]", text)
    return latin_match is not None

# 어간 추출 함수
def extract_stems_with_pos(text, mecab, komoran):
    """Collect the verb, adjective and common-noun morphemes found in ``text``.

    Mecab is the primary analyzer: every morpheme it tags as VV, VA or NNG is
    kept in order, repeats included. Komoran then supplements the result and
    contributes only ``(morpheme, tag)`` pairs that are not already present.

    Args:
        text: String to analyze.
        mecab: Object whose ``pos(text)`` returns ``(morpheme, tag)`` pairs.
        komoran: Object with the same ``pos`` interface. It is used on a
            best-effort basis: if it raises, a warning is issued and whatever
            was collected so far is returned.

    Returns:
        list[tuple[str, str]]: ``(morpheme, tag)`` pairs, Mecab results first.
    """
    target_tags = ("VV", "VA", "NNG")

    # Primary pass: keep every matching Mecab token, duplicates included.
    stems = [(word, pos) for word, pos in mecab.pos(text) if pos in target_tags]

    # Track what is already collected in a set so the Komoran pass does not
    # rescan the whole list for every token.
    seen = set(stems)
    try:
        for word, pos in komoran.pos(text):
            pair = (word, pos)
            if pos in target_tags and pair not in seen:
                seen.add(pair)
                stems.append(pair)
    except Exception as exc:
        # Komoran is only a supplement, so a failure must not abort the
        # analysis; surface it instead of discarding it silently.
        warnings.warn(
            f"Komoran analysis failed; using Mecab results only: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
    return stems

# 단어장을 어간 단위로 변환
def preprocess_vocab(vocab_data, mecab, komoran):
    """Build a ``(stem, pos) -> level`` lookup from the vocabulary table.

    Each vocabulary entry is analyzed with ``extract_stems_with_pos``; every
    resulting ``(stem, pos)`` pair is recorded with the entry's level. When a
    pair occurs for several entries, the lowest integer level is kept. An
    integer level also replaces a previously stored non-integer value (for
    example a missing level), so an unusable first entry cannot block later
    valid ones or break the comparison.

    Rows whose "Vocabulary" value is not a string (for example a missing
    value) are skipped.

    Args:
        vocab_data: DataFrame with "Vocabulary" and "Level" columns.
        mecab: Analyzer forwarded to ``extract_stems_with_pos``.
        komoran: Analyzer forwarded to ``extract_stems_with_pos``.

    Returns:
        tuple: ``(stem_to_level, temp_df)`` where ``stem_to_level`` is the
        lookup dict and ``temp_df`` is a DataFrame with one row per extracted
        pair (columns "Vocabulary", "Stem", "Word class", "Level").
    """
    stem_to_level = {}
    records = []
    for _, row in vocab_data.iterrows():
        vocab = row["Vocabulary"]
        level = row["Level"]
        if not isinstance(vocab, str):
            # Nothing to analyze for missing or non-text vocabulary cells.
            continue
        for stem, pos in extract_stems_with_pos(vocab, mecab, komoran):
            records.append({"Vocabulary": vocab, "Stem": stem, "Word class": pos, "Level": level})
            key = (stem, pos)
            if key not in stem_to_level:
                stem_to_level[key] = level
                continue
            current = stem_to_level[key]
            # NOTE(review): only Python ints count as usable levels here and in
            # the scoring code; confirm the "Level" column never arrives as float.
            if isinstance(level, int) and (not isinstance(current, int) or level < current):
                stem_to_level[key] = level
    return stem_to_level, pd.DataFrame(records)

# 어간 확장 함수
def extend_stem(stem):
    """Return the alternative spellings of ``stem`` to try in the vocabulary lookup.

    Currently a single candidate is produced: the stem with "이" appended.
    """
    suffixes = ("이",)
    return [stem + suffix for suffix in suffixes]

# 가사 난이도 계산 함수
def calculate_lyrics_difficulty(lyrics, stem_to_level, mecab, komoran):
    """Score ``lyrics`` as the mean vocabulary level of its known morphemes.

    Every ``(stem, pos)`` pair from ``extract_stems_with_pos`` is looked up in
    ``stem_to_level``. A verb or adjective (VV / VA) that is not found is
    retried with the candidates from ``extend_stem``. Only integer levels
    contribute to the mean.

    Returns:
        tuple: ``(difficulty, analysis_result)`` where ``difficulty`` is the
        mean of the integer levels (0 when there are none) and
        ``analysis_result`` is a list of ``(stem, pos, level)`` with the
        placeholder "<없음>" for pairs without a level.
    """
    missing = "<없음>"
    counted_levels = []
    analysis_result = []
    for stem, pos in extract_stems_with_pos(lyrics, mecab, komoran):
        level = stem_to_level.get((stem, pos), missing)
        needs_retry = level == missing and pos in ["VV", "VA"]
        if needs_retry:
            # Try each extended spelling until one resolves to an integer level.
            for candidate in extend_stem(stem):
                level = stem_to_level.get((candidate, pos), missing)
                if isinstance(level, int):
                    break
        if isinstance(level, int):
            counted_levels.append(level)
        analysis_result.append((stem, pos, level))
    if counted_levels:
        difficulty = sum(counted_levels) / len(counted_levels)
    else:
        difficulty = 0
    return difficulty, analysis_result

# 전체 가사 데이터 분석 및 정렬
def analyze_and_sort_lyrics(data, lyrics_data, mecab, komoran):
    """Score every song and return the scores ordered from hardest to easiest.

    The vocabulary lookup is built from ``data`` with ``preprocess_vocab``,
    then each row of ``lyrics_data`` (columns "title", "artist", "lyrics") is
    scored with ``calculate_lyrics_difficulty``. A song whose scoring raises
    is recorded with a difficulty of 0.

    Returns:
        DataFrame with columns "title", "artist" and "difficulty_level",
        sorted by "difficulty_level" in descending order.
    """
    stem_to_level, _ = preprocess_vocab(data, mecab, komoran)
    records = []
    for _, song in lyrics_data.iterrows():
        title, artist, lyrics = song["title"], song["artist"], song["lyrics"]
        try:
            score, _ = calculate_lyrics_difficulty(lyrics, stem_to_level, mecab, komoran)
        except Exception:
            # Scoring failed for this song; fall back to a difficulty of 0.
            score = 0
        records.append({"title": title, "artist": artist, "difficulty_level": score})
    return pd.DataFrame(records).sort_values(by="difficulty_level", ascending=False)

# lyrics_data에 레벨 정보 추가
def add_level_info_to_lyrics(lyrics_data, stem_to_level, mecab, komoran):
    """Attach a "difficulty_level" column to ``lyrics_data`` and return it.

    Each row's "lyrics" value is scored with ``calculate_lyrics_difficulty``.
    Missing or whitespace-only lyrics, and lyrics whose scoring raises, get a
    difficulty of 0. The column is written onto the frame that was passed in.
    """
    def _score(lyrics):
        # Nothing to analyze: treat missing / blank lyrics as difficulty 0.
        if pd.isnull(lyrics) or lyrics.strip() == "":
            return 0
        try:
            difficulty, _ = calculate_lyrics_difficulty(lyrics, stem_to_level, mecab, komoran)
        except Exception:
            # Scoring failed; fall back to difficulty 0.
            return 0
        return difficulty

    lyrics_data["difficulty_level"] = [
        _score(row["lyrics"]) for _, row in lyrics_data.iterrows()
    ]
    return lyrics_data

# Convert the vocabulary list into a (stem, pos) -> level lookup
# (the step that saved the result to disk has been removed).
stem_to_level, temp_vocab_df = preprocess_vocab(vocab_data, mecab, komoran)

# Score every song and sort the results by difficulty.
# Note: this call builds the vocabulary lookup again internally from vocab_data.
sorted_lyrics_data = analyze_and_sort_lyrics(vocab_data, lyrics_data, mecab, komoran)

# Add the difficulty score to lyrics_data as a new column.
lyrics_data = add_level_info_to_lyrics(lyrics_data, stem_to_level, mecab, komoran)

### 🪜 Divide into Steps

In [8]:
# Quartile cut points (25%, 50%, 75%) of the per-song difficulty scores.
# re_scoring below reads the 0.25 and 0.75 entries.
quant = lyrics_data["difficulty_level"].quantile([0.25, 0.5, 0.75])

def re_scoring(difficulty_level):
    """Map a numeric difficulty to a category label using the ``quant`` cut points.

    Scores up to the 0.25 quantile are "초급", scores up to the 0.75 quantile
    are "중급", and everything else is "고급".
    """
    for cutoff, label in ((0.25, "초급"), (0.75, "중급")):
        if difficulty_level <= quant[cutoff]:
            return label
    return "고급"

# Categorize each song's difficulty score with re_scoring.
lyrics_data["difficulty_level_category"] = lyrics_data["difficulty_level"].apply(re_scoring)

# Preview the first rows with the score and its category.
lyrics_data[["title", "artist", "difficulty_level", "difficulty_level_category"]].head()

Unnamed: 0,title,artist,difficulty_level,difficulty_level_category
0,Magnetic,아일릿(ILLIT),3.30303,고급
1,The Astronaut,진,2.241379,고급
2,GGUM,연준,2.19697,중급
3,SHEESH,BABYMONSTER,2.2,중급
4,CRAZY,LE SSERAFIM (르세라핌),3.181818,고급


### 🗂️ Save to csv, xlsx

In [9]:
# lyrics_data.to_excel("/content/drive/MyDrive/ComInKo/data/lyrics_with_levels.xlsx", index=False)
# lyrics_data.to_csv("/content/drive/MyDrive/ComInKo/data/lyrics_with_levels.csv", encoding = "cp949", index=False)

### 🎨 Check Lyrics and Visualization

In [11]:
# 노래 제목 입력 및 분석 실행 (별도의 셀)
def show_lyrics_analysis_by_sentence(title, lyrics_data, stem_to_level, mecab, komoran):
    """Print a word-by-word level breakdown for the song named ``title``.

    The first row of ``lyrics_data`` whose "title" equals ``title`` is used.
    Its lyrics are split on '.', '!' and '?', each non-empty piece is analyzed
    with ``calculate_lyrics_difficulty``, and every analyzed word is printed
    with its part of speech and level. The overall difficulty printed at the
    end is the mean of all integer levels (0 when there are none).

    Prints a not-found message and returns when no row matches ``title``.
    """
    matches = lyrics_data[lyrics_data["title"] == title]
    if matches.empty:
        print(f"제목 '{title}'에 해당하는 가사를 찾을 수 없습니다.")
        return

    lyrics = matches.iloc[0]["lyrics"]
    print(f"제목: {title}\n가사:\n{lyrics}\n 단위 분석:\n")

    stripped_pieces = (piece.strip() for piece in re.split(r'[.!?]', lyrics))
    counted_levels = []
    for sentence in filter(None, stripped_pieces):
        _, analysis = calculate_lyrics_difficulty(sentence, stem_to_level, mecab, komoran)
        for stem, pos, level in analysis:
            if isinstance(level, int):
                counted_levels.append(level)
            print(f"\n --> 단어: {stem}, 품사: {pos}, 레벨: {level}")

    if counted_levels:
        difficulty = sum(counted_levels) / len(counted_levels)
    else:
        difficulty = 0
    print(f"\n가사 전체 난이도: {difficulty:.2f}")

# Prompt for a song title and print the analysis for that song.
input_title = input("노래 제목을 입력하세요: ")
show_lyrics_analysis_by_sentence(input_title, lyrics_data, stem_to_level, mecab, komoran)

노래 제목을 입력하세요: Horizon
제목: Horizon
가사:
반복해 Round and round
늘 같은 Ending
색채를 잃은 Eyes
그 시선에 담긴 난 Dangerous

익숙함에 길들어
안주할 뿐이야
색다른 걸 찾아봐
내 안의 틀을 부숴

끝없는 Spatial Horizon
사라진 현실의 경계
늘 잠겨있던 문을 열어

Spatial Horizon
찰나에 덮쳐 온 Panic
아득히 빠져
마치 홀린 듯이
날 뒤덮어
드리우는 Shadow
익숙했던 궤도를 벗어나

내 맘에 벌어진 괴리
닿을 것처럼
멀어지는 느낌
내 시야를 삼켜
Spatial Horizon

내 시야를 삼켜
Spatial Horizon

혜성처럼 퍼져가
Can feel the light is coming
빛이 나를 감싸고

넘고 싶지 않던 저, 지평선 넘어
다가온 별빛들이 구원 같아 Hold me
홀린 듯해 그 끝이 어디든
더 두렵지 않아 난 No more time

Oh Better days
Oh Better days come
‘Cuz Better days
‘Cuz Better days
Oh Better days
Oh Better days
잠겨있던 문을 열어

Spatial Horizon
찰나에 덮쳐 온 Panic
아득히 빠져
마치 홀린 듯이
날 뒤덮어
드리우는 Shadow
익숙했던 궤도를 벗어나

내 맘에 벌어진 괴리
닿을 것처럼
멀어지는 느낌
내 시야를 삼켜
Spatial Horizon

내 시야를 삼켜
Spatial Horizon

Oh Better days
Oh Better days come
‘Cuz Better days
‘Cuz Better days
Oh Better days
Oh Better days

내 시야를 삼켜
Spatial Horizon
 단위 분석:


 --> 단어: 반복, 품사: NNG, 레벨: 3

 --> 단어: 같, 품사: VA, 레벨: 1

 --> 단어: 색채, 품사: NNG, 레벨: 6

 --> 단어: 잃, 품사: VV, 레벨: 2

 --> 단어