In [1]:
import os
import json
import numpy as np
import pandas as pd

In [2]:
with open('data/train.json',encoding='utf-8-sig') as f:
    train_dict = json.load(f)
    
with open('data/song_meta.json',encoding='utf-8-sig') as f:
    song_dict = json.load(f)
    
with open('data/genre_gn_all.json',encoding='utf-8-sig') as f:
    genre_dict = json.load(f)
    
train_df = pd.DataFrame.from_dict(train_dict)
song_df = pd.DataFrame.from_dict(song_dict)

# 데이터 처리

## train_df 전처리

In [3]:
train_df.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000


### train_df 필요한 컬럼 추가

In [4]:
# 플레이리스트 곡수 컬럼 추가
train_df['tags_cnt'] = train_df['tags'].map(lambda x : len(x))

# 플레이리스트 태그수 컬럼 추가
train_df['songs_cnt'] = train_df['songs'].map(lambda x : len(x))

### train_df 에 포함된 곡들 (중복포함,중복제거)

In [5]:
from itertools import chain

# 플레이리스트 포함된 노래 중복포함
songs_duplicate = chain.from_iterable(train_df['songs'].tolist())

# 플레이리스트 포함된 노래 중복제거
songs_unique = list(set(songs_duplicate))

### train_df 에 포함된 태그들 (중복포함,중복제거)

In [6]:
# 플레이리스트 포함된 태그 중복포함
tags_duplicate = list(chain.from_iterable(train_df['tags'].tolist()))

# 플레이리스트 포함된 태그 중복제거
tags_unique = list(set(tags_duplicate))

### tag에 새로운 id부여, new_tags_id 컬럼 생성

In [7]:
# { 태그 : 새로운id } 딕셔너리
tag_to_id = dict(zip(tags_unique,range(0,len(tags_unique))))

# { 새로운id : 태그 } 딕셔너리
id_to_tag = dict(zip(range(0,len(tags_unique)),tags_unique))

train_df['new_tags_id'] = train_df['tags'].map(lambda x : [tag_to_id[v] for v in x])

### songs에 새로운 id부여, new_songs_id 컬럼 생성

In [8]:
# { 노래 : 새로운id } 딕셔너리
song_to_id = dict(zip(songs_unique,range(0,len(songs_unique))))

# { 새로운id : 태그 } 딕셔너리
id_to_song = dict(zip(range(0,len(songs_unique)),songs_unique))

train_df['new_songs_id'] = train_df['songs'].map(lambda x : [song_to_id[v] for v in x])

In [9]:
train_df.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,tags_cnt,songs_cnt,new_tags_id,new_songs_id
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,19,[28739],"[456704, 112732, 333158, 488440, 258853, 12127..."
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,2,42,"[20164, 4666]","[375894, 587314, 431997, 104605, 338568, 21226..."
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,2,28,"[15170, 26443]","[72132, 240434, 144495, 161861, 307991, 222934..."


In [10]:
train_df = train_df[['id','plylst_title','tags','new_tags_id','songs','new_songs_id','tags_cnt','songs_cnt','like_cnt','updt_date']]
train_df.head(3)

Unnamed: 0,id,plylst_title,tags,new_tags_id,songs,new_songs_id,tags_cnt,songs_cnt,like_cnt,updt_date
0,61281,여행같은 음악,[락],[28739],"[525514, 129701, 383374, 562083, 297861, 13954...","[456704, 112732, 333158, 488440, 258853, 12127...",1,19,71,2013-12-19 18:36:19.000
1,10532,요즘 너 말야,"[추억, 회상]","[20164, 4666]","[432406, 675945, 497066, 120377, 389529, 24427...","[375894, 587314, 431997, 104605, 338568, 21226...",2,42,1,2014-12-02 16:19:42.000
2,76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[까페, 잔잔한]","[15170, 26443]","[83116, 276692, 166267, 186301, 354465, 256598...","[72132, 240434, 144495, 161861, 307991, 222934...",2,28,17,2017-08-28 07:09:34.000


In [11]:
train_df.columns = ['플리_id','플리제목','태그','새태그_id','노래_id','새노래_id','태그수','노래수','좋아요수','갱신일']
train_df.head(3)

Unnamed: 0,플리_id,플리제목,태그,새태그_id,노래_id,새노래_id,태그수,노래수,좋아요수,갱신일
0,61281,여행같은 음악,[락],[28739],"[525514, 129701, 383374, 562083, 297861, 13954...","[456704, 112732, 333158, 488440, 258853, 12127...",1,19,71,2013-12-19 18:36:19.000
1,10532,요즘 너 말야,"[추억, 회상]","[20164, 4666]","[432406, 675945, 497066, 120377, 389529, 24427...","[375894, 587314, 431997, 104605, 338568, 21226...",2,42,1,2014-12-02 16:19:42.000
2,76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[까페, 잔잔한]","[15170, 26443]","[83116, 276692, 166267, 186301, 354465, 256598...","[72132, 240434, 144495, 161861, 307991, 222934...",2,28,17,2017-08-28 07:09:34.000


### 변수 정리

<현재>

- tags_duplicate : 태그 중복 포함 리스트
- tags_unique : 태그 중복 제거 리스트
- songs_duplicate : 노래 중복 포함 리스트
- songs_unique : 노래 중복 제거 리스트
 
- tag_to_id : { 태그 : 새로운id } 딕셔너리
- id_to_tag : { 새로운id : 태그 } 딕셔너리
- song_to_id : { 노래 : 새로운id } 딕셔너리
- id_to_song : { 새로운id : 노래 } 딕셔너리

## song_df 전처리

- 칼럼명 한글화
- 수정날짜 데이터 변환

In [12]:
song_df.head(3)

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],0
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],1
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],2


In [13]:
song_df.columns = ['장르_소분류','수정날짜','앨범명','앨범ID','가수ID','노래명','장르_대분류','가수이름','노래ID']
song_df = song_df[['노래ID','노래명','앨범ID','앨범명','가수ID','가수이름','장르_대분류','장르_소분류','수정날짜']]
song_df.head(3)

Unnamed: 0,노래ID,노래명,앨범ID,앨범명,가수ID,가수이름,장르_대분류,장르_소분류,수정날짜
0,0,Feelings,2255639,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,[2727],[Various Artists],[GN0900],[GN0901],20140512
1,1,"Bach : Partita No. 4 In D Major, BWV 828 - II....",376431,"Bach : Partitas Nos. 2, 3 & 4",[29966],[Murray Perahia],[GN1600],"[GN1601, GN1606]",20080421
2,2,Solsbury Hill (Remastered 2002),4698747,Hit,[3361],[Peter Gabriel],[GN0900],[GN0901],20180518


In [14]:
song_df.sort_values(by='수정날짜',ascending=False).head(3)

Unnamed: 0,노래ID,노래명,앨범ID,앨범명,가수ID,가수이름,장르_대분류,장르_소분류,수정날짜
448286,448286,여수 바다 (Yeosu sea),10403230,고요한 밤바다 구경하기 [여수 바다],[2737142],[무드홀릭 (Moodholic)],[GN1800],[GN1801],20220113
141185,141185,잔잔한 바람이 불어와 (There is a gentle breeze),10403230,고요한 밤바다 구경하기 [여수 바다],[2737142],[무드홀릭 (Moodholic)],[GN1800],[GN1801],20220113
419219,419219,MDKDVV (Gardons le sourire),10420389,MDKDVV (Gardons le sourire),[2764604],[Bryans],[GN1300],[GN1301],20200423


In [15]:
songid_songname_dict = dict(zip(song_df['노래ID'].tolist(),song_df['노래명'].tolist()))
songid_songname_dict

{0: 'Feelings',
 1: 'Bach : Partita No. 4 In D Major, BWV 828 - II. Allemande',
 2: 'Solsbury Hill (Remastered 2002)',
 3: 'Feeling Right (Everything Is Nice) (Feat. Popcaan & Wale)',
 4: '그남자 그여자',
 5: 'Para Los Enamorados',
 6: 'Sibelius : Valse Triste Op.44 (시벨리우스 : 슬픈 왈츠 작품번호 44)',
 7: 'Superman March (From &#34;Superman&#34; / Live At Walt Disney Concert Hall, Los Angeles / 2019)',
 8: 'Lovers’ Leap (Feat. Qypthone)',
 9: '사랑, 그대라는 멜로디',
 10: 'Hi (Heyoo)',
 11: 'Everything We Do (2002 Digital Remaster)',
 12: 'So In Love',
 13: 'Voyage',
 14: 'Knock You Out',
 15: 'Faure: Romance Sans Paroles In A Flat Major Op.17 III.Andante.Moderato',
 16: 'Can&#39;t Stand Still',
 17: 'Girl Crush',
 18: 'ASMR 숙면과 휴식에 좋은 편안한 빗소리 (백색소음)',
 19: '무얼 기다리나 (Feat. 조원선)',
 20: 'IL MONDO (Sung by 길병민, 유채훈, 한기주) (‘팬텀싱어 3’ 유채훈 가창곡)',
 21: 'WHY',
 22: 'Hush, Hush, Sweet Charlotte',
 23: 'Anonymous: Up! Awake! From Highest Steeple - Arr. Jacob Praetorius (1586-1651)',
 24: 'As Long As I`m Alive (Radio Edit)

In [16]:
songid_singername_dict = dict(zip(song_df['노래ID'].tolist(),song_df['가수이름'].tolist()))

In [17]:
most_recent = '20200423'

In [18]:
songs_2020 = song_df[song_df.수정날짜.str.startswith('2020')]

In [19]:
songs_2020.loc[:,('수정날짜')] = pd.to_datetime(songs_2020.loc[:,('수정날짜')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [20]:
songs_2020.loc[:,('수정날짜')]

21       2020-03-31
27       2020-01-29
38       2020-02-14
209      2020-04-17
293      2020-02-12
            ...    
707546   2020-02-21
707601   2020-02-18
707637   2020-01-17
707759   2020-02-25
707949   2020-03-13
Name: 수정날짜, Length: 12426, dtype: datetime64[ns]

In [21]:
from datetime import datetime

date1 = datetime(2019,1,1)
date2 = datetime(2020,4,23)
specific_date = songs_2020[(songs_2020['수정날짜'] >= date1) & (songs_2020['수정날짜'] <= date2)]

my_song_list = specific_date['노래ID'].tolist()
my_genre_b_list = specific_date['장르_대분류'].tolist()
my_genre_d_list = specific_date['장르_소분류'].tolist()

In [22]:
train_df.loc[:,('갱신일')] = pd.to_datetime(train_df['갱신일'])

In [30]:
specific_date_plyst = train_df[(train_df['갱신일'] >= date1) & (train_df['갱신일'] <= date2)]
specific_date_plyst

Unnamed: 0,플리_id,플리제목,태그,새태그_id,노래_id,새노래_id,태그수,노래수,좋아요수,갱신일
3,147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...","[14776, 18924, 21355, 3039, 3164, 14889, 16921...","[394031, 195524, 540149, 287984, 440773, 10033...","[342495, 169897, 469393, 250235, 383169, 87161...",10,38,33,2019-12-05 15:15:18
8,70741,DANCING IN THE MOON-LIGHT .01,"[일렉트로니카, 포크, 메탈, 락, 댄스, 인디]","[14054, 3359, 27752, 28739, 21715, 3176]","[634861, 270738, 163936, 692209, 449477, 56342...","[551639, 235247, 142479, 601408, 390705, 48960...",6,40,0,2019-11-30 21:17:59
10,31804,걸그룹 땐쓰쏭,"[kpop, 댄스, 걸그룹댄스, 스트레스해소]","[7324, 21715, 2961, 21211]","[507380, 144826, 553894, 660381, 260730, 28188...","[440947, 125865, 481342, 573755, 226532, 24493...",4,157,74,2020-04-13 23:36:55
21,12460,ᴡʜɪᴛᴇ : ʀᴏᴍᴀɴᴛɪᴄ ᴊᴀᴢᴢ ᴀᴛ ᴀ ᴄᴀғᴇ,"[카페, 재즈, 잔잔한]","[7729, 16219, 26443]","[476581, 605317, 386899, 54627, 30636, 107115,...","[414200, 525962, 336248, 47324, 26523, 93073, ...",3,14,12,2020-03-05 03:06:06
24,122843,"지친 하루 끝, 힐링이 필요한 당신에게 추천하는 인디곡","[감성, 어쿠스틱, 잔잔한, 새벽, 편안한, 인디, 밤]","[12391, 11126, 26443, 6609, 9108, 3176, 9306]","[418935, 413444, 572996, 334539, 39801, 641888...","[364155, 359384, 497917, 290721, 34468, 557681...",7,20,14,2019-07-26 18:03:21
...,...,...,...,...,...,...,...,...,...,...
115060,63967,#기분업! 에너지필~ 사운드굿! 해외 EDM/일렉 선곡#2019년9월#,"[일렉트로니카, 운동, 에너지, 드라이브, EDM, 기분전환, 파티, 기분업, 하우...","[14054, 12243, 29157, 2883, 5587, 20455, 25372...","[615085, 195531, 563726, 556964, 186423, 24687...","[534442, 169903, 489875, 484001, 161965, 21451...",10,67,10,2019-12-21 11:22:39
115066,120325,METAL E'SM #2,"[록메탈, 밴드사운드, 록, 락메탈, 메탈, 락, extreme]","[27199, 20187, 17863, 4419, 27752, 28739, 27290]","[429629, 441511, 612106, 516359, 691768, 38714...","[373452, 383795, 531854, 448795, 601027, 33646...",7,12,3,2020-04-17 04:31:11
115068,11343,#1. 눈물이 앞을 가리는 나의_이야기,"[담시, 가족, 눈물, 그리움, 주인공, 나의_이야기, 사랑, 친구]","[6886, 19848, 10315, 25565, 1847, 8474, 17313,...","[50512, 249024, 250608, 371171, 229942, 694943...","[43717, 216378, 217755, 322592, 199796, 603802...",8,11,4,2019-08-16 20:59:22
115069,131982,퇴근 버스에서 편히 들으면서 하루를 마무리하기에 좋은 POP,"[잔잔한, 버스, 퇴근버스, Pop, 풍경, 퇴근길]","[26443, 26975, 17777, 17171, 2564, 7218]","[533534, 608114, 343608, 417140, 609009, 30217...","[463690, 528384, 298561, 362605, 529162, 26262...",6,55,4,2019-10-25 23:40:42


In [24]:
from collections import Counter

songs_list = chain.from_iterable(specific_date_plyst['노래_id'].tolist())
song_counted = Counter(songs_list)
song_added_count = sorted(song_counted.items(),key=lambda x : x[1],reverse=True)

In [25]:
song_added_count[0:10]

[(463173, 1519),
 (520093, 1483),
 (648628, 1440),
 (680366, 1440),
 (350309, 1398),
 (427724, 1380),
 (215411, 1380),
 (485155, 1324),
 (42155, 1305),
 (187047, 1302)]

In [26]:
for id,count in song_added_count[0:10]:
    print(songid_songname_dict[id],songid_singername_dict[id],count)

비가 내렸어 (Vocal by 스티브언니) ['업라이트 (Upright)'] 1519
고마운 사람 (Vocal by 이소진) ['업라이트 (Upright)'] 1483
분홍빛 가득한 날에 (Vocal by 호수) ['아재'] 1440
끝내지 못한 이야기 (Feat. 호수) ['어쿠스틱 멜로디 (Acoustic Melody)'] 1440
Sad Movie (Vocal by Levi) ['업라이트 (Upright)'] 1398
지워줄게 (Vocal by 스티브언니) ['아재'] 1380
지금보다 조금 (Feat. 이원) ['어쿠스틱 멜로디 (Acoustic Melody)'] 1380
선물 ['민우'] 1324
벙어리 ['홍아'] 1305
착각 ['박은옥'] 1302


In [27]:
my_genre_b_list = chain.from_iterable(my_genre_b_list)
genre_b_counted = Counter(my_genre_b_list)
genre_b_added_count = sorted(genre_b_counted.items(),key=lambda x : x[1],reverse=True)
genre_b_added_count

[('GN0900', 2216),
 ('GN0500', 1405),
 ('GN1200', 1229),
 ('GN0300', 987),
 ('GN0100', 968),
 ('GN0400', 737),
 ('GN1300', 713),
 ('GN1000', 674),
 ('GN1100', 662),
 ('GN1500', 560),
 ('GN2700', 521),
 ('GN0600', 389),
 ('GN1700', 341),
 ('GN1900', 323),
 ('GN2100', 316),
 ('GN1600', 296),
 ('GN1800', 292),
 ('GN2500', 275),
 ('GN0700', 262),
 ('GN0800', 236),
 ('GN0200', 207),
 ('GN2600', 207),
 ('GN2000', 179),
 ('GN1400', 152),
 ('GN2200', 71),
 ('GN2400', 55),
 ('GN2800', 51),
 ('GN9000', 39),
 ('GN2900', 9)]

In [28]:
for id,count in genre_b_added_count[0:10]:
    print(genre_dict[id],count)

POP 2216
인디음악 1405
랩/힙합 1229
랩/힙합 987
발라드 968
R&B/Soul 737
R&B/Soul 713
록/메탈 674
일렉트로니카 662
OST 560


In [29]:
my_genre_d_list = chain.from_iterable(my_genre_d_list)
genre_d_counted = Counter(my_genre_d_list)
genre_d_added_count = sorted(genre_d_counted.items(),key=lambda x : x[1],reverse=True)
for id,count in genre_d_added_count[0:10]:
    print(genre_dict[id],count)

세부장르전체 2262
'10- 1319
세부장르전체 1161
세부장르전체 1080
세부장르전체 935
세부장르전체 925
'10- 758
세부장르전체 715
'10- 691
세부장르전체 682
