In [1]:
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
import collections
warnings.filterwarnings('ignore')

from collections import defaultdict
from tqdm import tqdm


In [2]:
class clean_song_data():
    """Load, merge and clean the song metadata tables.

    Two CSV files are read, joined on ``song_id`` and passed through the
    cleaning pipeline (``filter_1`` -> ``filter_2`` -> ``add_count`` ->
    ``convert_type``).

    Attributes:
        song: frame read from the ``song`` CSV; never modified afterwards.
        song_info: frame read from the ``song_info`` CSV; never modified
            afterwards.
        result: the merged frame after cleaning, with the extra columns
            ``gener_count``, ``composer_count`` and ``genre_ids_list``.
    """

    # Value of the ``language`` column that marks an English song.
    ENGLISH_LANGUAGE = 52.0
    # A genre / artist must occur MORE than this many times to be kept.
    MIN_OCCURRENCES = 15

    def __init__(self, song, song_info):
        """Read both CSVs, merge them and run the whole cleaning pipeline.

        Args:
            song: path (or file-like object) of the songs CSV.
            song_info: path (or file-like object) of the extra-info CSV.
        """
        self.song = pd.read_csv(song)
        self.song_info = pd.read_csv(song_info)
        self.result = pd.merge(self.song, self.song_info, on='song_id')
        self.filter_1()
        self.filter_2()
        self.add_count()
        self.convert_type()

    def filter_1(self, language=ENGLISH_LANGUAGE):
        """Drop incomplete rows and keep a single language.

        1. Remove every row that contains a NaN in any column.
        2. Keep only rows whose ``language`` equals ``language``
           (default 52.0, i.e. English).

        Args:
            language: language code to keep.
        """
        complete_rows = self.result.dropna()
        self.result = complete_rows[complete_rows['language'] == language]

    def filter_2(self, threshold=MIN_OCCURRENCES):
        """Keep only rows whose genre and artist are frequent enough.

        The filters are applied one after the other: first ``genre_ids``,
        then ``artist_name`` (counted on the already genre-filtered rows).
        After each pass the column name and remaining row count are printed.

        Args:
            threshold: a value is kept when it occurs strictly more than
                ``threshold`` times (default 15).
        """
        for column in ('genre_ids', 'artist_name'):
            # value_counts() also works on an empty frame, so the second pass
            # no longer fails when the first one removed every row.
            occurrences = self.result[column].value_counts()
            frequent = occurrences.index[occurrences > threshold]
            self.result = self.result[self.result[column].isin(frequent)]
            print(column, len(self.result))

    def add_count(self):
        """Add the number of '|'-separated entries for genre and composer.

        Creates ``gener_count`` (from ``genre_ids``) and ``composer_count``
        (from ``composer``). The misspelt name ``gener_count`` is kept on
        purpose so existing consumers of the column keep working.
        """
        # Number of entries == number of '|' separators + 1.
        counts = {
            'gener_count': self.result['genre_ids'].str.count(r'\|') + 1,
            'composer_count': self.result['composer'].str.count(r'\|') + 1,
        }
        # assign() returns a new frame, so we never write into a filtered
        # slice (the pattern behind SettingWithCopyWarning).
        self.result = self.result.assign(**counts)

    def convert_type(self):
        """Add ``genre_ids_list``: ``genre_ids`` parsed into a list of ints.

        Example: ``'352|1995'`` becomes ``[352, 1995]``.
        """
        genre_lists = [
            [int(genre) for genre in ids.split('|')]
            for ids in self.result['genre_ids']
        ]
        self.result = self.result.assign(genre_ids_list=genre_lists)

In [3]:
# Input CSV locations, relative to the notebook's working directory.
# The songs path gets its own name so that `song` is only ever bound to the
# clean_song_data instance (it used to hold the path string first).
song_path = 'songs.csv'
song_info = 'song_extra_info.csv'

# Reads both files, merges them on song_id and runs the cleaning pipeline;
# the row counts printed below come from filter_2.
song = clean_song_data(song_path, song_info)

genre_ids 168873
artist_name 121316


In [4]:
# Raw songs table exactly as read from the CSV (no filtering applied).
song.song.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [5]:
# Raw extra-info table exactly as read from the CSV (no filtering applied).
song.song_info.head()

Unnamed: 0,song_id,name,isrc
0,LP7pLJoJFBvyuUwvu+oLzjT+bI+UeBPURCecJsX1jjs=,我們,TWUM71200043
1,ClazTFnk6r0Bnuie44bocdNMM3rdlrq0bCGAsGUWcHE=,Let Me Love You,QMZSY1600015
2,u2ja/bZE3zhCGxvbbOB3zOoUjx27u40cf5g09UXMoKQ=,原諒我,TWA530887303
3,92Fqsy0+p6+RHe2EoLKjHahORHR1Kq1TBJoClW9v+Ts=,Classic,USSM11301446
4,0QFmz/+rJy1Q56C1DuYqT9hKKqi5TUqx0sN0IwvoHrw=,愛投羅網,TWA471306001


In [6]:
# Merged and cleaned table, including the derived columns gener_count,
# composer_count and genre_ids_list.
song.result.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,name,isrc,gener_count,composer_count,genre_ids_list
9,btcG03OHY3GNKWccPP0auvtSbhxog/kllIIOx5grE/k=,232629,352|1995,Kodaline,Stephen Garrigan| Mark Prendergast| Vincent Ma...,Stephen Garrigan| Mark Prendergast| Vincent Ma...,52.0,The One,GBARL1401580,2,5,"[352, 1995]"
35,NV9HhUzyK50tGvxb3w0PdZoaw3Ypp86XDmmMr0vgFdg=,262749,139,John Legend,John Stephens,John Stephens,52.0,Under the Stars,USSM11509174,1,1,[139]
39,OLnB6amnCTMEz5KxrV7pdkXJWB+fnMoWVJSzv2AlBLk=,236355,465,Westlife,Bettis| Hammond,Bettis| Hammond,52.0,When You Tell Me That You Love Me,GBARL0500645,1,2,[465]
87,UfK2UdQAaYF6IJUXqSJ10FsbAF3YkvMJYcTYC2Ec5m4=,222649,1609,Calvin Harris,Calvin Harris,Calvin Harris,52.0,Summer,GBARL1400296,1,1,[1609]
103,nO+tjL6/qHgNO/ui0TjeP+nSsD+aoGcT9AHU7Dtrzsc=,227327,880,Casting Crowns,Mark Hall|Matt Maher|John Mabe|Blake Bollinger,Mark Hall|Matt Maher|John Mabe|Blake Bollinger,52.0,You Are the Only One,USA5W1300245,1,4,[880]


In [7]:
# Rows left after cleaning; should match the last count printed by filter_2.
len(song.result)

121316

In [8]:
# Column dtypes of the cleaned table.
song.result.dtypes

song_id            object
song_length         int64
genre_ids          object
artist_name        object
composer           object
lyricist           object
language          float64
name               object
isrc               object
gener_count         int64
composer_count      int64
genre_ids_list     object
dtype: object

In [12]:
# Composer strings are '|'-delimited (sometimes with a space after the pipe);
# composer_count is derived from them by counting the separated entries.
song.result['composer'].head()

9      Stephen Garrigan| Mark Prendergast| Vincent Ma...
35                                         John Stephens
39                                       Bettis| Hammond
87                                         Calvin Harris
103       Mark Hall|Matt Maher|John Mabe|Blake Bollinger
Name: composer, dtype: object