In [8]:
import os, os.path,sys
import csv
import collections
import subprocess
import pandas as pd
import numpy as np
import glob
from datetime import datetime as dt
from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm
from multiprocessing import Pool
import requests


# Raise the csv module's per-field size limit to sys.maxsize. The call returns
# the limit that was in effect before, which is the value this cell displays.
csv.field_size_limit(sys.maxsize)


9223372036854775807

In [9]:
# Master CSV listing every track; get_remaining_tracks() reads it with the
# column names artist_id, artist_name, track_name, count.
tracks_file = 'data/tracks/tracks.csv'

In [10]:
def get_track_info(artist,track,timeout=30):
    """Fetch a track's duration and top tags from the Last.fm web service.

    Issues a ``track.getInfo`` request and extracts the duration and the tag
    names from the JSON reply.

    Parameters
    ----------
    artist : str
        Artist name, sent as the ``artist`` query parameter.
    track : str
        Track title, sent as the ``track`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot block the caller indefinitely.

    Returns
    -------
    tuple
        ``(duration, genre)``: ``duration`` is ``response['track']['duration']``
        exactly as returned by the service, and ``genre`` is the ``name`` of
        every entry in ``response['track']['toptags']['tag']`` joined with
        commas (an empty string when the list is empty).

    Raises
    ------
    Exception
        If the HTTP status code is not 200.
    KeyError
        If the JSON body does not contain the expected ``track`` keys.
    """
    base_url = 'http://ws.audioscrobbler.com/2.0/'
    query = {
        'method': 'track.getInfo',
        # Prefer a key supplied through the environment; the literal is kept
        # only as a fallback so existing runs behave as before.
        # NOTE(review): credentials should not live in the notebook at all.
        'api_key': os.environ.get('LASTFM_API_KEY',
                                  'db3a905ecd9f0b7d0e9641c39e7527dd'),
        'format': 'json',
        'artist': artist,
        'track': track,
    }
    # Passing the values via ``params`` lets requests URL-encode them; names
    # containing '&', '#', '+', '?' or non-ASCII characters used to corrupt
    # the hand-built query string.
    resp = requests.get(base_url, params=query, timeout=timeout)
    if resp.status_code != 200:
        raise Exception('Failed')
    track_info = resp.json()['track']
    duration = track_info['duration']
    genre = ','.join(tag['name'] for tag in track_info['toptags']['tag'])
    return duration,genre

def _read_track_files(pattern, columns):
    """Read every headerless CSV matching ``pattern`` into one DataFrame.

    Files with no rows are skipped, and an empty DataFrame with ``columns``
    is returned when nothing matches, so a first run with no progress files
    does not fail.
    """
    frames = []
    for path in sorted(glob.glob(pattern)):
        try:
            frames.append(pd.read_csv(path, names=columns))
        except pd.errors.EmptyDataError:
            # A worker that completed nothing leaves an empty output file.
            continue
    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)


def _is_listed(tracks, attempted, keys):
    """Return a boolean array marking rows of ``tracks`` whose ``keys`` tuple occurs in ``attempted``."""
    if attempted.empty:
        return np.zeros(len(tracks), dtype=bool)
    wanted = pd.MultiIndex.from_frame(tracks[keys])
    seen = pd.MultiIndex.from_frame(attempted[keys])
    return np.asarray(wanted.isin(seen), dtype=bool)


def get_remaining_tracks():
    """Return the tracks that have not yet been processed.

    Reads the master list from ``tracks_file`` and removes every row whose
    (artist_name, track_name) pair already appears in a
    ``data/tracks/tracks_complete_*.csv`` or ``data/tracks/tracks_failed_*.csv``
    file.

    The pair is matched as a unit. Testing the artist and the track title
    independently would drop an unprocessed track whenever its artist appears
    in the progress files for one song and its title appears for another
    artist.

    Returns
    -------
    pandas.DataFrame
        The unprocessed subset of the master list, with the columns
        artist_id, artist_name, track_name, count and the original row index.
    """
    columns1 = ['artist_id','artist_name','track_name','count']
    columns2 = ['artist_id','artist_name','track_name','count','duration','genre']
    keys = ['artist_name','track_name']

    all_tracks = pd.read_csv(tracks_file,names=columns1)
    failed = _read_track_files('data/tracks/tracks_failed_*.csv', columns1)
    completed = _read_track_files('data/tracks/tracks_complete_*.csv', columns2)

    attempted = _is_listed(all_tracks, completed, keys) | _is_listed(all_tracks, failed, keys)
    remaining = all_tracks[~attempted]
    return remaining

In [11]:
def process_tracks(tracks,file_size,process_id):
    """Look up every track in ``tracks`` and record the outcome in CSV files.

    For each row, ``get_track_info`` is called with the row's artist and
    track name. Successful rows are appended to
    ``data/tracks/tracks_complete_<process_id>.csv`` as
    artist_id, artist_name, track_name, count, duration, genre. Rows for which
    anything raises are appended to
    ``data/tracks/tracks_failed_<process_id>.csv`` with whatever identifying
    fields were collected (normally the first four).

    Parameters
    ----------
    tracks : pandas.DataFrame
        Must expose the columns artist_id, artist_name, track_name and count.
    file_size : int
        Number of rows, used only for the progress bar total and log lines.
    process_id : int
        Identifier embedded in the output file names and log lines.
    """
    print('STARTED: Process: {0} with size: {1}'.format(process_id,file_size))
    complete_tracks_file = 'data/tracks/tracks_complete_{0}.csv'.format(process_id)
    failed_tracks_file = 'data/tracks/tracks_failed_{0}.csv'.format(process_id)

    # Mode 'a' creates the file when it is missing, so no exists() check is
    # needed. newline='' is what the csv module expects of its file objects,
    # and utf-8 matches the default encoding pandas uses to read them back.
    with open(complete_tracks_file, 'a', newline='', encoding='utf-8') as complete_fh:
        writer = csv.writer(complete_fh,quoting=csv.QUOTE_NONNUMERIC)
        for row in tqdm(tracks.itertuples(),total=file_size):
            data = []
            try:
                data.extend([row.artist_id, row.artist_name,
                             row.track_name, row.count])
                duration,genre = get_track_info(row.artist_name,row.track_name)
                writer.writerow(data + [duration, genre])
                # Flush per row so an interrupted run keeps its progress; the
                # cost is negligible next to one HTTP request per track.
                complete_fh.flush()
            except Exception:
                # Deliberately best-effort: any failure sends the track to the
                # failed file instead of aborting the batch. The file is only
                # created on the first failure.
                with open(failed_tracks_file, 'a', newline='', encoding='utf-8') as failed_fh:
                    csv.writer(failed_fh,quoting=csv.QUOTE_NONNUMERIC).writerow(data)
    print('COMPLETED: Process: {0} with size: {1}'.format(process_id,file_size))

In [12]:
def parallel_process(remaining,threads):
    """Split ``remaining`` into ``threads`` chunks and process them in parallel.

    Each chunk is handed to ``process_tracks`` in its own worker process, with
    the chunk's position (0 .. threads-1) as the process id. The call blocks
    until every worker has finished and re-raises the first exception a
    worker hit, so failures are no longer silently discarded.

    Parameters
    ----------
    remaining : pandas.DataFrame
        Tracks still to be processed.
    threads : int
        Number of chunks and worker processes; must be at least 1.
    """
    # Split row positions rather than the DataFrame itself, which gives the
    # same chunk sizes without relying on NumPy treating a DataFrame as an
    # array.
    chunks = np.array_split(np.arange(len(remaining)), threads)

    # Size the pool to the requested worker count instead of the CPU count.
    pool = Pool(processes=threads)
    try:
        pending = [
            pool.apply_async(process_tracks, [remaining.iloc[positions], len(positions), i])
            for i, positions in enumerate(chunks)
        ]
        pool.close()
        pool.join()
    finally:
        # Harmless after a clean join; on interruption it stops the workers.
        pool.terminate()

    for result in pending:
        # get() re-raises any exception raised inside the worker.
        result.get()

In [13]:
# Work out which tracks still need a lookup; the trailing expression displays
# (rows, columns) of the result.
remaining = get_remaining_tracks()
remaining.shape

(0, 4)

In [7]:
# Process the remaining tracks in 3 chunks.
# NOTE(review): this cell's execution count (7) is lower than the cells above
# it, so the saved output comes from an earlier run — re-run top to bottom.
parallel_process(remaining,3)

STARTED: Process: 0 with size: 128642


  0%|          | 1/128642 [00:00<7:04:12,  5.05it/s]

STARTED: Process: 1 with size: 128641


  0%|          | 2/128642 [00:00<6:48:40,  5.25it/s]

STARTED: Process: 2 with size: 128641


100%|██████████| 128642/128642 [4:38:54<00:00,  7.69it/s]    


COMPLETED: Process: 0 with size: 128642


100%|██████████| 128641/128641 [4:40:48<00:00,  7.64it/s]  


COMPLETED: Process: 1 with size: 128641


100%|██████████| 128641/128641 [4:43:21<00:00,  3.38it/s]


COMPLETED: Process: 2 with size: 128641
