In [1]:
# pip install python3-discogs-client
# pip install pandas
# pip install numpy
# pip install openpyxl # convert dataframe to xlsx file

In [2]:
import discogs_client
import time
import pandas as pd
import numpy as np
import os  

# [Set up discogs client](https://python3-discogs-client.readthedocs.io/en/latest/authentication.html)
* API_KEY
* user agent
    * A User-Agent is required for Discogs API requests, as it identifies your application to the Discogs servers.
* client_id

In [3]:
API_KEY = 'YOUR API_KEY' # 
user_agent = 'discogs_project/1.0'  # A User-Agent is required for Discogs API requests, as it identifies your application to the Discogs servers.
client_id = 108475
d = discogs_client.Client(user_agent, user_token=API_KEY)

In [4]:
# 108475 is the artist id for Greg Phillinganes.
greg = d.artist(client_id)

# All the releases/masters related to Greg Phillinganes.
greg_rel = greg.releases
# The release list can be paginated, 22 pages in total.
# greg_rel_page0 = greg_rel.page(0)
# greg_rel_page1 = greg_rel.page(1)
# greg_rel_page2 = greg_rel.page(2)
# greg_rel_page3 = greg_rel.page(3)
# greg_rel_page4 = greg_rel.page(4)
# greg_rel_page5 = greg_rel.page(5)


In [5]:
def copy_data(rel_entry):
    # We need the following line to force the program to copy all the data of the release.
    # Without it, the size of the resultant dict will be only 12. This is the most painful part.
    dum = rel_entry.url
    # Set the time value to 0.95 or larger, if received "too many requests" error from server.
    time.sleep(0.9)
    return rel_entry.data

# # Each entry of the master_list is a version_list.
# # A version_list contains every version of a master release.
# # Each entry of the version_list is a dict containing everything of a version(release/album).
# # All versions -> version_list -> master_list.

In [6]:
def get_json(page):
    start_time = time.time()
    master_list_page = []
    # greg_rel is the whole list, with 21 pages.
    for cur_rel in page:
        version_list = []
        # Each release's type is either master or release
        if cur_rel.data["type"] == "master":
            for rel_entry in cur_rel.versions:
                version_list.append(copy_data(rel_entry))
        else:
            version_list.append(copy_data(cur_rel))
            
        
        print(cur_rel.title)  # comment this line if you don't need detailed release names to be printed out.
        master_list_page.append(version_list)
    end_time = time.time()
    print(end_time - start_time)
    return master_list_page


# Expected data

### Performer Metadata
Role, Category, Track Title, Track Artists,	Album, Distributing Label, Release Year, Duration, Instruments, 	Number of Featured Performers, UPC, Evidence(url), 


### Conditional Data
Version(remix, radio, edit...), Additional Credit(producer, arranger, conductor), Year of Recording	Country of Recording(blank if same release year), Country of Release, Percentage of FP share, 

### Optional Data
ISRC, Genre, Format, Catalog #, Notes

## Release Dataframe

```
Releases:
    -Release 1:
        -tracklist
            -track 1
                -extraartists
                    -greg
                -......
            -track 2
                -extraartists
                    -greg
                    -......
            -track 3
                -extraartists
                    -others
                    -......
            ...
        -id
        -year
        -...
```

* Convert master data from list of dict to dataframe. 
* Drop some useless columns. 
* Get artists name of each releases
* Get role of our client if there is such information. If there is no such information, leave the cell 'unknown'

## Note: 
1. Roles might be displayed in extraartists column of releases dataframe or hidden deeper in tracklist column. 
2. The roles in artists column are always empty, so we extract role information from the extraartists fields.  

## Track Dataframe  --> Extract credits from tracks

> The track data comes from the tracklist column in each release

> Basiclly, each release has a different number of tracks, and each track has different extra artists. The main idea is to keep all the tracks first and then drop the ones with extra artists that do not include our client (GP). 

> In the process, if the client has credits in certain tracks, we get the corresponding roles. If there are no additional artists, we keep the track as well.


## Join release data into each track

> After we get all tracks, we can combine the release data to each track based on the same release id


## Note: 
1. There are some overlapped data in the dataframe, for instance, tracklist, role, artists, extraartists. 
2. If both track_role and release_role are 'unknown' which means there is no role information found in original data.

In [7]:
def get_csv(page, pageNum):
    release_df = pd.DataFrame()
    for master in page:
        release_df = pd.concat([release_df, pd.DataFrame(master)], ignore_index=True)
        
    # drop some useless columns
    release_df = release_df.drop(['videos', 'labels', 'status','stats', 'companies', 'format', 'community', 'images', 'artists_sort'], axis = 1)
    
    # extract artists name and id from artists dictionary
    release_df.artists = [[(i['name'], i['id']) if i != '' else i for i in d ] for d in release_df.artists]
    
    # extract formats from formats dictionary
    release_df.formats = [[i['name'] for i in d ]for d in release_df.formats ]
    
    # get roles if there is such information from releases
    cleaned_release_roles = []
    for extraartists in release_df.extraartists:
        role = []
        if extraartists:
            for extraartist in extraartists:
                if extraartist['id'] == 108475:
                    role.append(extraartist['role'])
        else:
            role.append('unknown')
        cleaned_release_roles.append(role)
    release_df['release_role'] = cleaned_release_roles
    release_df['release_role'] = [';'.join(map(str, l)) for l in release_df.release_role]
    
    # extract tracks from all releases
    tracks_df = pd.DataFrame([dict(**{'release_id':rel_id}, **y) for rel_id, v in zip(release_df.id, release_df.tracklist.values) for y in v], )
    tracks_df = tracks_df.replace(np.nan,'',regex=True)
    
    # get roles if there is such information from tracks

    cleaned_roles = []
    for extraartists in tracks_df.extraartists:
        role = []
        if extraartists:
            for extraartist in extraartists:
                if extraartist['id'] == 108475:
                    role.append(extraartist['role'])
        else:
            role.append('unknown')
        cleaned_roles.append(role)
    tracks_df['track_role'] = cleaned_roles
    tracks_df['track_role'] = [';'.join(map(str, l)) for l in tracks_df.track_role]
    tracks_df = tracks_df.replace('',np.nan,regex=True)
    tracks_df = tracks_df[tracks_df['track_role'].notna()]
    
    tracks_df.extraartists = tracks_df.extraartists.replace(np.nan,'',regex=True)
    tracks_df['track_extraartists'] = [[(i['name'], i['id']) if i != '' else i for i in d ] for d in tracks_df.extraartists]
    tracks_df['track_extraartists'] = [';'.join(map(str, l)) for l in tracks_df.track_extraartists]
    
    
    tracks_df.artists = tracks_df.artists.replace(np.nan,'',regex=True)
    tracks_df['track_artists'] = [[(i['name'], i['id']) if i != '' else i for i in d ] for d in tracks_df.artists]
    tracks_df['track_artists'] = [';'.join(map(str, l)) for l in tracks_df.track_artists]
    tracks_df = tracks_df.replace('',np.nan,regex=True)
    tracks_df = tracks_df.drop(['extraartists', 'artists'], axis = 1)
    
    
    
    release_df = release_df.rename(columns={"id": "release_id"})
    release_df = release_df.drop(['extraartists'], axis=1)
    release_df = release_df.drop(['tracklist'], axis=1)
    
    
    result_df = tracks_df.join(release_df.set_index('release_id'), on='release_id', lsuffix='_track', rsuffix='_release')
    
    idx = np.unique( result_df.index.values, return_index = True )[1]
    result_df = result_df.iloc[idx]
    
    os.makedirs('output', exist_ok=True)
    result_df.to_csv('output/v3_sample_output_{1}_Page{0}.csv'.format(pageNum, greg.name)) 
    result_df.to_excel("output/v3_sample_output_{1}_Page{0}.xlsx".format(pageNum, greg.name))

# Main

In [8]:
master_list_pages = []
for i in range(greg_rel.pages):
    print('page ', i)
    page = get_json(greg_rel.page(i))
    get_csv(page, i + 1)
    
    print('page {0} Done'.format(i))

page  0
Girl Talk


KeyboardInterrupt: 