In [14]:
# -*- coding: utf-8 -*-
import os, sys, time, random, urllib, subprocess, pickle
import pandas as pd
from eyed3 import id3
from mutagen.id3 import ID3
from mutagen.easyid3 import EasyID3
from mutagen import File

### Build out music library objects

#### Initialize path and collect all artist directories

In [None]:
path = "/Users/OGN/Music/iTunes/iTunes Music/Music/"
music_dict = dict()
music_lib = []

artist_dirs = [ os.path.join(path, i) for i in os.listdir(path) 
               if os.path.isdir(os.path.join(path, i))]

#### Create list of artist folders with their respective paths (as dicts)

In [None]:
def create_music_lib(path):
    for at_dir in os.listdir(path):
        at_dir_path = os.path.join(path, at_dir)
        if os.path.isdir(at_dir_path):
            artist_dict = {
                "ARTIST_NAME": at_dir,
                "ARTIST_PATH": at_dir_path
            }   
            yield artist_dict
            
music_lib = list(create_music_lib(path))
music_lib

#### For each artist folder, list all album subfolders

In [None]:
for artist in music_lib:
    album_dirs = [al for al in os.listdir(artist["ARTIST_PATH"])
                    if os.path.isdir(os.path.join(artist["ARTIST_PATH"], al))]
    
    artist['ALBUMS'] = []
    for album in album_dirs:
        album_dict = {
            "ALBUM_NAME": album,
            "ALBUM_PATH": os.path.join(artist['ARTIST_PATH'], album)
        }
        artist['ALBUMS'].append(album_dict)

In [None]:
pickle.dump(music_lib, open("musicLib.p" , "wb"))

In [2]:
music_lib = pickle.load(open("musicLib.p", "rb"))

##### Data structure: music_lib[artist_index] = {artist: path, albums = albums[album_index]}

In [3]:
music_lib[1]['ALBUMS'][0]['ALBUM_PATH']
music_lib[:5]

[{'ALBUMS': [{'ALBUM_NAME': 'Lost Horizons',
    'ALBUM_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Lemon Jelly/Lost Horizons'}],
  'ARTIST_NAME': 'Lemon Jelly',
  'ARTIST_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Lemon Jelly'},
 {'ALBUMS': [{'ALBUM_NAME': 'WT 21',
    'ALBUM_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Florian Kupfer/WT 21'},
   {'ALBUM_NAME': 'This Society',
    'ALBUM_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Florian Kupfer/This Society'}],
  'ARTIST_NAME': 'Florian Kupfer',
  'ARTIST_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Florian Kupfer'},
 {'ALBUMS': [{'ALBUM_NAME': 'From All Purity',
    'ALBUM_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Indian/From All Purity'}],
  'ARTIST_NAME': 'Indian',
  'ARTIST_PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Indian'},
 {'ALBUMS': [{'ALBUM_NAME': "There's a Riot Goin' On",
    'ALBUM_PATH': "/Users/OGN/Music/iTunes/iTunes Music/Music/Sly and the Family Stone/There's a Riot

#### Get album art from song file or get from internet using sacad

In [4]:
def get_album_artwork(album_name, artist_name, song_file):

    artwork_file_name = "{} - {}.jpg".format(artist_name, album_name)
    rel_artwork_path = "/images/album art/" + artwork_file_name
    abs_artwork_path = "/Users/OGN/1Projects/norecords/app/assets/images/album_images/" + artwork_file_name
      
    try:
        artwork = song_file.tags["APIC:"].data
        with open(abs_artwork_path, 'wb') as img:
            img.write(artwork)
            rel_artwork_path = "/images/album art/{} - {}.jpg" \
                .format(artist_name, album_name)
    except Exception as e:
        try:
            result = subprocess.check_output('sacad "{}" "{}" 600 "{}"' \
                                        .format(artist_name, album_name, abs_artwork_path),
                                        shell=True, stderr=subprocess.STDOUT)
        except Exception, e:
            print repr(e)
        else:
            if "Downloading" not in result:
                rel_artwork_path = 'N/A' 
                print(artist_name, album_name)
    
    return rel_artwork_path

#### Create the music dictionary

In [5]:
def create_album_dict(lib):
    albums_dict = dict()
    for artist in lib:
        artist_name = artist['ARTIST_NAME']
        for album in artist['ALBUMS']:
            album_name = album['ALBUM_NAME']
            songs = [song for song in os.listdir(album['ALBUM_PATH'])
                        if not song.startswith('.')]
            if not songs:
                continue

            try:
                song_file = File('{}/{}'.format(album['ALBUM_PATH'], songs[0]))
            except Exception as e:
                pass
            
            artwork_path = get_album_artwork(album_name, artist_name, song_file)
            
            try:
                genre = str(song_file.tags["TCON"])
                genre = genre.replace('Prog.', 'Progressive').replace('Prog ', 'Progressive ')
                genre = genre.split(',')
            except:
                genre = []

            try:
                date = str(song_file.tags["TDRC"]).split('-')[0]
            except:
                date = ''
            
            
            songs = [song.replace('.mp3', '') for song in songs]
            songs = [song.replace('.m4a', '') for song in songs]
            
            albums_dict[album_name] = {
                "NAME": album_name, "PATH": album['ALBUM_PATH'],
                "ARTIST": artist_name, "GENRE": genre, "DATE": date, 
                "SONGS": songs, "IMAGE_PATH": artwork_path
        }     
    return albums_dict

In [6]:
albums_dict = create_album_dict(music_lib)

('Wu-Tang Clan', 'Limak n Lime')
('Limak n Lime', 'Limak n Lime')
('Oum Kolthoum', 'Yalli Kan Yeshgeek Anni')
('The Daniel Pemberton TV Orchestra', 'Little Big Planet_ Little Big Music Musical (Soundtrack from Video Game)')
('Ro\xcc\x88yksopp', 'Melody A.M_')
('Geto Boys', 'Comp CD 4')
('Beastie Boys', 'Limak n Lime')
('Al-Safi', 'The Best of Al-Sahi')
('Jan Dlabal', "Jan Dlabal's Album")
('Knxwledge_', 'klemintine_taype')
('Knxwledge_', 'karma.loops.prt.1')
('Knxwledge_', "[DB]FiXXX's")
('Compilations', 'Jazz re_freshed Live - Prelude')
('Limako', 'Kids (1995) Samples')
('Limako', 'Logxecokeh')
('Limako', 'The Camsound')
('Marc-Andre\xcc\x81 Dalbavie', 'Dalbavie - Color, Violin Concerto, Ciaccona')


#### Prune unusable entries

In [7]:
for album in albums_dict.items():
    if album[1]['IMAGE_PATH'] == 'N/A':
        del albums_dict[album[0]]            

### Show the created albums dict

In [8]:
df = pd.DataFrame(albums_dict)
df = df.T
df

Unnamed: 0,ARTIST,DATE,GENRE,IMAGE_PATH,NAME,PATH,SONGS
#3,Finalzinho Chegando,2012,[Deep House],/images/album art/Finalzinho Chegando - #3.jpg,#3,/Users/OGN/Music/iTunes/iTunes Music/Music/Fin...,"[02 Pesos Teoréticos, 01 Estruturas Potencial..."
#4,Finalzinho Chegando,2012,[Deep House],/images/album art/Finalzinho Chegando - #4.jpg,#4,/Users/OGN/Music/iTunes/iTunes Music/Music/Fin...,"[02 Eu e Claudinha, Pra Sempre (Em Abell 39), ..."
'Nard,Bernard Wright,1981,[Funk],/images/album art/Bernard Wright - 'Nard.jpg,'Nard,/Users/OGN/Music/iTunes/iTunes Music/Music/Ber...,"[03 Music is the Key, 05 Just Chillin' Out, 01..."
'Round About Midnight,Miles Davis,1957,[Jazz],/images/album art/Miles Davis - 'Round About M...,'Round About Midnight,/Users/OGN/Music/iTunes/iTunes Music/Music/Mil...,"[04 Bye Bye Blackbird, 03 All of You, 05 Tadd'..."
(Pronounced 'Lĕh-'nérd 'Skin-'nérd),Lynyrd Skynyrd,1973,[Country Rock],/images/album art/Lynyrd Skynyrd - (Pronounced...,(Pronounced 'Lĕh-'nérd 'Skin-'nérd),/Users/OGN/Music/iTunes/iTunes Music/Music/Lyn...,"[03 Gimme Three Steps, 06 Mississippi Kid, 08 ..."
0181,Four Tet,2013,[Electronic],/images/album art/Four Tet - 0181.jpg,0181,/Users/OGN/Music/iTunes/iTunes Music/Music/Fou...,"[08 Track H, 06 Track F, 03 Track C, 14 Track ..."
08-09,Repeat Pattern,2011,"[Hip Hop, Instrumental]",/images/album art/Repeat Pattern - 08-09.jpg,08-09,/Users/OGN/Music/iTunes/iTunes Music/Music/Rep...,"[10 ATH, 13 BBB, 01 RLFS3, 15 MV, 04 STAB, 08 ..."
10 + 2_ 12 American Text Sound Pieces,Compilations,1975,[Experimental],/images/album art/Compilations - 10 + 2_ 12 Am...,10 + 2_ 12 American Text Sound Pieces,/Users/OGN/Music/iTunes/iTunes Music/Music/Com...,"[01 Just, 13 Crickets, 05 Give It to Me, Baby,..."
13.0.0.0.0,TTNG,2013,[Math Rock],/images/album art/TTNG - 13.0.0.0.0.jpg,13.0.0.0.0,/Users/OGN/Music/iTunes/iTunes Music/Music/TTN...,"[08 Triptych, 07 Nice Riff, Clichard, 02 Havoc..."
154,Wire,1979,[Post-Punk],/images/album art/Wire - 154.jpg,154,/Users/OGN/Music/iTunes/iTunes Music/Music/Wir...,"[02 Two People in a Room, 13 40 Versions, 09 B..."


#### Save to pickle

In [9]:
pickle.dump(albums_dict, open("albumsDict.p" , "wb"))

#### Load with pickle

In [10]:
albums_dict = pickle.load(open("albumsDict.p", "rb"))

### Collect youtube links for all the songs in the music library

In [2]:
import re, requests
from bs4 import BeautifulSoup
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser

#### Auth params

In [3]:
DEVELOPER_KEY = "AIzaSyAofeH0f3BlWm6oGiRXaVkS2jeIQVlrETE"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

In [4]:
def youtube_search(keywords, n_results=1):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY, cache_discovery=False)
    # Call the search.list method to retrieve results matching the specified
    # query term.
  
    search_response = youtube.search().list(
        q=keywords,
        type='video',
        part="id,snippet",
        maxResults=n_results
    ).execute()
    
    search_videos = []
    for search_result in search_response.get('items', []):
        link = "https://youtube.com/watch?v={}?autoplay=1".format(search_result['id']['videoId'])
        search_videos.append(link)
    return search_videos

#### Test with one song

In [5]:
youtube_search("Moacir Santos Coisa nº 2", 1)

['https://youtube.com/watch?v=mmdxIisJ6Zs?autoplay=1']

In [47]:
base = "https://www.youtube.com/results?search_query="
query = ['2Pac+all+eyez+on+me']
page = "&page="
count = 1

for q in query:
    query_url = base + str(q) + page + str(count)
    print "Scraping {}\n".format(query_url)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    r = requests.get(query_url, headers=headers)
    soup = BeautifulSoup(r.text)
    titles = soup.find_all("h3", class_="yt-lockup-title")
    print titles[0].a["href"]

Scraping https://www.youtube.com/results?search_query=2Pac+all+eyez+on+me&page=1

/watch?v=05PCmqjIeNE


In [44]:
albums_dict_list = sorted(list(albums_dict.items()))

print albums_dict_list[11]
print len(albums_dict_list)

('1984', {'IMAGE_PATH': '/images/album art/Hugh Hopper - 1984.jpg', 'ARTIST': 'Hugh Hopper', 'GENRE': ['Progressive Rock'], 'DATE': '1973', 'PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Hugh Hopper/1984', 'SONGS': ['03 Minipax II', '01 Miniluv', '05 Miniplenty', '06 Minitrue Reprise', '02 Minipax I', '04 Minitrue'], 'NAME': '1984'})
1753


In [45]:
#pickle.dump(albums_dict_list, open("albumsDictList.p", "wb"))

In [10]:
albums_dict_list = pickle.load(open("./albumsDictList.p", "rb"))

#### Create a list of random desktop agents for scraping

In [32]:
desktop_agents = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']

def random_headers():
    return {'User Agent': random.choice(desktop_agents)}

In [33]:
def create_scrape_url(artist, song):

    base = "https://www.youtube.com/results?search_query="
    song_title = song.split()
    if re.search('[0-9]+', song_title[0]):
        song_title.pop(0)
    full_title = artist.split() + song_title
    query_string = ""
    for i, word in enumerate(full_title):
        query_string += word
        if len(full_title) - i > 1:
            query_string += "+"
    return base + str(query_string) + "&page="

In [7]:
def create_search_string(artist, song):

    song_title = song.split()
    if re.search('[0-9]+', song_title[0]):
        song_title.pop(0)
    full_title = artist.split() + song_title
    search = " ".join(full_title)
    return search

In [32]:
def get_youtube_link(scrape_url, random_header):

    try:
        r = requests.get(scrape_url, headers=random_header)
    except Exception, e:
        print repr(e)
        return 'N/A'
    else:
        soup = BeautifulSoup(r.text)
        titles = soup.find_all("h3", class_="yt-lockup-title")

    try:
        youtube_link = "https://youtube.com{}?autoplay=1".format(titles[0].a["href"])
        youtube_link = youtube_link.replace('watch?v=', 'embed/')
    except Exception, e:
        print scrape_url, titles
    else:
        time.sleep(random.randint(0,2))
    finally:
        return youtube_link

In [8]:
def add_songs_dicts(albums_dict_list):
        
    for i, album in enumerate(albums_dict_list):
        print i, album[0]
        songs_dict = []
        for song in album[1]['SONGS']:
            search_terms = create_search(album[1]['ARTIST'], song)

            try:
                youtube_link = youtube_search(search_terms)
            except Exception, e:
                print 'Error:', repr(e)
                raise BaseException('Request Blocked')
            else:
                song_dict = {"TITLE": song, "YT_LINK": youtube_link}
                songs_dict.append(song_dict)
                print song_dict 
        album[1]['SONGS_DICT'] = songs_dict

In [None]:
for i, album in enumerate(albums_dict_list[::20], 0):
    print "Adding albums up to {}\n\n\n".format((i+1)*20)
    add_songs_dicts(albums_dict_list[i*20:(i+1)*20])
    pickle.dump(albums_dict_list, open("albumsDictList.p", "wb"))

Adding albums up to 20



0 #3
{'YT_LINK': [], 'TITLE': '02 Pesos Teore\xcc\x81ticos'}
{'YT_LINK': [], 'TITLE': '01 Estruturas Potencializantes, Megalo\xcc\x81polis de Desiste\xcc\x82ncias'}
1 #4
{'YT_LINK': [], 'TITLE': '02 Eu e Claudinha, Pra Sempre (Em Abell 39)'}
{'YT_LINK': ['https://youtube.com/watch?v=pds10dlkKrk?autoplay=1'], 'TITLE': '01 Volta e Sonda'}
2 'Nard
{'YT_LINK': ['https://youtube.com/watch?v=mjGfQkO--uo?autoplay=1'], 'TITLE': '03 Music is the Key'}
{'YT_LINK': ['https://youtube.com/watch?v=yZxKKuKVJfU?autoplay=1'], 'TITLE': "05 Just Chillin' Out"}
{'YT_LINK': ['https://youtube.com/watch?v=obaK2mex2mY?autoplay=1'], 'TITLE': "01 Haboglabotribin'"}
{'YT_LINK': ['https://youtube.com/watch?v=it2gZnA7xBY?autoplay=1'], 'TITLE': "04 Spinnin'"}
{'YT_LINK': ['https://youtube.com/watch?v=jR-xt-iqSlU?autoplay=1'], 'TITLE': '06 Bread Sandwiches'}
{'YT_LINK': ['https://youtube.com/watch?v=3jrORzlahBU?autoplay=1'], 'TITLE': '09 Solar'}
{'YT_LINK': ['https://youtube.com/watch?v=Yfp

In [None]:
# pickle.dump(albums_dict_list, open("albumsDictList.p", "wb"))

In [13]:
print albums_dict_list[0]

('#3', {'IMAGE_PATH': '/images/album art/Finalzinho Chegando - #3.jpg', 'ARTIST': 'Finalzinho Chegando', 'SONGS_DICT': [{'YT_LINK': [], 'TITLE': '02 Pesos Teore\xcc\x81ticos'}, {'YT_LINK': [], 'TITLE': '01 Estruturas Potencializantes, Megalo\xcc\x81polis de Desiste\xcc\x82ncias'}], 'GENRE': ['Deep House'], 'DATE': '2012', 'PATH': '/Users/OGN/Music/iTunes/iTunes Music/Music/Finalzinho Chegando/#3', 'SONGS': ['02 Pesos Teore\xcc\x81ticos', '01 Estruturas Potencializantes, Megalo\xcc\x81polis de Desiste\xcc\x82ncias'], 'NAME': '#3'})


In [None]:
albums_dict_list = pickle.load(open("./albumsDictList.p", "rb"))

In [None]:
print albums_dict_list[1683]

In [None]:
for album in albums_dict_list[0:10]:
    for song in album["SONGS_D"]:
        print album["ARTIST"], ",", album["NAME"], ",", song["TITLE"], song["YT_LINK"]
    print album


### Create PostgreSQL database

In [None]:
import psycopg2

In [None]:
conn = psycopg2.connect(dbname='norecordsdb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

In [None]:
create_table_sql = (
                'CREATE TABLE songs ('
                'song_id serial PRIMARY KEY,'
                'song_name text NOT NULL,'
                'artist text NOT NULL,'
                'album text NOT NULL,'
                'youtube_link text NOT NULL,'
                'artwork_path text NOT NULL);'
                )

In [None]:
cursor.execute(create_table_sql)

In [None]:
cursor.close()
conn.commit()

In [None]:
conn = psycopg2.connect(dbname='norecordsdb', user='gqe', host='localhost', password='')
cursor = conn.cursor()

In [None]:
for album in albums_dict_list[34:]:
    for song in album["SONGS_D"]:
        insertSQL = ("INSERT INTO songs (song_id, song_name, artist, album, youtube_link, artwork_path) VALUES (%i, '%s', '%s', '%s', '%s', '%s');" 
                     % (song["id"], song["TITLE"].replace("'", "''"), album["ARTIST"].replace("'", "''"), album["NAME"].replace("'","''"), song["YT_LINK"], album["IMAGE_PATH"].replace("'", "''")) )
        print insertSQL
        try:
            conn = psycopg2.connect(dbname='norecordsdb', user='gqe', host='localhost', password='')
            cursor = conn.cursor()
            cursor.execute(insertSQL)
            conn.commit()
        except Exception, e:
            print repr(e)
        

In [None]:
cursor.close()
conn.commit()