# Tratamento de dados 2

A partir de uma lista de pedalboards (`data/pedalboard-info.csv`), tentaremos obter os pedalboards.

## Lista de pedalboards

In [5]:
import pandas as pd
# Load the pedalboard listing, using its `index` column as the row index,
# then order the rows by that index.
pedalboards = pd.read_csv('data/pedalboard-info.csv', index_col='index')
pedalboards = pedalboards.sort_index()

# Preview the first rows.
pedalboards.head()

Unnamed: 0_level_0,link,title,artist,rating,has_audio,has_video,date,uploader,total_downloads
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5299,patches.php?mode=show&unit=G3&ID=5299,plexi-delay,many,4.3,False,False,2011-08-18,otter,2899
5300,patches.php?mode=show&unit=G3&ID=5300,Tom Scholz tone,Boston - Smokin,3.5,False,False,2011-08-20,Henky Backer,4744
5301,patches.php?mode=show&unit=G3&ID=5301,Very Clean,,3.0,False,False,2011-08-22,Jimis,2293
5302,patches.php?mode=show&unit=G3&ID=5302,Pitch Rising Delay,Battles,1.0,False,False,2011-08-22,Jimis,1310
5303,patches.php?mode=show&unit=G3&ID=5303,2Tone,2Tone,1.0,False,False,2011-08-22,Jimis,1568


## Total de pedalboards na lista

In [6]:
# Number of rows (pedalboards) in the listing.
len(pedalboards.index)

862

## Função para obter pedalboards

É esperado que o pedalboard tenha o formato **XML**.

Testaremos com o primeiro e o último pedalboard da lista:

In [7]:
import requests
import xmltodict
import cgi

def read_pedalboard(index, timeout=30):
    """
    Download one pedalboard from guitarpatches.com and try to parse it as XML.

    :param index: Patch ID, interpolated into the download URL.
    :param timeout: Seconds passed to ``requests.get`` as its timeout, so a
                    stalled connection fails instead of blocking forever.

    :return: dict with the keys ``index`` (the ID received), ``filename``
             (the ``filename`` parameter of the ``Content-Disposition``
             response header) and ``data`` (the ``xmltodict`` parse result,
             or the raw response bytes when the body cannot be parsed).

    :raises KeyError: when the response has no ``Content-Disposition`` header
                      or the header has no ``filename`` parameter.
    """
    url = 'http://guitarpatches.com/download.php?unit=G3&mode=download&ID={}'.format(index)
    response = requests.get(url, timeout=timeout)

    # NOTE(review): `cgi` is deprecated since Python 3.11 and removed in 3.13;
    # this header parsing needs a replacement before moving to a newer Python.
    _, params = cgi.parse_header(response.headers['Content-Disposition'])
    filename = params['filename']

    try:
        data = xmltodict.parse(response.content)
    except Exception:
        # Best effort: anything that is not parseable XML is handed back as
        # raw bytes. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        data = response.content

    return {
        'index': index,
        'filename': filename,
        'data': data
    }

In [8]:
# Download two samples: the pedalboards at the first and at the last position
# of the `pedalboards` index.
pedalboard_g3v1 = read_pedalboard(pedalboards.index[0])
pedalboard_g3v2 = read_pedalboard(pedalboards.index[-1])

# Uncomment either line to inspect the downloaded structure.
#pedalboard_g3v1
#pedalboard_g3v2

## Extrair informações importantes do modelo de dados

Inicialmente queremos somente o nome do pedalboard e o índice dos plugins de áudio utilizados neste pedalboard

In [9]:
def extract_data(pedalboard):
    """
    Dispatch a downloaded pedalboard to the extractor matching its file type.

    The type is the text after the last dot of `pedalboard['filename']`:
    `g3p`/`g3xp` are single patches, `g3a`/`g3xa` are patch sets and `zip`
    is an archive. The `g3` types carry 3 plugin slots, the `g3x` types 6.

    Raises `Exception('Unknown format: ...')` for any other extension.
    """
    index = pedalboard['index']
    extension = pedalboard['filename'].rsplit('.', 1)[-1]

    if extension == 'zip':
        return extract_data_zip(index, pedalboard['data'])

    # Number of plugin slots for each supported patch extension.
    slots = {'g3p': 3, 'g3a': 3, 'g3xp': 6, 'g3xa': 6}
    if extension not in slots:
        raise Exception('Unknown format: {}'.format(extension))

    total = slots[extension]
    if extension.endswith('p'):
        # Single patch: the payload sits under the `PatchData` key.
        return extract_data_g3pn(index, pedalboard['data']['PatchData'], total)

    # Patch set: the extractor walks the whole document itself.
    return extract_data_g3an(index, pedalboard['data'], total)

def extract_data_g3pn(index, pedalboard, total):
    """
    Build the row for one patch that holds `total` audio plugin slots.

    The row is `[index, Name, Prm1 of Module0, ..., Prm1 of Module{total-1}]`
    and is returned wrapped in a one-element list.
    """
    row = [index, pedalboard['Name']]
    row.extend(
        pedalboard['Module{}'.format(slot)]['Prm1']
        for slot in range(total)
    )

    return [row]

def extract_data_g3an(index, pedalboards, total):
    """
    Extract one row per patch from a patch set with `total` plugin slots each.

    :param index: ID stored as the first element of every row.
    :param pedalboards: Parsed document; the patches are read from
                        `['PatchSet']['Patches']['PatchData']`.
    :param total: Number of plugin slots read from each patch.

    :return: list of rows, each built by `extract_data_g3pn`.
    """
    patches = pedalboards['PatchSet']['Patches']['PatchData']

    # xmltodict yields a single mapping instead of a list when an element
    # occurs only once; iterating it would walk its keys, so wrap it.
    if isinstance(patches, dict):
        patches = [patches]

    return [extract_data_g3pn(index, patch, total)[0] for patch in patches]

import zipfile
import io

def extract_data_zip(index, data):
    """
    Extract the rows of every patch file stored in a zip archive.

    :param index: ID stored as the first element of every row.
    :param data: Raw bytes of the zip archive.

    :return: list with the rows of all members, in archive order.

    Each member is parsed with `xmltodict` and dispatched through
    `extract_data` by its name, so a member with an unsupported extension
    or unparseable content raises and aborts the whole archive.
    """
    result = []

    # The context manager closes the archive even when a member fails.
    with zipfile.ZipFile(io.BytesIO(data)) as zipped:
        for name in zipped.namelist():
            # Directory entries hold no patch data; skipping them keeps an
            # archive with folders from failing as an unknown format.
            if name.endswith('/'):
                continue

            result += extract_data({
                'index': index,
                'data': xmltodict.parse(zipped.read(name)),
                'filename': name
            })

    return result

# Rows extracted from the two sample downloads, in download order.
[row for sample in (pedalboard_g3v1, pedalboard_g3v2) for row in extract_data(sample)]

[[5299, 'plexi*dly*', '74', '102', '55'],
 [10872, 'Bass******', '30', '37', '94', '24', '107', '107']]

## Processar toda a lista de pedalboards

Processaremos todos os pedalboards listados. Serão geradas duas listas:

 * `data`: Plugins processados
 * `errors`: [index, Causa do erro]

Erros podem ocorrer por:
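 * Arquivos compactados em formato não suportado (ex.: `rar`).
 * Arquivos em formato desconhecido (ex.: `jpg`, `g5p`, `g5a`, `xml`).
 * Respostas sem o cabeçalho `Content-Disposition`.
 * Arquivos vazios ou com XML inválido.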

In [10]:
def process_mode_one(return_errors=False):
    """
    Download and extract every pedalboard of `pedalboards`, one at a time.

    Progress is printed per pedalboard: its position and ID, followed by the
    exception text when the download or the extraction fails.

    :param return_errors: When true, the collected failures are returned
                          as well; the default keeps the original
                          single-list return value.

    :return: the list of extracted rows, or the tuple `(rows, errors)` when
             `return_errors` is true, where `errors` is a list of
             `(index, exception)` pairs.
    """
    data = []
    errors = []

    for i, index in enumerate(pedalboards.index):
        print(i, index, end='')

        try:
            pedalboard = read_pedalboard(index)
            data += extract_data(pedalboard)
            print()
        except Exception as e:
            # Record the failure and move on to the next pedalboard.
            errors.append((index, e))
            print('', e)

    if return_errors:
        return data, errors

    return data


In [11]:
import multiprocessing

# Number of worker processes used by the pool below. The value is fixed: a
# `multiprocessing.cpu_count()` lookup would be overwritten by it anyway.
# NOTE(review): presumably set above the core count because the workers
# mostly wait on HTTP downloads - confirm.
cpus = 32

def process(index):
    """
    Download and extract a single pedalboard; meant to run in a pool worker.

    `index` is a `(position, pedalboard id)` pair. Both values are printed
    as a progress trace.

    Returns `(rows, None)` on success and `([], (pedalboard id, exception))`
    when the download or the extraction raises; the failure is also printed.
    """
    position, pedalboard_id = index
    print(position, pedalboard_id)

    try:
        rows = extract_data(read_pedalboard(pedalboard_id))
    except Exception as failure:
        print(pedalboard_id, '-', failure)
        return [], (pedalboard_id, failure)

    return rows, None

# Spread the downloads over `cpus` worker processes. `map` blocks until every
# pedalboard has been processed and keeps the input order; leaving the `with`
# block then shuts the workers down instead of leaving them running.
with multiprocessing.Pool(processes=cpus) as pool:
    result = pool.map(process, enumerate(pedalboards.index))

result

0 5299
7 5347
35 5482
42 5703
70 5939
77 6075
56 5763
119 6452
112 6361
14 5360
84 6209
91 6261
21 5381
98 6280
133 6498
126 6461
105 6293
168 6730
210 6960
175 6745
161 6694
182 6776
154 6591
196 6879
63 5875
203 6934
217 6986
189 6826
49 5740
28 5435
147 6581
140 6558
36 5485
1 5300
57 5765
92 6262
85 6210
190 6827
106 6341
78 6076
197 6903
99 6281
6361 - 'content-disposition'
113 6380
64 5877
148 6582
8 5350
43 5705
15 5361
155 6592
120 6453
211 6961
29 5438
127 6462
134 6499
22 5398
218 6987
71 5940
50 5743
58 5768
204 6935
191 6833
176 6752
93 6275
2 5301
86 6230
37 5487
169 6734
114 6381
162 6696
100 6287
107 6342
79 6150
141 6573
65 5878
149 6584
183 6787
198 6929
44 5730
9 5352
16 5367
121 6454
30 5453
128 6490
212 6971
156 6610
135 6500
23 5401
72 5991
51 5751
59 5815
219 6988
94 6276
205 6955
3 5302
38 5655
170 6738
108 6344
142 6575
115 6389
87 6232
101 6288
177 6763
163 6702
192 6835
80 6168
66 5891
150 6585
184 6788
199 6930
10 5355
17 5368
45 5731
31 5475
122 6455
213 697

846 10667
859 10870
860 10871


[([[5299, 'plexi*dly*', '74', '102', '55']], None),
 ([[5300, 'HB*Boston3', '31', '102', '33']], None),
 ([[5301, '_Clean****', '24', '31', '28']], None),
 ([[5302, '_Battles**', '72', '107', '7']], None),
 ([[5303, '_2tone****', '72', '57', '96']], None),
 ([[5304, 'Numb______', '81', '57', '100']], None),
 ([[5307, '_Ethereal', '31', '109', '59']], None),
 ([[5347, 'The*Solo**', '105', '28', '19']], None),
 ([[5350, 'AC*DC*****', '28', '103', '61']], None),
 ([[5352, 'M1*Thom***', '8', '104', '28']], None),
 ([[5355, 'Slash*****', '24', '72', '102']], None),
 ([[5357, 'ACDC******', '55', '102', '61']], None),
 ([[5358, 'DearPruden', '96', '42', '55']], None),
 ([[5359, 'PaganBaby*', '100', '64', '67']], None),
 ([[5360, 'WalkOnWate', '96', '64', '67']], None),
 ([[5361, 'RambleTamb', '100', '64', '67']], None),
 ([[5367, 'Clean', '24', '19', '31']], None),
 ([[5368, 'solo*4*b*', '31', '78', '17']], None),
 ([[5371, 'Blues*****', '31', '98', '62']], None),
 ([[5372, 'Money4Noth', '28'

In [12]:
from functools import reduce

# `result` holds one (rows, error) pair per pedalboard; split the pairs into
# two parallel tuples.
data, errors = zip(*result)

# Flatten the per-pedalboard row lists in a single pass. Pairwise `a + b`
# would copy the accumulated list again on every step.
data = [row for rows in data for row in rows]

# Keep only the real failures and store each exception as its message text.
errors = [(failure[0], str(failure[1])) for failure in errors if failure is not None]

In [13]:
# Column layout: patch id, patch name and six plugin slots.
labels = ['id', 'name'] + ['plugin{}'.format(slot) for slot in range(1, 7)]

# One row per extracted patch, indexed by id and ordered by name.
# Duplicated rows are kept: `drop_duplicates` is not applied.
d = pd.DataFrame.from_records(data, columns=labels, index='id').sort_values('name')

d.head(10)

Unnamed: 0_level_0,name,plugin1,plugin2,plugin3,plugin4,plugin5,plugin6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9467,!!*Cuda',23,27,73,109,106,61
8913,'70s*V.H**,39,30,100,60,107,107
7313,'70s*V.H**,39,30,100,60,107,107
9471,'70s*V.H**,39,30,100,60,107,107
7313,'90s*V.H**,23,99,31,42,53,60
9672,*!!*Wanted,23,27,85,49,60,107
9467,*!!*Wanted,23,27,85,49,60,107
10623,**********,107,107,107,107,107,107
10623,**********,107,107,107,107,107,107
10623,**********,107,107,107,107,107,107


In [19]:
import operator
import functools

def columns_same_effect(effect, frame=None):
    """
    Flag the rows whose six plugin columns all hold the given effect.

    :param effect: Effect identifier; it is compared as `str(effect)`.
    :param frame: DataFrame with the columns `plugin1` .. `plugin6`. When
                  omitted, the notebook-level `d` is used, looked up at
                  call time.

    :return: boolean Series on the frame's index, true where every plugin
             column equals `str(effect)`.
    """
    if frame is None:
        frame = d

    columns = ['plugin{}'.format(i) for i in range(1, 7)]

    # A single vectorized comparison; no `reduce` import from another cell.
    return (frame[columns] == str(effect)).all(axis=1)

# Replace NaN in the plugin slots with '107' (noted here as the "None"
# effect). Only the plugin columns are filled, so a patch without a name
# does not end up named '107'.
d = d.fillna({'plugin{}'.format(i): '107' for i in range(1, 7)})
print(len(d), 'patches')

4408 patches


In [20]:
# Drop every row whose six plugin slots all hold one and the same effect,
# testing the effect ids 0..116 one at a time. Rows that merely repeat
# another row are not removed by this step.
for effect in range(117):
    uniform = columns_same_effect(effect)
    d = d[~uniform]

print(len(d), 'unique patches - disconsidering parameters')

3121 unique patches - disconsidering parameters


In [21]:
# Persist the filtered patch/plugin table, then preview its first ten rows.
d.to_csv('data/pedalboard-plugin.csv')
d.head(10)

Unnamed: 0_level_0,name,plugin1,plugin2,plugin3,plugin4,plugin5,plugin6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9467,!!*Cuda',23,27,73,109,106,61
8913,'70s*V.H**,39,30,100,60,107,107
7313,'70s*V.H**,39,30,100,60,107,107
9471,'70s*V.H**,39,30,100,60,107,107
7313,'90s*V.H**,23,99,31,42,53,60
9672,*!!*Wanted,23,27,85,49,60,107
9467,*!!*Wanted,23,27,85,49,60,107
9588,**********,107,107,107,27,107,31
10849,**********,107,107,107,107,107,84
9467,****EMG*81,23,27,72,101,105,107


In [22]:
# Tabulate the (id, error message) pairs in `errors`, indexed by id.
e = pd.DataFrame.from_records(errors, columns=['id', 'error'], index='id')
# Save the failures to their own CSV file.
e.to_csv('data/pedalboard-plugin-error.csv')
# Display the whole error table.
e

Unnamed: 0_level_0,error
id,Unnamed: 1_level_1
6361,'content-disposition'
7074,Unknown format: rar
7975,Unknown format: jpg
7976,Unknown format: jpg
8313,Unknown format: rar
8447,Unknown format: g5p
9416,Unknown format: g5a
9592,"no element found: line 1, column 0"
9876,Unknown format: rar
10190,Unknown format: xml
