# 2.2 - Tratamento dos dados - Dicionário para rotulação

Será criado um dicionário de palavras com base nos pedalboards obtidos no passo anterior.
Como a quantidade de patches é baixa, é difícil obter todas as palavras relevantes apenas a partir deles; uma alternativa é complementar o dicionário com palavras de um outro banco de patches:

* http://guitarpatches.com/units.php

In [1]:
import nltk

def tokenize(phase):
    """Split a piece of text into lowercased tokens.

    Args:
        phase: Text to tokenize. The parameter name is kept as-is so that
            existing keyword callers keep working.

    Returns:
        A list with every token produced by ``nltk.word_tokenize``,
        converted to lower case.
    """
    return [word.lower() for word in nltk.word_tokenize(phase)]

## tokens

In [9]:
import pandas as pd
# Flat list that accumulates the lowercased tokens of every pedalboard
# title and description.
tokens = []

# Load the pedalboard records from a JSON file given by a relative path.
pedalboards = pd.read_json('mod-pedalboards-v2.json', orient='columns')

# Walk the 'title' and 'description' columns in lockstep; for each
# pedalboard the title tokens are appended before the description tokens.
for title, description in zip (pedalboards['title'], pedalboards['description']):
    tokens += tokenize(title) + tokenize(description)

# Bare expression so the notebook displays the collected tokens.
tokens

['basic',
 'power',
 'trio',
 'guit',
 'practical',
 'set',
 'for',
 'basic',
 'power',
 'trio',
 'gigs',
 '.',
 'clean',
 ',',
 'drive',
 ',',
 'and',
 'solo',
 'delay',
 '...',
 'crunch',
 'with',
 'clean',
 'ambience',
 'a',
 'strong',
 'crunch',
 '(',
 'ds',
 ')',
 ',',
 'with',
 'deep',
 'and',
 'clean',
 'ambience',
 '.',
 'using',
 'ds-1',
 'in',
 'parallel',
 'with',
 'chorus',
 ',',
 'delay',
 '(',
 'shiro',
 ')',
 'and',
 'shimmer',
 'in',
 'clean',
 'channel',
 '.',
 'dual',
 'channel',
 'bleepy',
 'ambience',
 'and',
 'overdriven',
 'lead',
 'this',
 'is',
 'an',
 'updated',
 'version',
 'of',
 'the',
 'board',
 'i',
 'used',
 'on',
 'the',
 'track',
 "'this",
 'is',
 'not',
 'the',
 'end',
 "'",
 ',',
 'that',
 "'s",
 'on',
 'youtube',
 '(',
 'search',
 'for',
 'steve',
 'lawson',
 '+',
 'the',
 'title',
 ')',
 '-',
 'f/s',
 '2',
 'switches',
 'between',
 'the',
 'two',
 'sides',
 'of',
 'the',
 'chain',
 ',',
 'and',
 'i',
 "'ve",
 'got',
 'the',
 'overdrive',
 ',',
 'pha

## Tagging

In [86]:
# Stemmer instance; it is not used anywhere in this cell.
# NOTE(review): possibly used by cells outside this view - confirm before removing.
lancaster = nltk.LancasterStemmer()

# Part-of-speech tag the whole token list with the "universal" tagset.
# The displayed result is a list of (word, tag) pairs.
taggs = nltk.pos_tag(tokens, tagset="universal")

# Bare expression so the notebook displays the tagged tokens.
taggs

[('basic', 'ADJ'),
 ('power', 'NOUN'),
 ('trio', 'NOUN'),
 ('guit', 'NOUN'),
 ('practical', 'ADJ'),
 ('set', 'NOUN'),
 ('for', 'ADP'),
 ('basic', 'ADJ'),
 ('power', 'NOUN'),
 ('trio', 'NOUN'),
 ('gigs', 'NOUN'),
 ('.', '.'),
 ('clean', 'ADJ'),
 (',', '.'),
 ('drive', 'NOUN'),
 (',', '.'),
 ('and', 'CONJ'),
 ('solo', 'ADJ'),
 ('delay', 'NOUN'),
 ('...', '.'),
 ('crunch', 'NOUN'),
 ('with', 'ADP'),
 ('clean', 'ADJ'),
 ('ambience', 'NOUN'),
 ('a', 'DET'),
 ('strong', 'ADJ'),
 ('crunch', 'NOUN'),
 ('(', '.'),
 ('ds', 'NOUN'),
 (')', '.'),
 (',', '.'),
 ('with', 'ADP'),
 ('deep', 'ADJ'),
 ('and', 'CONJ'),
 ('clean', 'ADJ'),
 ('ambience', 'NOUN'),
 ('.', '.'),
 ('using', 'VERB'),
 ('ds-1', 'ADJ'),
 ('in', 'ADP'),
 ('parallel', 'ADJ'),
 ('with', 'ADP'),
 ('chorus', 'NOUN'),
 (',', '.'),
 ('delay', 'NOUN'),
 ('(', '.'),
 ('shiro', 'NOUN'),
 (')', '.'),
 ('and', 'CONJ'),
 ('shimmer', 'NOUN'),
 ('in', 'ADP'),
 ('clean', 'ADJ'),
 ('channel', 'NOUN'),
 ('.', '.'),
 ('dual', 'ADJ'),
 ('channel', 'N

## Análise

In [110]:
# Count how many tokens received each tag; the word itself is ignored here.
tag_frequence_dist = nltk.FreqDist(tag for (word, tag) in taggs)

# Displayed by the notebook: (tag, count) pairs, most frequent first.
tag_frequence_dist.most_common()

# Disabled: would call the NLTK tagset help for every tag found.
# NOTE(review): the helper is named after the Penn tagset while the tags
# above come from the "universal" tagset - confirm before re-enabling.
#for common, total in tag_frequence_dist.most_common():
#    nltk.help.upenn_tagset(common)

[('NOUN', 1256),
 ('VERB', 516),
 ('ADJ', 471),
 ('.', 467),
 ('DET', 423),
 ('ADP', 376),
 ('ADV', 130),
 ('CONJ', 124),
 ('PRT', 112),
 ('PRON', 94),
 ('NUM', 84),
 ('X', 10)]

In [125]:
from collections import Counter

def details(words, tag, frequence_dist):
    """Print a short frequency summary of the words that carry one tag.

    Printed lines: the tag followed by a colon, the number of distinct
    words, the total taken from ``frequence_dist`` and the 25 most common
    words with their counts.

    Args:
        words: Iterable of ``(word, tag)`` pairs.
        tag: Tag (a string) whose words are summarised.
        frequence_dist: Mapping indexed by tag that gives the total number
            of tokens with that tag.
    """
    # Count once and reuse the result; a Counter holds exactly one entry
    # per distinct word, so its length is the number of unique words.
    word_counts = Counter(filter_words(words, tag))

    print(tag + ':')
    print(' - unique', len(word_counts))
    print(' - total', frequence_dist[tag])
    print(' - most common', word_counts.most_common(25))

def filter_words(words, tag):
    """Return, in their original order, the words whose paired tag equals ``tag``.

    Args:
        words: Iterable of ``(word, tag)`` pairs.
        tag: Tag to keep.

    Returns:
        A list with the word of every pair whose tag compares equal to ``tag``.
    """
    selected = []
    for word, word_tag in words:
        if word_tag == tag:
            selected.append(word)
    return selected

# Print the summary for three tags: nouns, verbs and adjectives.
details(taggs, 'NOUN', tag_frequence_dist)
details(taggs, 'VERB', tag_frequence_dist)
details(taggs, 'ADJ', tag_frequence_dist)

NOUN:
 - unique 568
 - total 1256
 - most common [('guitar', 41), ('mod', 28), ('sound', 23), ('i', 20), ('bass', 17), ('pedalboard', 17), ('loop', 16), ('tone', 14), ('delay', 14), ('reverb', 14), ('caps', 13), ('midi', 11), ('side', 11), ('cabinet', 11), ('sounds', 11), ('instruments', 10), ('space', 10), ('channel', 10), ('effects', 9), ('amp', 9), ('effect', 8), ('patch', 8), ('stefan', 8), ('floor', 8), ('elevator', 8)]
VERB:
 - unique 264
 - total 516
 - most common [('is', 26), ('using', 22), ('can', 11), ("'s", 9), ('made', 9), ('get', 9), ('do', 7), ('used', 7), ('has', 7), ('octave', 6), ('make', 6), ('setting', 5), ('use', 5), ('set', 5), ('playing', 5), ('looped', 5), ('have', 5), ('i', 5), ('tried', 5), ('was', 5), ('show', 5), ('added', 5), ('sounds', 5), ('ca', 4), ('looping', 4)]
ADJ:
 - unique 261
 - total 471
 - most common [('clean', 29), ('basic', 21), ('nice', 12), ('simple', 10), ('acoustic', 10), ('aka', 8), ('i', 8), ('first', 8), ('great', 7), ('big', 6), ('goo

Probabilidade de uma palavra ser escolhida, com base no seu grupo e na sua quantidade de ocorrências

In [130]:
# Displayed by the notebook: every word tagged 'NOUN' with its number of
# occurrences, most frequent first.
Counter(filter_words(taggs, 'NOUN')).most_common()
# Disabled hand-picked word list.
# NOTE(review): 'loop' appears twice in this list - confirm whether that is intended.
#manual_words = [
#    'guitar', 'bass', 'loop', 'tone', 'delay', 'reverb', 'midi', 'cabinet', 'space', 'amp', 'loop', 'clean'
#]

[('guitar', 41),
 ('mod', 28),
 ('sound', 23),
 ('i', 20),
 ('bass', 17),
 ('pedalboard', 17),
 ('loop', 16),
 ('tone', 14),
 ('delay', 14),
 ('reverb', 14),
 ('caps', 13),
 ('midi', 11),
 ('side', 11),
 ('cabinet', 11),
 ('sounds', 11),
 ('instruments', 10),
 ('space', 10),
 ('channel', 10),
 ('effects', 9),
 ('amp', 9),
 ('effect', 8),
 ('patch', 8),
 ('stefan', 8),
 ('floor', 8),
 ('elevator', 8),
 ('chorus', 8),
 ('akke', 8),
 ('cowboys', 8),
 ('people', 8),
 ('pedal', 8),
 ('olaf', 8),
 ('ambience', 7),
 ('synth', 6),
 ('distortion', 6),
 ('fuzz', 6),
 ('duo', 6),
 ('pickup', 6),
 ('setup', 6),
 ('+', 6),
 ('tube', 6),
 ('models', 5),
 ('metal', 5),
 ('gain', 5),
 ('pedals', 5),
 ('notes', 5),
 ('board', 5),
 ('pitch', 5),
 ('output', 5),
 ('signal', 5),
 ('end', 5),
 ('beat', 5),
 ('phaser', 5),
 ('volume', 5),
 ('voice', 5),
 ('melody', 5),
 ('way', 5),
 ('shimmer', 4),
 ('chain', 4),
 ('lots', 4),
 ('crunch', 4),
 ('ac30', 4),
 ('whammy', 4),
 ('sample', 4),
 ('ampvts', 4),
 ('