## Parallelizing the computation
The loop-based approach to process each track id individually is taking too long.
Now we will leverage the data processing capabilities of pandas.

We use a left join to attach the offensiveness descriptor from the word table to every (track id, word) row of the lyrics. Then we aggregate the results for individual songs.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the word -> offensiveness-descriptor lookup table from CSV.
word_table = pd.read_csv(filepath_or_buffer="../datasets/word_table.csv")
word_table.head(5)

Unnamed: 0,word,descriptor
0,bonk,"('non-discriminatory', 'mild', None)"
1,bukkake,"('non-discriminatory', 'strong', None)"
2,cocksucker,"('non-discriminatory', 'strong', None)"
3,dildo,"('non-discriminatory', 'strong', None)"
4,ho,"('non-discriminatory', 'strong', None)"


In [3]:
# Reduce the lookup table to a Series of descriptors keyed by the word itself,
# so it can be joined against the lyrics on the "word" column.
# The words are cast to str first so the index holds only string labels.
word_table = (
    word_table
    .assign(word=word_table["word"].astype(str))
    .set_index("word")
    ["descriptor"]
)
word_table.head()

word
bonk            ('non-discriminatory', 'mild', None)
bukkake       ('non-discriminatory', 'strong', None)
cocksucker    ('non-discriminatory', 'strong', None)
dildo         ('non-discriminatory', 'strong', None)
ho            ('non-discriminatory', 'strong', None)
Name: descriptor, dtype: object

In [5]:
import sqlite3

# Read every (track_id, word, count) row of the lyrics table into memory,
# ordered by track id. The result is a list of tuples.
conn = sqlite3.connect("../datasets/mxm_dataset.db")
try:
    cursor = conn.cursor()
    try:
        cursor.execute("SELECT track_id, word, count FROM lyrics ORDER BY track_id;")
        track_word_count = cursor.fetchall()
    finally:
        # Release the cursor even if the query fails.
        cursor.close()
finally:
    # All rows are already materialised in `track_word_count`, so the database
    # handle is no longer needed; close it instead of leaking it for the
    # lifetime of the kernel.
    conn.close()
track_word_count[:5]

[('TRAAAAV128F421A322', 'i', 6),
 ('TRAAAAV128F421A322', 'the', 4),
 ('TRAAAAV128F421A322', 'you', 2),
 ('TRAAAAV128F421A322', 'to', 2),
 ('TRAAAAV128F421A322', 'and', 5)]

In [6]:
# Wrap the raw SQL rows in a DataFrame with named columns.
sqldb_frame = pd.DataFrame(
    data=track_word_count,
    columns=["track_id", "word", "count"],
)
# The list of tuples is no longer needed once the frame exists; drop the
# reference so its memory can be reclaimed.
del track_word_count

In [7]:
# Cast the lyrics words to str, mirroring the cast applied to the word table's
# "word" column, so both sides of the join use the same key type.
sqldb_frame["word"] = sqldb_frame["word"].astype(str)

## Performing a left join to match words between lyrics and offensiveness rating

In [8]:
# Left join: keep every lyrics row and attach the descriptor of its word.
# Words that are not in the word table get NaN in the "descriptor" column.
joint = sqldb_frame.join(
    word_table,
    on="word",
    how="left",
    lsuffix='_caller',
    rsuffix='_other',
)
print(joint.shape)
joint.head()

(19045332, 4)


Unnamed: 0,track_id,word,count,descriptor
0,TRAAAAV128F421A322,i,6,
1,TRAAAAV128F421A322,the,4,
2,TRAAAAV128F421A322,you,2,
3,TRAAAAV128F421A322,to,2,
4,TRAAAAV128F421A322,and,5,


In [9]:
# Boolean mask: True for lyrics rows whose word matched an entry in the word
# table (i.e. the left join filled in a descriptor), False where it is missing.
# `notna()` tests for missing values directly instead of casting every value
# to a string and comparing it with the text "nan", which is slow on a frame
# of this size and depends on how `astype(str)` renders missing values.
obscene_indices = joint["descriptor"].notna()

In [10]:
# Sanity check: did at least one lyrics row match the word table?
obscene_indices.any()

True

In [11]:
# Number of lyrics rows that matched the word table (True counts as 1).
obscene_indices.sum()

83542

In [12]:
# Re-index the joined table by (track_id, descriptor); "word" and "count"
# remain as ordinary columns.
joint_indexed = joint.set_index(keys=["track_id", "descriptor"])
# Spot-check a single track: select all rows whose first index level equals
# this track id.
joint_indexed.loc[(("TRAADYI128E078FB38",),)]

Unnamed: 0_level_0,word,count
descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1
,i,16
,the,26
,you,6
,to,13
,and,29
,a,2
,me,3
,it,7
,not,5
,in,9


## Aggregating the data
The joined table is indexed by track id and offensiveness category. This is what we want, but there is still one row for every individual word.

Now we aggregate the items for every index. We sum the entries. This gives us the total count of words in each category.

In [19]:
# Check whether each (track_id, descriptor) pair occurs only once in the index.
# It does not (one row per word), which is why the rows are aggregated below.
joint_indexed.index.is_unique

False

In [20]:
# Keep only the "count" column; the result is a Series that still carries the
# (track_id, descriptor) index.
joint_indexed_filtered = joint_indexed["count"]
joint_indexed_filtered.head(5)

track_id            descriptor
TRAAAAV128F421A322  NaN           6
                    NaN           4
                    NaN           2
                    NaN           2
                    NaN           5
Name: count, dtype: int64

In [21]:
# Sum of "count" over ALL rows: a single scalar, not a per-track figure.
joint_aggregated = joint_indexed_filtered.sum()

In [22]:
# Collapse the per-word counts into one total per (track_id, descriptor) pair.
# Passing `str` as the grouper makes pandas call str() on every index label,
# so each (track_id, descriptor) tuple becomes a single string key such as
# "('TRAAAAV128F421A322', nan)". Rows without a descriptor therefore stay in
# the result under their own "nan" key.
# `axis=0` was the default and the keyword is deprecated in recent pandas
# releases, so it is omitted; the result is unchanged.
offensiveness_rating = joint_indexed_filtered.groupby(str).agg("sum")

In [23]:
offensiveness_rating.head()

('TRAAAAV128F421A322', "('non-discriminatory', 'mild', None)")      1
('TRAAAAV128F421A322', nan)                                       102
('TRAAABD128F429CF47', nan)                                       226
('TRAAAED128E0783FAB', nan)                                       421
('TRAAAEF128F4273421', nan)                                       139
Name: count, dtype: int64

In [24]:
# Persist the aggregated counts to disk as a pickle.
# NOTE(review): "unstructured" presumably refers to the index holding
# stringified (track_id, descriptor) tuples rather than a MultiIndex — confirm
# against whatever reads this file.
offensiveness_rating.to_pickle("../pickles/offensiveness_rating_unstructured")