# Main notebook

In [42]:
import pandas as pd
import numpy as np
from ast import literal_eval as parse_tuple

In [16]:
offensiveness_ratings = pd.read_pickle("../pickles/offensiveness_rating_structured")

In [25]:
import numpy as np
import pandas as pd

In [26]:
word_table = pd.read_pickle("../pickles/word_table_cleaned.pickle")
word_table.head()

import sqlite3
conn = sqlite3.connect("../datasets/mxm_dataset.db")

cursor = conn.cursor()
cursor.execute("SELECT track_id, word, count FROM lyrics ORDER BY track_id;")
track_word_count = cursor.fetchall()
cursor.close()
track_word_count[:5]

[('TRAAAAV128F421A322', 'i', 6),
 ('TRAAAAV128F421A322', 'the', 4),
 ('TRAAAAV128F421A322', 'you', 2),
 ('TRAAAAV128F421A322', 'to', 2),
 ('TRAAAAV128F421A322', 'and', 5)]

In [27]:
sqldb_frame = pd.DataFrame(track_word_count, columns=["track_id", "word", "count"])
del track_word_count
joint = sqldb_frame.join(word_table, on="word", how="left", lsuffix='_caller', rsuffix='_other')
print(joint.shape)
joint_indexed = joint.set_index(["track_id", "category", "strength", "target"])

(19045332, 6)


In [28]:
joint_indexed_filtered = joint_indexed["count"]
joint_indexed_filtered.head()

track_id            category  strength  target
TRAAAAV128F421A322  NaN       NaN       NaN       6
                                        NaN       4
                                        NaN       2
                                        NaN       2
                                        NaN       5
Name: count, dtype: int64

In [29]:
joint_aggregated = joint_indexed_filtered.agg("sum")

In [32]:
joint_indexed_filtered

track_id            category  strength  target
TRAAAAV128F421A322  NaN       NaN       NaN       6
                                        NaN       4
                                        NaN       2
                                        NaN       2
                                        NaN       5
                                        NaN       3
                                        NaN       1
                                        NaN       1
                                        NaN       1
                                        NaN       2
                                        NaN       3
                                        NaN       1
                                        NaN       1
                                        NaN       2
                                        NaN       2
                                        NaN       2
                                        NaN       2
                                        NaN       4
                 

In [35]:
joint_indexed_filtered

track_id            category  strength  target
TRAAAAV128F421A322  NaN       NaN       NaN       6
                                        NaN       4
                                        NaN       2
                                        NaN       2
                                        NaN       5
Name: count, dtype: int64

In [44]:
def process_tuple(t):
    
    def process_tuple_elem(elem):
        if elem == "nan":
            return np.nan
        else:
            return elem[1:-1]
    return list(map(process_tuple_elem, t[1:-1].replace(" ", "").split(",")))

In [45]:
process_tuple("('TRAAAAV128F421A322', 'non-discriminatory', 'mild', nan)")

['TRAAAAV128F421A322', 'non-discriminatory', 'mild', nan]

In [47]:
offensiveness_rating = pd.read_pickle("../pickles/offensiveness_rating_structured")

In [48]:
index_offensiveness = list(map(process_tuple, offensiveness_rating.index.tolist()))

In [50]:
len(index_offensiveness)

309633

In [54]:
value_offensiveness = list(offensiveness_rating.values)

In [57]:
offensiveness_agg = pd.DataFrame(columns=)

In [62]:
data_non_flat = list(zip(index_offensiveness, value_offensiveness))

[['TRAAAAV128F421A322', 'non-discriminatory', 'mild', nan, 1],
 ['TRAAAAV128F421A322', nan, nan, nan, 102],
 ['TRAAABD128F429CF47', nan, nan, nan, 226],
 ['TRAAAED128E0783FAB', nan, nan, nan, 421],
 ['TRAAAEF128F4273421', nan, nan, nan, 139],
 ['TRAAAEW128F42930C0', nan, nan, nan, 115],
 ['TRAAAFD128F92F423A', nan, nan, nan, 160],
 ['TRAAAGF12903CEC202', nan, nan, nan, 21],
 ['TRAAAHJ128F931194C', nan, nan, nan, 172],
 ['TRAAAHZ128E0799171',
  'discriminatory',
  'mild',
  'mentalorphysicalability',
  6],
 ['TRAAAHZ128E0799171', 'discriminatory', 'strongest', 'race', 4],
 ['TRAAAHZ128E0799171', 'non-discriminatory', 'medium', nan, 3],
 ['TRAAAHZ128E0799171', 'non-discriminatory', 'strongest', nan, 4],
 ['TRAAAHZ128E0799171', nan, nan, nan, 633],
 ['TRAAAJG128F9308A25', nan, nan, nan, 116],
 ['TRAAAOF128F429C156', nan, nan, nan, 398],
 ['TRAAARJ128F9320760', nan, nan, nan, 203],
 ['TRAAAUC128F428716F', nan, nan, nan, 161],
 ['TRAAAZF12903CCCF6B', nan, nan, nan, 349],
 ['TRAABEV12903CC53

In [70]:
columns = ["track_id", "category", "strength", "target", "value"]
track_off = pd.DataFrame.from_records([elem[0] + [elem[1]] for elem in data_non_flat], columns=columns)
track_off = track_off.set_index(["track_id", "category", "strength", "target"])
track_off.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,value
track_id,category,strength,target,Unnamed: 4_level_1
TRAAAAV128F421A322,non-discriminatory,mild,,1
TRAAAAV128F421A322,,,,102
TRAAABD128F429CF47,,,,226
TRAAAED128E0783FAB,,,,421
TRAAAEF128F4273421,,,,139


Now, we have the choice: absolute or relative count of swear words in a song ? This choice is rather difficult. But, in some way, even if a song is longer than another one, we can (for now) consider that simply having more swear words is equivalent to being more vulgar. So let's simply count the number of swear words per song:

In [71]:
test_track_frame = track_off.loc["TRAAAAV128F421A322"]

In [76]:
test_track_frame.reset_index().dropna(thresh = 3).value.sum()

1

In [87]:
frame_track_offensiveness = pd.DataFrame(columns=["track_id", "off"])

for index, track_id in enumerate(track_off.index.levels[0]):
    if index % 1000 == 0:
        print(index)
    curr_frame = track_off.loc[track_id]
    sum_swear_words = curr_frame.reset_index().dropna(thresh = 3).value.sum()
    frame_track_offensiveness.loc[frame_track_offensiveness.shape[0]] = [track_id, sum_swear_words]

0
1000
2000
3000
4000
5000
6000
7000
8000


KeyboardInterrupt: 

In [83]:
track_off.index.levels[0]

Index(['TRAAAAV128F421A322', 'TRAAABD128F429CF47', 'TRAAAED128E0783FAB',
       'TRAAAEF128F4273421', 'TRAAAEW128F42930C0', 'TRAAAFD128F92F423A',
       'TRAAAGF12903CEC202', 'TRAAAHJ128F931194C', 'TRAAAHZ128E0799171',
       'TRAAAJG128F9308A25',
       ...
       'TRZZZOW128F4248475', 'TRZZZQO128E078864C', 'TRZZZRJ128F42819AF',
       'TRZZZUK128F92E3C60', 'TRZZZWS128F429CF87', 'TRZZZXA128F428ED56',
       'TRZZZXV128F4289747', 'TRZZZYV128F92E996D', 'TRZZZYX128F92D32C6',
       'TRZZZZD128F4236844'],
      dtype='object', name='track_id', length=237662)