### Import libraries

In [1]:
from collections import Counter
import pandas as pd

### Load CSV files

In [2]:
# load the awards table from CSV and display it
awards = pd.read_csv(filepath_or_buffer='../data/wikidata_awards.csv')
awards

Unnamed: 0,award_id,award_name,award_type,award_category
0,0,MOJO Awards,MOJO Awards,
1,1,Grammy Award for Best Rock Performance by a Du...,Grammy Award,Best Rock Performance by a Duo or Group with V...
2,2,MTV Europe Music Award for Best Album,MTV Europe Music Award,Best Album
3,3,Grammy Award for Best Rock Album,Grammy Award,Best Rock Album
4,4,Grammy Award for Best Dance/Electronic Album,Grammy Award,Best Dance/Electronic Album
...,...,...,...,...
188,188,Latin Grammy Award for Best Recording Package,Latin Grammy Award,Best Recording Package
189,189,First prize of the Eurovision Song Contest,First prize of the Eurovision Song Contest,
190,190,MTV Europe Music Award for Best Korean Act,MTV Europe Music Award,Best Korean Act
191,191,P3 Gull for Artist of the Year,P3 Gull,Artist of the Year


In [3]:
# load the award statements table from CSV and display it
statements = pd.read_csv(filepath_or_buffer='../data/wikidata_award_statements.csv')
statements

Unnamed: 0,artist_spotify_id,award_id,award_year
0,0L8ExT028jH3ddEcZwqJJ5,0,_
1,0L8ExT028jH3ddEcZwqJJ5,1,2006
2,0L8ExT028jH3ddEcZwqJJ5,2,2006
3,0L8ExT028jH3ddEcZwqJJ5,3,2006
4,4tZwfgrHOc3mvqYlEYSvVi,4,2008
...,...,...,...
994,0ghlgldX5Dd6720Q3qFyQB,186,2020
995,2KC9Qb60EaY0kW4eH68vr3,186,2020
996,4yxLYO2imECxGYTTV7RQKb,192,_
997,5t5FqBwTcgKTaWmfEbwQY9,186,2021


### Preprocessing
- try to split type and category with dashes
- identify non-unique awards
- assign each award a class based on frequency

In [4]:
# Split "type – category" strings on the first dash: an en dash is tried
# first, then a plain hyphen. The left part becomes the award type and the
# right part, when there is one, overwrites the award category.
# The (index, value) pairs are snapshotted before the frame is modified.
type_by_row = list(awards['award_type'].items())
for row_label, raw_type in type_by_row:
    parts = raw_type.split(' – ', maxsplit=1)
    if len(parts) == 1:
        parts = raw_type.split(' - ', maxsplit=1)
    awards.at[row_label, 'award_type'] = parts[0]
    if len(parts) > 1:
        awards.at[row_label, 'award_category'] = parts[1]

In [5]:
# distinct award types, re-indexed from 0
unique_types = awards['award_type'].drop_duplicates()
award_types = unique_types.reset_index(drop=True)

# count how many awards fall under each award type
aw_list = awards['award_type'].tolist()
freq_award = Counter(aw_list)

# keep the award types that occur more than once (first-occurrence order)
nonunique_awards = [name for name, count in freq_award.items() if count > 1]
nonunique_awards

['Grammy Award',
 'MTV Europe Music Award',
 'Academy Award',
 'Latin Grammy Award',
 'American Music Award',
 'Juno Award',
 'Americana Award',
 'Gramophone Award',
 'Spellemann Award',
 'Soul Train Music Award',
 'Billboard Music Award']

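We decided to assign each award a class: award types that occur more than once get their own class, while all the remaining (unique) awards share a single generic class.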

In [6]:
# missing categories are replaced by the '_' placeholder
awards['award_category'] = awards['award_category'].fillna('_')

# (keyword, class) pairs checked in order: the first keyword found inside the
# award type decides the class, anything unmatched becomes 'GenericAward'
class_rules = [
    ('MTV', 'MTVAward'),
    ('Grammy', 'GrammyAward'),
    ('Americana', 'AmericanaAward'),
    ('American Music', 'AmericanMusicAward'),
    ('Juno', 'JunoAward'),
    ('Spellemann', 'SpellemannAward'),
    ('Soul Train', 'SoulTrainAward'),
    ('Billboard', 'BillboardAward'),
    ('Academy', 'AcademyAward'),
    ('Gramophone', 'GramophoneAward'),
]

award_class = [
    next((label for keyword, label in class_rules if keyword in type_name), 'GenericAward')
    for type_name in awards['award_type']
]

awards['award_class'] = award_class
awards

Unnamed: 0,award_id,award_name,award_type,award_category,award_class
0,0,MOJO Awards,MOJO Awards,_,GenericAward
1,1,Grammy Award for Best Rock Performance by a Du...,Grammy Award,Best Rock Performance by a Duo or Group with V...,GrammyAward
2,2,MTV Europe Music Award for Best Album,MTV Europe Music Award,Best Album,MTVAward
3,3,Grammy Award for Best Rock Album,Grammy Award,Best Rock Album,GrammyAward
4,4,Grammy Award for Best Dance/Electronic Album,Grammy Award,Best Dance/Electronic Album,GrammyAward
...,...,...,...,...,...
188,188,Latin Grammy Award for Best Recording Package,Latin Grammy Award,Best Recording Package,GrammyAward
189,189,First prize of the Eurovision Song Contest,First prize of the Eurovision Song Contest,_,GenericAward
190,190,MTV Europe Music Award for Best Korean Act,MTV Europe Music Award,Best Korean Act,MTVAward
191,191,P3 Gull for Artist of the Year,P3 Gull,Artist of the Year,GenericAward


In [7]:
# save the processed awards table, leaving out the DataFrame index
awards.to_csv(path_or_buf='../data/wikidata_awards_processed.csv', index=False)

### Write TTL file

In [8]:
# award class names used for the TTL export: one per name stem, each
# suffixed with 'Award'; the generic class comes last
award_types = [
    f'{stem}Award'
    for stem in (
        'MTV', 'Grammy', 'Americana', 'AmericanMusic', 'Juno', 'Spellemann',
        'SoulTrain', 'Billboard', 'Academy', 'Gramophone', 'Generic',
    )
]

In [24]:
# Write the Turtle file: prefixes, one OWL class per (award class, year) pair,
# then one AllDisjointClasses axiom per award class.
# The context manager closes the file even if one of the steps below raises.
with open('../rdf/awards_classes.ttl', 'w', encoding='utf-8') as ttl_file:
    # write prefixes
    ttl_file.write('@prefix : <https://www.dei.unipd.it/db2/ontology/soundgraph#> .\n')
    ttl_file.write('@prefix owl: <http://www.w3.org/2002/07/owl#> .\n')
    ttl_file.write('@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n')
    ttl_file.write('@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n')
    ttl_file.write('@base <https://www.dei.unipd.it/db2/ontology/soundgraph#> .\n')

    # Statements whose year is the '_' placeholder cannot contribute to a year
    # range. This filter does not depend on the award class, so it is computed
    # once. Because of the placeholder the column holds text, so the years are
    # converted to numbers before min/max are taken (string comparison would be
    # lexicographic); values that are not numbers are dropped.
    dated = statements[statements['award_year'] != '_']
    dated = dated.assign(award_year=pd.to_numeric(dated['award_year'], errors='coerce'))
    dated = dated.dropna(subset=['award_year'])

    # create one class per year for each award_type
    disjoint_classes = [[] for _ in award_types]
    for disjoint_group, award_type in zip(disjoint_classes, award_types):
        class_award_ids = awards.loc[awards['award_class'] == award_type, 'award_id']
        with_year = dated[dated['award_id'].isin(class_award_ids)]

        # an award class without any dated statement has no year range to emit
        if with_year.empty:
            continue

        min_year = int(with_year['award_year'].min())
        max_year = int(with_year['award_year'].max())

        for award_year in range(min_year, max_year + 1):
            disjoint_group.append(f':{award_type}{award_year}')
            ttl_file.write('# class %s%d\n' % (award_type, award_year))
            ttl_file.write(':%s%d rdf:type owl:Class ;\n\towl:equivalentClass [\n\t\trdf:type owl:Restriction ;\n\t\towl:onProperty :awardYear ;\n\t\towl:hasValue %d\n\t] ;\n\trdfs:subClassOf :%s .\n\n'
                           % (award_type, award_year, award_year, award_type))

    # write axioms to make classes disjoint
    ttl_file.write('\n# axioms\n')
    for disjoint_group in disjoint_classes:
        # OWL 2 requires at least two members in a disjoint-classes axiom
        if len(disjoint_group) < 2:
            continue
        ttl_file.write('[ rdf:type owl:AllDisjointClasses ;\n\towl:members (\n\t\t%s\n\t)\n] .\n\n' % '\n\t\t'.join(disjoint_group))