In [2]:
import numpy as np
import pandas as pd
import psycopg2 as psy
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
import pickle as pic
import nltk
import string
from gensim import corpora, models, similarities
from gensim.models.coherencemodel import CoherenceModel
import gensim
from nltk.corpus import stopwords
import time

%matplotlib inline



In [31]:
# Open the local lyrics database and keep a cursor on the connection
conn = psy.connect(dsn="dbname=test")
cur = conn.cursor()

# Read each table in full into its own DataFrame
songs = pd.read_sql_query(sql="SELECT * FROM songs;", con=conn)
refs = pd.read_sql_query(sql="SELECT * FROM referents;", con=conn)
anns = pd.read_sql_query(sql="SELECT * FROM annotations;", con=conn)

In [32]:
# Songs DataFrame Info
print(songs.shape)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print only appended a stray "None" line to the output.
songs.info()
songs.head()

(12511, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12511 entries, 0 to 12510
Data columns (total 8 columns):
song_id                   12511 non-null int64
hot                       12511 non-null bool
unreviewed_annotations    12511 non-null int64
title                     12511 non-null object
full_title                12511 non-null object
artist                    12511 non-null object
artist_id                 12511 non-null int64
annotation_count          12511 non-null int64
dtypes: bool(1), int64(4), object(3)
memory usage: 696.5+ KB
None


Unnamed: 0,song_id,hot,unreviewed_annotations,title,full_title,artist,artist_id,annotation_count
0,2890384,False,0,0-100,0-100 by John Nonny,John Nonny,1020738,0
1,156640,False,0,0 to 100 / The Catch Up,0 to 100 / The Catch Up by Drake,Drake,130,41
2,2136824,False,0,100,100 by The Game (Ft. Drake),The Game,42,43
3,2688225,False,0,100it Racks,"100it Racks by DJ Esco (Ft. 2 Chainz, Drake & ...",DJ Esco,49470,50
4,703738,False,2,10 Bands,10 Bands by Drake,Drake,130,30


In [33]:
# Referent DataFrame Info
print(refs.shape)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print only appended a stray "None" line to the output.
refs.info()
refs.head()

(163483, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163483 entries, 0 to 163482
Data columns (total 6 columns):
id                163483 non-null int64
song_id           163483 non-null int64
classification    163483 non-null object
fragment          163461 non-null object
is_description    163483 non-null bool
annotator_id      163483 non-null int64
dtypes: bool(1), int64(3), object(2)
memory usage: 6.4+ MB
None


Unnamed: 0,id,song_id,classification,fragment,is_description,annotator_id
0,4961787,156640,accepted,[Part I: 0 to 100],False,605899
1,3274596,156640,accepted,"[Produced by Boi-1da, Frank Dukes, Noah ""40"" S...",False,104344
2,3272685,156640,accepted,"Maybe I'm searchin' for the problems, askin' w...",False,58812
3,3272333,156640,accepted,"The other night, Lavish Lee told me that I'm a...",False,658401
4,3272181,156640,accepted,[Bridge]\n Whole squad on that real shit\n Who...,False,18490


In [34]:
# Annotation DataFrame Info
print(anns.shape)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print only appended a stray "None" line to the output.
anns.info()
anns.head()

(163401, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163401 entries, 0 to 163400
Data columns (total 12 columns):
id               163401 non-null int64
song_id          163401 non-null int64
ref_id           163401 non-null int64
ann_text         163401 non-null object
verified         163401 non-null bool
cosigned_by      163401 non-null object
has_voters       163401 non-null bool
state            163401 non-null object
community        163401 non-null bool
pinned           163401 non-null bool
comment_count    163401 non-null int64
votes_total      163401 non-null int64
dtypes: bool(4), int64(5), object(3)
memory usage: 10.6+ MB
None


Unnamed: 0,id,song_id,ref_id,ann_text,verified,cosigned_by,has_voters,state,community,pinned,comment_count,votes_total
0,4961787,156640,4961787,This song was allegedly supposed to be Diddy’s...,False,{},True,accepted,True,False,0,11
1,3274596,156640,3274596,https://twitter.com/Boi1da/status/473262859418...,False,{},True,accepted,True,False,0,56
2,3272685,156640,3272685,"Like he says in “Think Good,” Drake is constan...",False,{},True,accepted,True,False,0,19
3,3272333,156640,3272333,Lavish Lee is the best friend of Melissa Shay ...,False,{},True,accepted,True,False,0,37
4,3272181,156640,3272181,Drake’s only squads are OVO (and TOPSZN lowkey...,False,{},True,accepted,True,False,0,44


In [35]:
# Change UTF-8 to ASCII
# Define Genius Decoder Function
def genius_decoder(x):
    """Reduce a text value to plain ASCII, dropping characters with no ASCII form.

    Parameters
    ----------
    x : byte string, text string, or None
        Byte strings are interpreted as UTF-8. Text that is already decoded
        is used as is.

    Returns
    -------
    The ASCII-only text as the interpreter's native ``str`` type (a byte
    string on Python 2, a text string on Python 3). Falsy inputs such as
    ``None`` or an empty string are returned unchanged.

    Raises
    ------
    UnicodeDecodeError
        If a byte string is not valid UTF-8.
    """
    if not x:
        return x
    # Only byte strings need decoding. Calling .decode on already-decoded text
    # makes Python 2 implicitly encode it as ASCII first (UnicodeEncodeError on
    # e.g. a curly apostrophe) and is an AttributeError on Python 3.
    text = x.decode('utf8') if isinstance(x, bytes) else x
    ascii_bytes = text.encode('ascii', 'ignore')
    # On Python 2 `str is bytes`, so the encoded value already is a native str.
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode('ascii')

# Run the decoder over the free-text column of each table
for frame, text_col in ((refs, 'fragment'), (anns, 'ann_text')):
    frame[text_col] = frame[text_col].apply(genius_decoder)

In [36]:
# Merge Referent and Song Info (right join keeps every referent row)
ref_song = pd.merge(songs, refs, how='right', on='song_id')

# Drop Unnecessary Columns
ref_song = ref_song.drop(['hot', 'unreviewed_annotations', 'full_title', 'annotation_count'], axis=1)

# Merge in Annotations to Dataset. 'id' and 'song_id' exist on both sides, so
# pandas suffixes them with _x (referent side) and _y (annotation side).
all_data = pd.merge(ref_song, anns, how='left', left_on='id', right_on='ref_id')
print(all_data.columns)

# Drop Unnecessary Columns, then give the suffixed key columns clear names
all_data = all_data.drop(['verified', 'cosigned_by', 'state', 'community', 'pinned', 'song_id_y', 'ref_id'], axis=1)
all_data = all_data.rename(columns={"song_id_x": "song_id", "id_x": "ref_id", "id_y": "ann_id"})

# Drop rows that have a missing value in any column
all_data = all_data.dropna()
# info() prints its own report and returns None; wrapping it in print only
# appended a stray "None" line to the output.
all_data.info()

# Drop Rows where fragment length = 0 (after stripping whitespace)
all_data = all_data[all_data['fragment'].apply(lambda x: len(x.strip()) > 0)]

# Drop Rows where annotation length = 0 (after stripping whitespace)
all_data = all_data[all_data['ann_text'].apply(lambda x: len(x.strip()) > 0)]
all_data.info()

Index([u'song_id_x', u'title', u'artist', u'artist_id', u'id_x',
       u'classification', u'fragment', u'is_description', u'annotator_id',
       u'id_y', u'song_id_y', u'ref_id', u'ann_text', u'verified',
       u'cosigned_by', u'has_voters', u'state', u'community', u'pinned',
       u'comment_count', u'votes_total'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 163379 entries, 0 to 163484
Data columns (total 14 columns):
song_id           163379 non-null int64
title             163379 non-null object
artist            163379 non-null object
artist_id         163379 non-null int64
ref_id            163379 non-null int64
classification    163379 non-null object
fragment          163379 non-null object
is_description    163379 non-null bool
annotator_id      163379 non-null int64
ann_id            163379 non-null float64
ann_text          163379 non-null object
has_voters        163379 non-null object
comment_count     163379 non-null float64
votes_total     

In [37]:
# Drop rows whose fragment is a title, header, or note rather than actual lyrics
def is_lyrics(frag):
    """Tell whether a referent fragment is lyric text rather than a bracketed tag.

    A fragment that both starts with "[" and ends with "]" (for example
    "[Bridge]") is treated as a title, header, or note. Anything else counts
    as lyrics, including a fragment that opens with a bracketed tag and then
    continues with lyric lines.

    Parameters
    ----------
    frag : str
        The fragment text.

    Returns
    -------
    bool
        False for a fully bracketed fragment, True otherwise. An empty string
        is not bracketed, so it returns True; empty fragments must be
        filtered separately.
    """
    # startswith/endswith are safe on an empty string, where indexing
    # frag[0] raised IndexError.
    return not (frag.startswith("[") and frag.endswith("]"))

all_data = all_data[all_data['fragment'].apply(is_lyrics)]
# info() prints its own report and returns None; wrapping it in print only
# appended a stray "None" line to the output.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157041 entries, 2 to 163484
Data columns (total 14 columns):
song_id           157041 non-null int64
title             157041 non-null object
artist            157041 non-null object
artist_id         157041 non-null int64
ref_id            157041 non-null int64
classification    157041 non-null object
fragment          157041 non-null object
is_description    157041 non-null bool
annotator_id      157041 non-null int64
ann_id            157041 non-null float64
ann_text          157041 non-null object
has_voters        157041 non-null object
comment_count     157041 non-null float64
votes_total       157041 non-null float64
dtypes: bool(1), float64(3), int64(4), object(6)
memory usage: 16.9+ MB
None


In [38]:
# Drop data from artists whose referent count falls below the 75th percentile
# of per-artist referent counts.
# NOTE(review): the previous comment and the `first_quartile` name describe a
# 1st-quartile (25th percentile) cutoff, but the value passed to np.percentile
# is 75. The 75 is kept here so results do not change -- confirm which cutoff
# was intended.
refs_by_artist = all_data.groupby('artist_id').size()
first_quartile = np.percentile(refs_by_artist, 75)
drop_artist_ids = list(refs_by_artist[refs_by_artist < first_quartile].index)
# isin does a hashed membership test; the earlier per-row `x not in list`
# scanned the whole id list once for every row.
drop_artist_mask = ~all_data['artist_id'].isin(drop_artist_ids)
all_data = all_data[drop_artist_mask]
# Renumber rows 0..n-1 so later index-based merges line up positionally
all_data = all_data.reset_index(drop=True)
all_data.head()

Unnamed: 0,song_id,title,artist,artist_id,ref_id,classification,fragment,is_description,annotator_id,ann_id,ann_text,has_voters,comment_count,votes_total
0,156640,0 to 100 / The Catch Up,Drake,130,3272685,accepted,"Maybe I'm searchin' for the problems, askin' w...",False,58812,3272685.0,"Like he says in Think Good, Drake is constantl...",True,0.0,19.0
1,156640,0 to 100 / The Catch Up,Drake,130,3272333,accepted,"The other night, Lavish Lee told me that I'm a...",False,658401,3272333.0,Lavish Lee is the best friend of Melissa Shay ...,True,0.0,37.0
2,156640,0 to 100 / The Catch Up,Drake,130,3272181,accepted,[Bridge]\n Whole squad on that real shit\n Who...,False,18490,3272181.0,Drakes only squads are OVO (and TOPSZN lowkey)...,True,0.0,44.0
3,156640,0 to 100 / The Catch Up,Drake,130,3272064,accepted,"Fuck all that ""Drake you gotta chill"" shit\n I...",False,314816,3272064.0,Drake had caught criticism for acting hard in ...,True,1.0,43.0
4,156640,0 to 100 / The Catch Up,Drake,130,3272003,accepted,"I'm just here for the bucks and the billis, ni...",False,104344,3272003.0,These lines continue the shots between Drizzy ...,True,1.0,98.0


In [46]:
# Number of distinct songs left after filtering
all_data['song_id'].unique().shape[0]

7048

In [39]:
# Raw lyric-fragment and annotation strings as NumPy arrays
ref_text = all_data['fragment'].values
ann_text = all_data['ann_text'].values

# Pieces used by the normalizer: English stopword set and a Porter stemmer
stop = set(stopwords.words('english'))

stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens):
    """Stem every token that is not a stopword.

    Tokens found in the module-level ``stop`` set are skipped; the rest are
    passed through the module-level ``stemmer`` and returned as a list in
    their original order.
    """
    stems = []
    for token in tokens:
        if token in stop:
            continue
        stems.append(stemmer.stem(token))
    return stems

def normalize(text):
    """Convert raw text into a list of stemmed, stopword-free tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is tokenized with NLTK, and the tokens are passed
    through ``stem_tokens``.
    """
    # translate(None, chars) deletes chars; this two-argument form is the
    # Python 2 byte-string API.
    without_punctuation = text.lower().translate(None, string.punctuation)
    words = nltk.word_tokenize(without_punctuation)
    return stem_tokens(words)

# Annotations: token lists -> vocabulary -> bag-of-words corpus
ann_text_bow = np.array(map(normalize, ann_text))
ann_dictionary = corpora.Dictionary(ann_text_bow)
ann_corpus_bow = np.array(
    [ann_dictionary.doc2bow(text) for text in ann_text_bow])

# Referent fragments: same three steps with their own vocabulary
ref_text_bow = np.array(map(normalize, ref_text))
ref_dictionary = corpora.Dictionary(ref_text_bow)
ref_corpus_bow = np.array(
    [ref_dictionary.doc2bow(text) for text in ref_text_bow])

In [40]:
# Make our Annotation Model
# Single source of truth for the topic count: it sizes the model, the dense
# matrix and the generated column names.
ANN_NUM_TOPICS = 12

# NOTE(review): no random seed is passed to the model, so the learned topics
# are presumably not reproducible between runs -- confirm whether that matters.
print("Making Model...")
ann_lda = models.LdaMulticore(ann_corpus_bow, id2word=ann_dictionary, num_topics=ANN_NUM_TOPICS) # Make LDA model
print("Making Corpus...")
ann_corpus_lda = ann_lda[ann_corpus_bow] # Make LDA corpus


# Get LDA Matrix
print("Making Matrix...")
ann_corpus_lda_matrix = gensim.matutils.corpus2dense(ann_corpus_lda, num_terms=ANN_NUM_TOPICS)

# Merge Data: transpose so each row is a document and each column a topic
ann_corpus_lda_df = pd.DataFrame(ann_corpus_lda_matrix.T)
ann_corpus_lda_df.columns = ['%s_lda_%s' % ('ann', i) for i in range(1, ANN_NUM_TOPICS + 1)]
# Remove topic columns left by an earlier run of this cell, as the referent
# cell does for its own columns; otherwise a re-run merges in duplicate
# columns with _x/_y suffixes.
all_data = all_data.drop([col for col in all_data.columns if col.startswith('ann_lda')], axis=1)
all_data = pd.merge(all_data, ann_corpus_lda_df, how='inner',
                      left_index=True, right_index=True)

Making Model...
Making Corpus...
Making Matrix...


In [41]:
# Topic model over the referent (lyric fragment) corpus
n_ref_topics = 10
ref_lda = models.LdaMulticore(corpus=ref_corpus_bow, num_topics=n_ref_topics, id2word=ref_dictionary)
ref_corpus_lda = ref_lda[ref_corpus_bow]  # per-document topic weights

# Dense topic matrix for the whole corpus
ref_corpus_lda_matrix = gensim.matutils.corpus2dense(ref_corpus_lda, num_terms=n_ref_topics)

# One row per document, one named column per topic
ref_topic_columns = ['%s_lda_%s' % ('ref', i) for i in range(1, n_ref_topics + 1)]
ref_corpus_lda_df = pd.DataFrame(ref_corpus_lda_matrix.T)
ref_corpus_lda_df.columns = ref_topic_columns

# Clear topic columns from any previous run, then join on the row index
stale_ref_columns = [col for col in all_data.columns if col.startswith('ref_lda')]
all_data.drop(stale_ref_columns, axis=1, inplace=True)
all_data = pd.merge(all_data, ref_corpus_lda_df, left_index=True, right_index=True, how='inner')

In [47]:
# Pickle our data and models
# Each target is written inside a with-block so its file handle is flushed and
# closed as soon as the dump finishes; the bare open(...) calls used before
# were never closed explicitly.
pickle_targets = [
    (all_data, "../Data/all_data.p"),
    (ann_lda, "../Models/ann_lda_model.p"),
    (ref_lda, "../Models/ref_lda_model.p"),
    (ann_dictionary, "../Models/ann_dictionary.p"),
    (ref_dictionary, "../Models/ref_dictionary.p"),
    (ann_corpus_bow, "../Models/ann_corpus_bow.p"),
    (ref_corpus_bow, "../Models/ref_corpus_bow.p"),
    (ann_corpus_lda, "../Models/ann_corpus_lda.p"),
    (ref_corpus_lda, "../Models/ref_corpus_lda.p"),
]
for obj, path in pickle_targets:
    with open(path, "wb") as handle:
        pic.dump(obj, handle)