In [97]:
# Imports
import pickle as pic
import numpy as np
import seaborn as sb
import nltk, string
import pandas as pd
from gensim import corpora, models, similarities
from collections import defaultdict
from nltk.corpus import stopwords
import gensim


### Prepare Data (BOW Model)

In [105]:
# Load Datasets
songs = pic.load( open("./Data/songs.p", "rb"))
referents = pic.load( open("./Data/referents.p", "rb"))
annotations = pic.load( open("./Data/annotations.p", "rb"))

In [106]:
annotations = annotations.reset_index()

In [118]:
# For each referent, find the corresponding annotation and merge them
print referents.shape
ann_merge = annotations[['index', 'ref_id', 'ann_text']]
ann_merge.rename(columns={'index': 'annotation_index'}, inplace=True)
ref_ann = pd.merge(referents, ann_merge, how='left',
                   left_on='id', right_on='ref_id')
ref_ann.dropna(inplace=True)
print ref_ann.shape
ref_ann.head()

(98992, 6)
(98928, 9)


Unnamed: 0,id,song_id,classification,fragment,is_description,annotator_id,annotation_index,ref_id,ann_text
0,4961787,156640,accepted,[Part I: 0 to 100],False,605899,0.0,4961787.0,This song was allegedly supposed to be Diddys ...
1,3274596,156640,accepted,"[Produced by Boi-1da, Frank Dukes, Noah ""40"" S...",False,104344,1.0,3274596.0,https://twitter.com/Boi1da/status/473262859418...
2,3272685,156640,accepted,"Maybe I'm searchin' for the problems, askin' w...",False,58812,2.0,3272685.0,"Like he says in Think Good, Drake is constantl..."
3,3272333,156640,accepted,"The other night, Lavish Lee told me that I'm a...",False,658401,3.0,3272333.0,Lavish Lee is the best friend of Melissa Shay ...
4,3272181,156640,accepted,[Bridge]\n Whole squad on that real shit\n Who...,False,18490,4.0,3272181.0,Drakes only squads are OVO (and TOPSZN lowkey)...


In [119]:
# Get Text and Index
ann_text = ref_ann['ann_text'].values
ref_text = ref_ann['fragment'].values
ann_index = ref_ann['annotation_index'].values

In [120]:
# Define normalizer
stemmer = nltk.stem.porter.PorterStemmer()

stop = set(stopwords.words('english'))

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens if item not in stop]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(
            text.lower().translate(None, string.punctuation)))

In [121]:
# Normalize our annotations and Referent Text
ann_text_bow = map(normalize, ann_text)
ref_text_bow = map(normalize, ref_text)

In [122]:
# Get frequency dictionary
frequency = defaultdict(int)
for text in ann_text_bow:
    for token in text:
        frequency[token] += 1

In [123]:
# Create Dictionary with our annotations
ann_text_mult = [[token for token in text if frequency[token] > 1] 
         for text in ann_text_bow]

dictionary = corpora.Dictionary(ann_text_mult)
dictionary.num_docs

98928

In [124]:
# Print Number of Words in the Dictionary
len(dictionary.values())

27552

In [125]:
# Create our corpus
corpus_bow = [dictionary.doc2bow(text) for text in ann_text_mult]

In [129]:
# Print first five sparse vectors in the corpus
print corpus_bow[:5]
print len(corpus_bow), len(ann_index)

[[(0, 2), (1, 5), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 2), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1)], [(0, 1), (5, 1), (7, 1), (26, 1), (27, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 2), (55, 1), (56, 1)], [(35, 2), (38, 2), (57, 1), (58, 1), (59, 1), (60, 1), (61, 2), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1)], [(30, 1), (38, 3), (58, 1), (68, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 3), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 2), (94, 1), (95, 2), (96, 1), (97, 1), (98, 1)], [(38, 1), (99,

## TFIDF and Similarity Calculation

In [37]:
# Tf-Idf our corpus
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [78]:
# Print first five documents in our LSI corpus
for doc in corpus_tfidf[:5]:
    print doc

[(0, 0.29669811169033694), (1, 0.560592588135255), (2, 0.11718476190786585), (3, 0.08586748495348616), (4, 0.09624514336534767), (5, 0.0583116301708491), (6, 0.1270862129940806), (7, 0.2909223590347693), (8, 0.1104633742542093), (9, 0.09077523620896319), (10, 0.08881103476495539), (11, 0.06936932203487713), (12, 0.06798536456860937), (13, 0.08273502417983959), (14, 0.045852794072612675), (15, 0.12570170890726629), (16, 0.18191608105754248), (17, 0.07349411353976541), (18, 0.10009791437248157), (19, 0.08590458222000975), (20, 0.1790282047297587), (21, 0.09647222630900462), (22, 0.16032165232065493), (23, 0.055125131018004765), (24, 0.08579351007563972), (25, 0.0782159470839777), (26, 0.07616467456968423), (27, 0.23207483376624863), (28, 0.10115708108438375), (29, 0.12595046444972957), (30, 0.08794248051961315), (31, 0.11851252982798778), (32, 0.1356255777554609), (33, 0.10759518695973828), (34, 0.1295122849247989), (35, 0.03728700313686689), (36, 0.09072713833023852), (37, 0.11622382751

In [84]:
# Create index for querying
index_tfidf = similarities.MatrixSimilarity(corpus_tfidf)

In [93]:
# Create TDIDF representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_tfidf = tfidf[doc_bow]

# Query our index for closely related documents
sims_tfidf = index_tfidf[doc_tfidf]
sims_tfidf = sorted(enumerate(sims_tfidf), 
                    key=lambda item: item[1], reverse=True)
sims_tfidf = np.array(sims_tfidf)

In [151]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_tfidf[:, 0] == index)[0]

array([18213,     0])

In [159]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]

    # Query our index for closely related documents
    sims_tfidf = index_tfidf[doc_tfidf]
    sims_tfidf = sorted(enumerate(sims_tfidf), 
                        key=lambda item: item[1], reverse=True)
    sims_tfidf = np.array(sims_tfidf)
    correct_position = np.where(sims_tfidf[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[2091]
[19]
[8559]
[2]
[40]


## Latent Semantic Analysis and Similarity Calcuation

In [175]:
# LSI our Tf-idf corpus
lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsa = lsa[corpus_tfidf]

In [176]:
# Print first ten documents in our LSI corpus
for doc in corpus_lsa[:5]:
    print doc

[(0, 0.11817592009507155), (1, 0.043670656922342309), (2, -0.14992542065700362), (3, -0.023131955586047574), (4, 0.0082964461991998188), (5, -0.0036412197581230563), (6, 0.0052674913880049476), (7, 0.025678545319327541), (8, -0.003650315236654838), (9, 0.0050921255380231801)]
[(0, 0.041115382283709485), (1, 0.0459444092820957), (2, -0.01277981635249471), (3, 0.0096576985185838193), (4, -0.0010830974269517796), (5, 0.010359230375508023), (6, 0.0043806227706480547), (7, -0.0031039945281084191), (8, 0.006223285692291719), (9, 0.016255697025536305)]
[(0, 0.17895704014220509), (1, -0.00018982998432079748), (2, -0.21609053888732968), (3, -0.047646168829888272), (4, 0.052260601211250153), (5, 0.0017649047671909641), (6, -0.010475616305728644), (7, -0.00059897264687310274), (8, 0.0055253356560165624), (9, 0.0085105921099546487)]
[(0, 0.12770153041292825), (1, -0.05397949383748999), (2, -0.19226011773138613), (3, -0.054172426540956664), (4, 0.044355091765307511), (5, -0.026641966038961379), (6,

In [177]:
# Create index for querying
index_lsa = similarities.MatrixSimilarity(corpus_lsa)

In [178]:
# Create LSA representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_tfidf = tfidf[doc_bow]
doc_lsa = lsa[doc_tfidf]

# Query our index for closely related documents
sims_lsa = index_lsa[doc_lsa]
sims_lsa = sorted(enumerate(sims_lsa), 
                    key=lambda item: item[1], reverse=True)
sims_lsa = np.array(sims_lsa)

In [179]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

array([63523])

In [180]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    doc_lsa = lsa[doc_tfidf]

    # Query our index for closely related documents
    sims_lsa = index_lsa[doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[22915]
[7296]
[63523]
[53994]
[98836]


In [None]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

In [None]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    doc_lsa = lsa[doc_tfidf]

    # Query our index for closely related documents
    sims_lsa = index_lsa[doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

## Latent Dirichlet Allocation and Similarity Calculation

In [184]:
# LDA our BOW corpus
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
corpus_lda = lda[corpus]

In [185]:
# Print first ten documents in our LDA corpus
for i in corpus_lda[:5]:
    print i

[(1, 0.13106263717234684), (2, 0.70330932327740769), (4, 0.15845293007370048)]
[(0, 0.010815749492938528), (1, 0.12333716806465662), (2, 0.78145765402605971), (3, 0.073827878405082495), (4, 0.0105615500112626)]
[(2, 0.36012623594661347), (4, 0.61327365359313557)]
[(0, 0.046939960071494792), (2, 0.14330507832413031), (4, 0.79908502909962764)]
[(0, 0.67258756719662194), (1, 0.029010811905071092), (2, 0.24084909912011762), (3, 0.028814185610665444), (4, 0.028738336167523849)]


In [186]:
# Create index for querying
index_lda = similarities.MatrixSimilarity(corpus_lda)

In [187]:
# Create LDA representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_lda = lda[doc_bow]

# Query our index for closely related documents
sims_lda = index_lda[doc_lda]
sims_lda = sorted(enumerate(sims_lda), 
                    key=lambda item: item[1], reverse=True)
sims_lda = np.array(sims_lda)

In [188]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

array([72390])

In [189]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_lda = lda[doc_bow]

    # Query our index for closely related documents
    sims_lda = index_lsa[doc_lda]
    sims_lda = sorted(enumerate(sims_lda), 
                        key=lambda item: item[1], reverse=True)
    sims_lda = np.array(sims_lsa)
    correct_position = np.where(sims_lda[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[82307]
[71095]
[72390]
[83366]
[98836]


In [195]:
print ref_text[4]
for i in sims_lda[:5,0]:
    print ann_text[i] + "\n\n---\n\n"

[Bridge]
 Whole squad on that real shit
 Whole squad on that real shit
 Whole squad on that real shit


---


Reference to Redmans hit Tonights Da Night

ACCEPTED COMMENT: https://www.youtube.com/watch?v=G6LVIi7pzZI

---


Lil Yachty- u trippin

---


Em is referring to the same Van Dyke Avenue in Yellow Brick Road from his Encore album.
This is also a play on words, knowing how much Eminem loves disrespecting women and talking about lesbians, which is shadowing his urge to want to call the girl hes with a dyke (lesbian).

Van Dyke Avenue is a street/road in Detroit crossing 8 Mile Road.
Its quite long, too:
Van Dyke

---


Ready to Die

---




## Hierarchical Dirichlet Process and Similarity Calculation

In [48]:
# HDP our BOW corpus
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]