In [65]:
# Imports
import pickle as pic
import numpy as np
import seaborn as sb
import nltk, string
import pandas as pd
from gensim import corpora, models, similarities
from collections import defaultdict
from nltk.corpus import stopwords
import gensim
from itertools import compress


### Prepare Data (BOW Model)

In [39]:
# Load Datasets
all_data = pic.load( open("../Data/all_data.p", "rb"))

In [40]:
# Get Text and Index
ann_text = all_data['ann_text'].values
ref_text = all_data['fragment'].values

In [41]:
# Define normalizer
stemmer = nltk.stem.porter.PorterStemmer()

stop = set(stopwords.words('english'))

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens if item not in stop]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(
            text.lower().translate(None, string.punctuation)))

In [42]:
# Normalize our annotations and Referent Text
ann_text_bow = map(normalize, ann_text)
ref_text_bow = map(normalize, ref_text)

In [43]:
# Get frequency dictionary
frequency = defaultdict(int)
for text in ann_text_bow:
    for token in text:
        frequency[token] += 1

In [44]:
# Create Dictionary with our annotations (only including words that appear more than once)
ann_text_mult = [[token for token in text if frequency[token] > 1] 
         for text in ann_text_bow]

dictionary = corpora.Dictionary(ann_text_mult)
dictionary.num_docs

103882

In [45]:
# Print Number of Words in the Dictionary
len(dictionary.values())

28029

In [46]:
# Create our corpus
corpus_bow = [dictionary.doc2bow(text) for text in ann_text_mult]

In [47]:
# Print first five sparse vectors in the corpus
print corpus_bow[:5]
print len(corpus_bow), len(ann_index)

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(1, 1), (11, 3), (13, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 3), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1)], [(11, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)], [(11, 2), (50, 1), (51, 2), (52, 3), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1)], [(2, 1), (4, 1), (10, 1), (11, 1), (13, 1), (20, 1), (71, 1), (79, 1), (80, 1), (81, 2), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 3), (96, 2), (97, 4), (98, 1), (99,

## Artist Specific Corpus
In order to limit similarity comparisons for our specified query, we need to create a dictionary of artist specific corpuses that we can compare over to subset our search space in order to make it more manageable

In [87]:
# Create dictionary for storing artist corpuses ad their associated id
artist_corpus_dict = {}

# Fill dictionary with artist specific corpuses
artist_ids = set(all_data['artist_id'])
for artist in artist_ids:
    id_mask = list(all_data['artist_id'] == artist)
    artist_text = compress(ann_text_mult, id_mask)
    artist_corpus = [dictionary.doc2bow(text) for text in artist_text]
    artist_corpus_dict[artist] = artist_corpus

In [94]:
type(artist_corpus_dict)

dict

## TFIDF and Similarity Calculation

In [48]:
# Tf-Idf our corpus
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [112]:
# Create indices for querying separated by artist_id
artist_index_dict = {}
for key, value in artist_corpus_dict.iteritems():
    artist_index_dict[key] = similarities.MatrixSimilarity(tfidf[value])

In [111]:
# Randomly Sample 100 referents to compare
ref_indices = np.random.choice(range(all_data.shape[0]), 100, replace=False)


array([ 68485,  36527,  17286,  28695,  36167, 100640,  70762,  36919,
        15968,  50658,  11274,  89938,  11646,  19443,  97614,  34232,
         9171,  92791,  82036,   3490,  99559,  29158,  84038,  11156,
        61454,  89478,   8354, 101341,  14149,  39098,   7566,   6450,
        85205,  78451,  58198,  88711,  50481,  90716,  41912,  13948,
        86181,  87978, 100315,  25885,  66009,  18327,  25822,  96994,
        28622,   1060,  17911,  41290,  98243,  35840,  65687,  16202,
        43585,  71530,  95925,  68231,  59010,  19185,  47973,  95058,
        98601,  74249,  56745,  84678,  42100,  36829,  68630, 101589,
        84759,  31210,  45311,   3810,  19319,  43131,  65031,  19732,
        97769,  56216,  55815,   9700,  68683,  42051,  95448,  37871,
        19062,  48438,  39828,  82020,  47093,   6165,  90172,  53516,
         5112,  13345,  54155,  54054])

In [16]:
# Create TDIDF representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_tfidf = tfidf[doc_bow]

# Query our index for closely related documents
sims_tfidf = index_tfidf[doc_tfidf]
sims_tfidf = sorted(enumerate(sims_tfidf), 
                    key=lambda item: item[1], reverse=True)
sims_tfidf = np.array(sims_tfidf)

In [151]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_tfidf[:, 0] == index)[0]

array([18213,     0])

In [159]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]

    # Query our index for closely related documents
    sims_tfidf = index_tfidf[doc_tfidf]
    sims_tfidf = sorted(enumerate(sims_tfidf), 
                        key=lambda item: item[1], reverse=True)
    sims_tfidf = np.array(sims_tfidf)
    correct_position = np.where(sims_tfidf[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[2091]
[19]
[8559]
[2]
[40]


## Latent Semantic Analysis and Similarity Calcuation

In [175]:
# LSI our Tf-idf corpus
lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsa = lsa[corpus_tfidf]

In [176]:
# Print first ten documents in our LSI corpus
for doc in corpus_lsa[:5]:
    print doc

[(0, 0.11817592009507155), (1, 0.043670656922342309), (2, -0.14992542065700362), (3, -0.023131955586047574), (4, 0.0082964461991998188), (5, -0.0036412197581230563), (6, 0.0052674913880049476), (7, 0.025678545319327541), (8, -0.003650315236654838), (9, 0.0050921255380231801)]
[(0, 0.041115382283709485), (1, 0.0459444092820957), (2, -0.01277981635249471), (3, 0.0096576985185838193), (4, -0.0010830974269517796), (5, 0.010359230375508023), (6, 0.0043806227706480547), (7, -0.0031039945281084191), (8, 0.006223285692291719), (9, 0.016255697025536305)]
[(0, 0.17895704014220509), (1, -0.00018982998432079748), (2, -0.21609053888732968), (3, -0.047646168829888272), (4, 0.052260601211250153), (5, 0.0017649047671909641), (6, -0.010475616305728644), (7, -0.00059897264687310274), (8, 0.0055253356560165624), (9, 0.0085105921099546487)]
[(0, 0.12770153041292825), (1, -0.05397949383748999), (2, -0.19226011773138613), (3, -0.054172426540956664), (4, 0.044355091765307511), (5, -0.026641966038961379), (6,

In [177]:
# Create index for querying
index_lsa = similarities.MatrixSimilarity(corpus_lsa)

In [178]:
# Create LSA representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_tfidf = tfidf[doc_bow]
doc_lsa = lsa[doc_tfidf]

# Query our index for closely related documents
sims_lsa = index_lsa[doc_lsa]
sims_lsa = sorted(enumerate(sims_lsa), 
                    key=lambda item: item[1], reverse=True)
sims_lsa = np.array(sims_lsa)

In [179]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

array([63523])

In [180]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    doc_lsa = lsa[doc_tfidf]

    # Query our index for closely related documents
    sims_lsa = index_lsa[doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[22915]
[7296]
[63523]
[53994]
[98836]


In [None]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

In [None]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    doc_lsa = lsa[doc_tfidf]

    # Query our index for closely related documents
    sims_lsa = index_lsa[doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

## Latent Dirichlet Allocation and Similarity Calculation

In [184]:
# LDA our BOW corpus
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
corpus_lda = lda[corpus]

In [185]:
# Print first ten documents in our LDA corpus
for i in corpus_lda[:5]:
    print i

[(1, 0.13106263717234684), (2, 0.70330932327740769), (4, 0.15845293007370048)]
[(0, 0.010815749492938528), (1, 0.12333716806465662), (2, 0.78145765402605971), (3, 0.073827878405082495), (4, 0.0105615500112626)]
[(2, 0.36012623594661347), (4, 0.61327365359313557)]
[(0, 0.046939960071494792), (2, 0.14330507832413031), (4, 0.79908502909962764)]
[(0, 0.67258756719662194), (1, 0.029010811905071092), (2, 0.24084909912011762), (3, 0.028814185610665444), (4, 0.028738336167523849)]


In [186]:
# Create index for querying
index_lda = similarities.MatrixSimilarity(corpus_lda)

In [187]:
# Create LDA representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_lda = lda[doc_bow]

# Query our index for closely related documents
sims_lda = index_lda[doc_lda]
sims_lda = sorted(enumerate(sims_lda), 
                    key=lambda item: item[1], reverse=True)
sims_lda = np.array(sims_lda)

In [188]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

array([72390])

In [189]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_lda = lda[doc_bow]

    # Query our index for closely related documents
    sims_lda = index_lsa[doc_lda]
    sims_lda = sorted(enumerate(sims_lda), 
                        key=lambda item: item[1], reverse=True)
    sims_lda = np.array(sims_lsa)
    correct_position = np.where(sims_lda[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[82307]
[71095]
[72390]
[83366]
[98836]


In [195]:
print ref_text[4]
for i in sims_lda[:5,0]:
    print ann_text[i] + "\n\n---\n\n"

[Bridge]
 Whole squad on that real shit
 Whole squad on that real shit
 Whole squad on that real shit


---


Reference to Redmans hit Tonights Da Night

ACCEPTED COMMENT: https://www.youtube.com/watch?v=G6LVIi7pzZI

---


Lil Yachty- u trippin

---


Em is referring to the same Van Dyke Avenue in Yellow Brick Road from his Encore album.
This is also a play on words, knowing how much Eminem loves disrespecting women and talking about lesbians, which is shadowing his urge to want to call the girl hes with a dyke (lesbian).

Van Dyke Avenue is a street/road in Detroit crossing 8 Mile Road.
Its quite long, too:
Van Dyke

---


Ready to Die

---




## Hierarchical Dirichlet Process and Similarity Calculation

In [48]:
# HDP our BOW corpus
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]