In [1]:
# Imports
import pickle as pic
import numpy as np
import seaborn as sb
import nltk, string
import pandas as pd
from gensim import corpora, models, similarities
from collections import defaultdict
from nltk.corpus import stopwords
import gensim
from itertools import compress


### Prepare Data (BOW Model)

In [2]:
# Load Datasets
all_data = pic.load( open("../Data/all_data.p", "rb"))

In [3]:
# Get Text and Index
ann_text = all_data['ann_text'].values
ref_text = all_data['fragment'].values

In [4]:
# Define normalizer
stemmer = nltk.stem.porter.PorterStemmer()

stop = set(stopwords.words('english'))

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens if item not in stop]

def normalize(text):
    return stem_tokens(nltk.word_tokenize(
            text.lower().translate(None, string.punctuation)))

In [5]:
# Normalize our annotations and Referent Text
ann_text_bow = map(normalize, ann_text)
ref_text_bow = map(normalize, ref_text)

In [6]:
# Get frequency dictionary
frequency = defaultdict(int)
for text in ann_text_bow:
    for token in text:
        frequency[token] += 1

In [7]:
# Create Dictionary with our annotations (only including words that appear more than once)
ann_text_mult = [[token for token in text if frequency[token] > 1] 
         for text in ann_text_bow]

dictionary = corpora.Dictionary(ann_text_mult)
dictionary.num_docs

103882

In [8]:
# Print Number of Words in the Dictionary
len(dictionary.values())

28029

In [9]:
# Create our corpus
corpus_bow = [dictionary.doc2bow(text) for text in ann_text_mult]

In [11]:
# Print first five sparse vectors in the corpus
print corpus_bow[:5]
print len(corpus_bow)

 [[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1)], [(1, 1), (11, 3), (13, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 3), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1)], [(11, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1)], [(11, 2), (50, 1), (51, 2), (52, 3), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 3), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1)], [(2, 1), (4, 1), (10, 1), (11, 1), (13, 1), (20, 1), (71, 1), (79, 1), (80, 1), (81, 2), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 3), (96, 2), (97, 4), (98, 1), (99

## Artist Specific Corpus
In order to limit similarity comparisons for our specified query, we need to create a dictionary of artist specific corpuses that we can compare over to subset our search space to minimzing computational expense

In [12]:
# Create dictionary for storing artist corpuses ad their associated id
artist_corpus_dict = {}

# Fill dictionary with artist specific corpuses
artist_ids = all_data['artist_id'].unique()
for artist_id in artist_ids:
    id_mask = list(all_data['artist_id'] == artist_id)
    artist_text = compress(ann_text_mult, id_mask)
    artist_corpus = [dictionary.doc2bow(text) for text in artist_text]
    artist_corpus_dict[artist_id] = artist_corpus

In [13]:
artist_corpus_dict.keys()

[132097,
 2,
 3,
 4,
 5,
 1,
 1032,
 15369,
 100874,
 13,
 1039,
 1553,
 1043,
 20,
 1557,
 22,
 11353,
 47131,
 2588,
 2077,
 34,
 348422,
 42,
 555,
 556,
 45,
 46,
 1583,
 44080,
 1073,
 23090,
 241331,
 140856,
 59,
 64,
 4161,
 67,
 68,
 69,
 72,
 73,
 74,
 76,
 77,
 1102,
 82,
 83,
 24661,
 86,
 1111,
 88,
 89,
 271,
 92,
 13157,
 344,
 685,
 103,
 105,
 108,
 110,
 111,
 112,
 143985,
 115,
 464,
 119,
 448,
 123,
 1148,
 127,
 640,
 12417,
 130,
 643,
 132,
 646,
 4231,
 535,
 1181,
 147,
 148,
 2197,
 108185,
 8346,
 156,
 157,
 158,
 3781,
 161,
 170,
 1195,
 173,
 2736,
 136370,
 179,
 1205,
 182,
 183,
 186,
 3264,
 1218,
 195,
 197,
 198,
 78535,
 72393,
 202,
 330443,
 204,
 20185,
 223,
 976,
 228,
 229,
 745,
 996586,
 747,
 748,
 778477,
 7922,
 1267,
 758,
 2295,
 249,
 1274,
 251,
 554,
 255,
 185600,
 649478,
 129,
 1581,
 25560,
 276,
 47387,
 288,
 569121,
 291,
 1319,
 810,
 812,
 301,
 303,
 1001264,
 306,
 11571,
 1333,
 2358,
 1340,
 49470,
 835,
 310,
 326,
 

## TFIDF and Similarity Calculation

In [14]:
# Tf-Idf our corpus
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [15]:
# Create indices for querying separated by artist_id
artist_index_dict = {}
for key, value in artist_corpus_dict.iteritems():
    artist_index_dict[key] = similarities.MatrixSimilarity(tfidf[value])

In [16]:
# Randomly Sample 100 referents to compare
ref_indices = np.random.choice(range(all_data.shape[0]), 100, replace=False)

In [None]:
# Get the index for the "correct" annotation for each referent
ref_artist_id = all_data.iloc[ref_indices[0], :]['artist_id']
ref_ann_id = all_data.iloc[ref_indices[0], :]['ann_id']
artist_df = all_data[all_data['artist_id'] == ref_artist_id].reset_index(drop=True)
index = artist_df[artist_df['ann_id'] == ref_ann_id].index[0]
index

In [None]:
# Create TDIDF representation of our test query (referent)
doc = ref_text_bow[ref_indices[0]]
doc_bow = dictionary.doc2bow(doc)
doc_tfidf = tfidf[doc_bow]

# Query our index for closely related documents
sims_tfidf = artist_index_dict[ref_artist_id][doc_tfidf]
sims_tfidf = sorted(enumerate(sims_tfidf), 
                    key=lambda item: item[1], reverse=True)
sims_tfidf = np.array(sims_tfidf)
sims_tfidf[4][:]

In [17]:
# Create Function for calucating TFIDF searcha accuracy
def search_accuracy_tfidf(index):
    ref_artist_id = all_data.iloc[index, :]['artist_id']
    ref_ann_id = all_data.iloc[index, :]['ann_id']
    artist_df = all_data[all_data['artist_id'] == ref_artist_id].reset_index(drop=True)
    ref_position = artist_df[artist_df['ann_id'] == ref_ann_id].index[0]
    doc = ref_text_bow[index]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    print doc_tfidf
    sims_tfidf = artist_index_dict[ref_artist_id][doc_tfidf]
    sims_tfidf = sorted(enumerate(sims_tfidf), 
                        key=lambda item: item[1], reverse=True)
    sims_tfidf = np.array(sims_tfidf)
    correct_position = np.where(sims_tfidf[:, 0] == ref_position)[0][0]
    percentile = 1 - correct_position/float(sims_tfidf.shape[0])
    return correct_position, percentile

In [18]:
scores = []
for idx in ref_indices:
    print idx
    accuracy = search_accuracy_tfidf(idx)
    scores.append(accuracy)

50888
[(4, 0.079370335248297), (27, 0.10072152860896998), (83, 0.16971580360208702), (85, 0.13779271034689036), (436, 0.1913632995573451), (438, 0.13466412582417508), (619, 0.4201503976903296), (691, 0.13061177328778079), (1285, 0.12383878842109806), (1687, 0.17263681354120147), (2672, 0.21699039284056174), (3399, 0.2061870919675847), (4430, 0.26836795546246284), (5221, 0.29565780067639197), (5913, 0.30346696750863505), (13891, 0.34082598769883476), (23706, 0.43011416632322685)]
34177
[(2, 0.07104873556955503), (290, 0.10279858974383073), (545, 0.20420345657190062), (704, 0.1339839753619566), (1823, 0.19527998582116018), (2458, 0.15587609636197367), (3786, 0.25368179109635713), (4006, 0.2222088515576143), (4270, 0.23563227570037465), (4818, 0.23116231842037896), (6176, 0.2512892090809866), (11127, 0.3202067721832947), (12254, 0.30910392407885), (14209, 0.3988119098472773), (14779, 0.26822313246098584), (19809, 0.3839190802775997)]
63751
[(59, 0.2744513850402358), (164, 0.19054772955554

In [20]:
search_accuracy_tfidf(14773)

[(263, 0.222092466594778), (593, 0.31058804801441875), (603, 0.1842142444074801), (773, 0.21790126932819534), (1147, 0.3292517581948753), (8359, 0.42421535873496063), (14691, 0.4612154931102332), (21631, 0.5212571608611428)]


IndexError: index 21631 is out of bounds for axis 1 with size 14909

## Latent Semantic Analysis and Similarity Calcuation

In [22]:
# LSI our Tf-idf corpus
lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsa = lsa[corpus_tfidf]

In [23]:
# Create indices for querying separated by artist_id
artist_lsa_index_dict = {}
for key, value in artist_corpus_dict.iteritems():
    artist_lsa_index_dict[key] = similarities.MatrixSimilarity(lsa[tfidf[value]])

In [27]:
# Create Function for calucating TFIDF searcha accuracy
def search_accuracy_lsa(index):
    ref_artist_id = all_data.iloc[index, :]['artist_id']
    ref_ann_id = all_data.iloc[index, :]['ann_id']
    artist_df = all_data[all_data['artist_id'] == ref_artist_id].reset_index(drop=True)
    ref_position = artist_df[artist_df['ann_id'] == ref_ann_id].index[0]
    doc = ref_text_bow[index]
    doc_bow = dictionary.doc2bow(doc)
    doc_lsa = lsa[tfidf[doc_bow]]
    sims_lsa = artist_lsa_index_dict[ref_artist_id][doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == ref_position)[0][0] # Find Ranking Position of correct annotation
    percentile = 1 - correct_position/float(sims_lsa.shape[0]) # Find Percentile Ranking
    return correct_position, percentile

In [28]:
scores = []
for idx in ref_indices:
    print idx
    accuracy = search_accuracy_lsa(idx)
    scores.append(accuracy)

50888
34177
63751
69840
3637
56152
9336
64695
23115
72805
103605
48527
95573
98754
4720
50798
10238
1060
80604
31880
70562
92846
15031
57455
6056
54067
99629
69430
102405
14601
53218
93046
57359
14598
13031
47215
54157
82292
50881
1420
72666
61452
2753
41670
14122
83697
56192
51304
88004
100376
55881
42496
32721
70868
16769
71393
23784
78035
74103
98915
23777
72017
52724
62588
78667
51693
78602
11127
55941
65487
1883
69675
70366
18371
80994
60899
23945
62909
38680
102909
28261
98326
78245
101999
77325
60001
48634
16292
63435
9188
20587
74099
74043
27947
45147
98548
48111
35835
59578
57218


In [None]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

In [None]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_tfidf = tfidf[doc_bow]
    doc_lsa = lsa[doc_tfidf]

    # Query our index for closely related documents
    sims_lsa = index_lsa[doc_lsa]
    sims_lsa = sorted(enumerate(sims_lsa), 
                        key=lambda item: item[1], reverse=True)
    sims_lsa = np.array(sims_lsa)
    correct_position = np.where(sims_lsa[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

## Latent Dirichlet Allocation and Similarity Calculation

In [184]:
# LDA our BOW corpus
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)
corpus_lda = lda[corpus]

In [185]:
# Print first ten documents in our LDA corpus
for i in corpus_lda[:5]:
    print i

[(1, 0.13106263717234684), (2, 0.70330932327740769), (4, 0.15845293007370048)]
[(0, 0.010815749492938528), (1, 0.12333716806465662), (2, 0.78145765402605971), (3, 0.073827878405082495), (4, 0.0105615500112626)]
[(2, 0.36012623594661347), (4, 0.61327365359313557)]
[(0, 0.046939960071494792), (2, 0.14330507832413031), (4, 0.79908502909962764)]
[(0, 0.67258756719662194), (1, 0.029010811905071092), (2, 0.24084909912011762), (3, 0.028814185610665444), (4, 0.028738336167523849)]


In [186]:
# Create index for querying
index_lda = similarities.MatrixSimilarity(corpus_lda)

In [187]:
# Create LDA representation of our test query (referent)
doc = ref_text_bow[2]
index = ann_index[2]
doc_bow = dictionary.doc2bow(doc)
doc_lda = lda[doc_bow]

# Query our index for closely related documents
sims_lda = index_lda[doc_lda]
sims_lda = sorted(enumerate(sims_lda), 
                    key=lambda item: item[1], reverse=True)
sims_lda = np.array(sims_lda)

In [188]:
# Find position of the "correct" annotation in the similarity list
correct_position = np.where(sims_lsa[:, 0] == index)[0]
correct_position

array([72390])

In [189]:
# Repeat for sample of test queries
correct_positions = np.zeros(5)
for i in range(5):
    doc = ref_text_bow[i]
    index = ann_index[i]
    doc_bow = dictionary.doc2bow(doc)
    doc_lda = lda[doc_bow]

    # Query our index for closely related documents
    sims_lda = index_lsa[doc_lda]
    sims_lda = sorted(enumerate(sims_lda), 
                        key=lambda item: item[1], reverse=True)
    sims_lda = np.array(sims_lsa)
    correct_position = np.where(sims_lda[:, 0] == index)[0]
    print correct_position
    correct_positions[i] = correct_position

[82307]
[71095]
[72390]
[83366]
[98836]


In [195]:
print ref_text[4]
for i in sims_lda[:5,0]:
    print ann_text[i] + "\n\n---\n\n"

[Bridge]
 Whole squad on that real shit
 Whole squad on that real shit
 Whole squad on that real shit


---


Reference to Redmans hit Tonights Da Night

ACCEPTED COMMENT: https://www.youtube.com/watch?v=G6LVIi7pzZI

---


Lil Yachty- u trippin

---


Em is referring to the same Van Dyke Avenue in Yellow Brick Road from his Encore album.
This is also a play on words, knowing how much Eminem loves disrespecting women and talking about lesbians, which is shadowing his urge to want to call the girl hes with a dyke (lesbian).

Van Dyke Avenue is a street/road in Detroit crossing 8 Mile Road.
Its quite long, too:
Van Dyke

---


Ready to Die

---




## Hierarchical Dirichlet Process and Similarity Calculation

In [48]:
# HDP our BOW corpus
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]