# Doc2Vec Model Implementation on r/worldnews

[source](https://www.kaggle.com/fmitchell259/creating-a-doc2vec-model)

In [1]:
# import libraries
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import pandas as pd
import sys
sys.path.insert(0, "../")

In [2]:
# preprocess text
def replace_none(X):
    if X == '':
        X = np.nan
    return X

In [3]:
def build_model(max_epochs, vec_size, alpha, tagged_data):
    
    model = Doc2Vec(vector_size=vec_size,
               alpha=alpha,
               min_alpha=0.00025,
               min_count=1,
               dm=1)
    
    model.build_vocab(tag_data)
    
    # With the model built we simply train on the data.
    
    for epoch in range(max_epochs):
        print(f"Iteration {epoch}")
        model.train(tag_data,
                   total_examples=model.corpus_count,
                   epochs=model.epochs)

        # Here I decrease the learning rate. 
        model.alpha -= 0.0002

        model.min_alpha = model.alpha
    
    # Now simply save the model to avoid training again. 
    
    model.save("w2v_MODEL.model")
    print("Model Saved")
    return model

## Read & Preprocess Data

In [4]:
df = pd.read_csv("reddit_worldnews_start_to_2016-11-22.csv")
df.head(10)

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews
5,1201287889,2008-01-25,15,0,Hay presto! Farmer unveils the illegal mock-...,False,Armagedonovich,worldnews
6,1201289438,2008-01-25,5,0,"Strikes, Protests and Gridlock at the Poland-U...",False,Clythos,worldnews
7,1201536662,2008-01-28,0,0,The U.N. Mismanagement Program,False,Moldavite,worldnews
8,1201558396,2008-01-28,4,0,Nicolas Sarkozy threatens to sue Ryanair,False,Moldavite,worldnews
9,1201635869,2008-01-29,3,0,US plans for missile shields in Polish town me...,False,JoeyRamone63,worldnews


In [5]:
# preprocess reddit posts
df['title'] = df['title'].apply(replace_none)
df = df.dropna()

# sample 10,000 posts
w2v_total_data = list(df['title'].sample(25000))

In [6]:
# tokenize data with NLTK
tag_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(w2v_total_data)]

## Model Training 

In [7]:
# build model
model = build_model(max_epochs=20, vec_size=15, alpha=0.025, tagged_data=tag_data)

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Model Saved


In [8]:
# find similar words
model.wv.similar_by_word("russia")

  if np.issubdtype(vec.dtype, np.int):


[('iran', 0.905253529548645),
 ('rebalancing', 0.8895424008369446),
 ('britain', 0.8859010338783264),
 ('nato', 0.8856167793273926),
 ('romania', 0.8825780749320984),
 ('un', 0.8785852193832397),
 ('itself', 0.8729985952377319),
 ('turkey', 0.8729944229125977),
 ('ukraine', 0.8635074496269226),
 ('beijing', 0.85148024559021)]

In [9]:
# Helper Function - Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 2, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [10]:
# get vector embeddings of all sampled titles
vectors = []
l = len(tag_data)
for i, title in enumerate(tag_data):
    vectors.append(model.infer_vector(title.words))

In [11]:
# Example - Get similar doc
tokens = "Britain s ex-spy boss: We did not kill Diana".lower().split()
inferred_vector = model.infer_vector(tokens)
# gives you top 10 document tags and their cosine similarity
sims = model.docvecs.most_similar([inferred_vector])
for sim in sims:
    print(tag_data[int(sim[0])])

TaggedDocument(['mccain', 'helps', 'another', 'needy', 'family', 'that', 'couldn', 't', 'afford', 'their', 'oceanfront', 'mansion', 'it', 'was', 'a', 'steal', 'j.m', '.'], ['23413'])
TaggedDocument(['bulgarian', 'woman', 'claims', 'she', 's', 'maria', 's', 'mom', ':', 'we', 'gifted', 'her', 'to', 'roma', 'family'], ['24134'])
TaggedDocument(['why', 'gaddafi', 'got', 'a', 'red', 'card', ',', 'by', 'pepe', 'escoban'], ['15328'])
TaggedDocument(['no', 'money', '?', 'eat', 'less', '!', 'united', 'russia', 'lawmaker', 'advises'], ['3729'])
TaggedDocument(['lebron', 'james', 'sad', 'kobe', 'bryant', 'retiring', ',', 'wishes', 'they', 'd', 'met', 'in', 'finals'], ['798'])
TaggedDocument(['100', 'cops', 'outside', ',', '100', 'cops', 'inside', ',', '100', 'or', 'so', 'bouncers', ',', '3', 'bulletproof', 'cars', ',', 'and', 'a', 'chopper', 'to', 'guard', 'her', '.', 'well', 'fuck', 'you', 'julia', 'roberts', 'you', 'aint', 'obama', 'so', 'stop', 'the', 'showoff', 'bitch'], ['16082'])
TaggedDocu

  if np.issubdtype(vec.dtype, np.int):


In [12]:
# get similarity ranks for all the titles
import time
l = len(tag_data)
printProgressBar(0, l, prefix = 'Progress:', suffix = 'Complete', length = 50)

ranks = []
second_ranks = []

for doc_id in range(len(tag_data)):
    inferred_vector = model.infer_vector(tag_data[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(str(doc_id))
    ranks.append(rank)
    # Update Progress Bar
    printProgressBar(doc_id + 1, l, prefix = 'Progress:', suffix = 'Complete', length = 50)
    second_ranks.append(sims[1])

  if np.issubdtype(vec.dtype, np.int):


Progress: |██████████████████████████████████████████████████| 100.00% Complete


## Get Similar Documents

In [13]:
# Pick a random document from the corpus and get most, second most, median, and least similar documents
import random
doc_id = random.randint(0, len(tag_data) - 1)
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(tag_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tag_data[int(sims[index][0])].words)))

Document (4532): «no bunker-buster bomb in israel s us arms deal»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d15,n5,w5,s0.001,t3):

MOST ('17973', 0.931635856628418): «taliban suicide bomber kills 5 in pakistan»

SECOND-MOST ('2878', 0.9202002882957458): «why russia won ’ t help on syria»

MEDIAN ('8485', 0.4319780468940735): «obama bans new oil drilling in arctic ocean»

LEAST ('22798', -0.18453054130077362): «esa chief wants to establish a moon village»



In [14]:
# Pick a random document from the corpus and infer a vector from the model
import random
doc_id = random.randint(0, len(tag_data) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(tag_data[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Seond-Most Similar Document {}: «{}»\n'.format(sim_id, ' '.join(tag_data[int(sim_id[0])].words)))

Train Document (22270): «police chase school bus and arrest student for sexual assault !»

Seond-Most Similar Document ('1333', 0.9156079292297363): «bbc news - david cameron and nawaz sharif to hold talks»



In [15]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(tag_data) - 1)
inferred_vector = model.infer_vector(tag_data[doc_id].words)
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(tag_data[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tag_data[int(sims[index][0])].words)))

Test Document (808): «syrian kurds accuse turkey of attacks , ask u.s. for explanation»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d15,n5,w5,s0.001,t3):

MOST ('2657', 0.9441635608673096): «north korea»

MEDIAN ('15128', 0.4652347266674042): «al qaeda gunmen drank in bar before unleashing ivory coast attack»

LEAST ('2017', -0.06031855568289757): «colombia and farc rebels agreed child soldier deal»



  if np.issubdtype(vec.dtype, np.int):


In [16]:
# save embeddings as tsv file
import csv

with open('vectors_2.tsv', 'w') as f: 
    write = csv.writer(f, delimiter='\t') 
    for i in range(len(vectors)):
        write.writerow(list(vectors[i])) 
        
with open('metadata_2.tsv', 'w', newline='') as f_output:
    tsv_output = csv.writer(f_output, delimiter='\t')
    for s in w2v_total_data:
        tsv_output.writerow([s])
        
        
print(len(w2v_total_data) == len(vectors))

True


## Run Greedy Algorithm

In [17]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
df = pd.read_csv('vectors_2.tsv', sep='\t', header=None)
df_labels = pd.read_csv('metadata_2.tsv', sep='\t', names=['Labels'])

In [19]:
df['vector'] = df[:].values.tolist()
dfnew = pd.concat([df_labels, df], axis = 1)
dfnew.head()

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,vector
0,Ebola crisis: Experimental vaccine shipped to...,-0.105281,0.02724,0.082224,-0.145819,0.007755,0.275304,-0.00857,-0.394311,-0.068798,-0.231672,0.219208,0.310434,0.179274,-0.344193,-0.205687,"[-0.10528074, 0.027240133, 0.082223594, -0.145..."
1,Lukashenko expects clarity from Russia in resp...,0.092238,-0.174913,-0.041843,0.01375,0.007302,0.048995,0.0571,-0.191275,-0.109585,0.095569,0.074516,0.26612,0.191533,0.054953,-0.051995,"[0.09223784, -0.17491344, -0.041842535, 0.0137..."
2,Ukraine puts on sale 300 state-owned enterprises,0.02508,-0.185489,-0.044454,0.002512,0.178259,0.241759,-0.065433,-0.228053,0.208765,0.083695,0.009705,0.175198,0.033695,-0.216903,-0.235001,"[0.025080029, -0.18548858, -0.044453833, 0.002..."
3,Sweetheart tax avoidance deals of multinationa...,-0.36812,-0.008282,0.065555,0.205778,0.13223,0.521165,-0.156815,-0.919952,-0.102732,-0.041488,0.319376,0.219044,0.235019,-0.358439,-0.109115,"[-0.36812046, -0.008281825, 0.065555334, 0.205..."
4,Islamic State militants seize four more foreig...,-0.179573,-0.08901,-0.104269,-0.000772,-0.0403,0.252294,-0.318238,-1.115867,0.153095,0.256193,-0.132797,0.628505,0.195041,-0.424344,-0.367347,"[-0.17957267, -0.0890096, -0.10426912, -0.0007..."


In [20]:
def get_similar_posts(sub_name, num_subs_to_reccomend):
    similarities = []
    sub_name_vector = dfnew['vector'][dfnew['Labels'] == sub_name].to_numpy()[0]
    sub_name_vector = np.array(sub_name_vector).reshape(1, -1)
    for vector in dfnew['vector'].tolist():
        vector = np.array(vector).reshape(1, -1)
        similarities.append(cosine_similarity(sub_name_vector, vector))

    pairs = list(zip(dfnew['Labels'], similarities))
    closest_subs = sorted(pairs, key=lambda item: item[1], reverse=True)[1:num_subs_to_reccomend+1]
    recommend_frame = []
    for val in closest_subs:
        recommend_frame.append({'Post':val[0],'Similarity':val[1].item(0)})

    df = pd.DataFrame(recommend_frame)
    df = df.set_index(['Post'])

    return df

In [21]:
pd.set_option('display.max_colwidth', None)
dfnew

Unnamed: 0,Labels,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,vector
0,Ebola crisis: Experimental vaccine shipped to Liberia,-0.105281,0.027240,0.082224,-0.145819,0.007755,0.275304,-0.008570,-0.394311,-0.068798,-0.231672,0.219208,0.310434,0.179274,-0.344193,-0.205687,"[-0.10528074, 0.027240133, 0.082223594, -0.14581919, 0.007755016, 0.27530393, -0.0085703805, -0.3943109, -0.068798184, -0.23167215, 0.21920843, 0.31043395, 0.17927392, -0.34419346, -0.20568688]"
1,Lukashenko expects clarity from Russia in respect of Belarus,0.092238,-0.174913,-0.041843,0.013750,0.007302,0.048995,0.057100,-0.191275,-0.109585,0.095569,0.074516,0.266120,0.191533,0.054953,-0.051995,"[0.09223784, -0.17491344, -0.041842535, 0.013749579, 0.007301635999999999, 0.048994604000000004, 0.057100244, -0.19127549, -0.10958466, 0.09556897, 0.07451574, 0.26611954, 0.19153270000000003, 0.05495258, -0.051994644000000007]"
2,Ukraine puts on sale 300 state-owned enterprises,0.025080,-0.185489,-0.044454,0.002512,0.178259,0.241759,-0.065433,-0.228053,0.208765,0.083695,0.009705,0.175198,0.033695,-0.216903,-0.235001,"[0.025080029, -0.18548858, -0.044453833, 0.0025121449, 0.17825903, 0.24175929999999998, -0.06543288400000001, -0.2280527, 0.208765, 0.083694905, 0.009705253, 0.17519823, 0.033694922999999995, -0.21690261, -0.23500082]"
3,Sweetheart tax avoidance deals of multinational corporations to receive greater scrutiny from the EU,-0.368120,-0.008282,0.065555,0.205778,0.132230,0.521165,-0.156815,-0.919952,-0.102732,-0.041488,0.319376,0.219044,0.235019,-0.358439,-0.109115,"[-0.36812046, -0.008281825, 0.065555334, 0.20577793, 0.13222973, 0.52116483, -0.15681468, -0.9199519, -0.10273188, -0.04148817, 0.31937647, 0.21904354, 0.23501863, -0.35843906, -0.109114945]"
4,Islamic State militants seize four more foreign hostages in Syria,-0.179573,-0.089010,-0.104269,-0.000772,-0.040300,0.252294,-0.318238,-1.115867,0.153095,0.256193,-0.132797,0.628505,0.195041,-0.424344,-0.367347,"[-0.17957267, -0.0890096, -0.10426912, -0.00077151787, -0.040299688, 0.2522936, -0.31823772, -1.115867, 0.15309541, 0.2561932, -0.13279660000000001, 0.62850523, 0.19504078, -0.42434424, -0.36734736]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,China denounces US cyber-theft charges,-0.105818,-0.135115,0.025764,0.004932,0.062450,0.184828,-0.041529,-0.364744,-0.034124,-0.022128,0.098928,0.240429,0.154148,-0.271799,-0.127128,"[-0.10581816, -0.13511531, 0.025764178, 0.004931595, 0.062449593, 0.18482819, -0.041528962999999995, -0.36474377, -0.034124464, -0.022127595, 0.09892769, 0.24042933, 0.15414774, -0.2717988, -0.12712803]"
24996,British police tactic kettling challenged on \r\nmultiple breaches of the European Convention on \r\nHuman Rights including: Article 5 - the right not \r\nto be unlawfully detained; Article 10 - the right to \r\nfreedom of expression; and Article 11 - the right \r\nto freedom of assembly,-1.059671,-0.029317,-0.154110,-0.313434,0.304884,1.080710,-0.607724,-0.988565,0.164820,0.654079,0.550402,0.389882,0.135927,-0.695539,-0.231910,"[-1.0596707, -0.029316586000000002, -0.15410984, -0.31343377, 0.3048836, 1.0807098, -0.60772437, -0.98856527, 0.16481954, 0.6540788000000001, 0.5504015999999999, 0.389882, 0.13592722, -0.695539, -0.23191015]"
24997,Uranium found on Syrian sites,-0.155882,-0.157883,-0.193674,-0.392019,0.017397,0.164001,0.072434,-0.201615,0.079111,-0.189015,0.126146,0.304885,0.249902,-0.339342,-0.011854,"[-0.15588239999999998, -0.15788299, -0.19367374, -0.39201936, 0.017396873, 0.1640006, 0.07243368, -0.20161463, 0.079111315, -0.18901545, 0.12614605, 0.30488452, 0.2499024, -0.33934224, -0.011853852]"
24998,Israel Defense Minister Says Russian Plane Breached Airspace,-0.195438,0.049077,-0.009683,-0.167793,-0.048882,0.308988,0.061224,-0.658713,-0.060071,-0.358350,0.372092,0.345981,0.390171,-0.392801,-0.064237,"[-0.19543827, 0.049077407, -0.009683411, -0.16779256, -0.048881583, 0.30898833, 0.061223503, -0.65871274, -0.06007063, -0.3583498, 0.3720921, 0.34598090000000004, 0.39017093, -0.39280057, -0.06423725]"


In [25]:
df_control = get_similar_posts("Islamic State militants seize four more foreign hostages in Syria", 50)
df_control

Unnamed: 0_level_0,Similarity
Post,Unnamed: 1_level_1
WASHINGTON: White House ‘deeply troubled’ by mass death sentence in Egypt | Washington Watch,0.962461
"Wikileaks: US says Berlusconi is feckless, vain, and ineffective as a modern European leader . Surely not",0.956526
British Satire about HAMAS.,0.956043
"Alvin Lee, British Blues-Rock Guitarist, Dies at 68",0.953181
Iran says it foiled foreign nuclear plots,0.95054
"New U.N. rights boss urges world to protect civilians in Iraq, Syria",0.94919
Iran shoots down Israeli drone,0.949147
Jamaica ablaze with street violence,0.947294
Growing Imbalance Between Germany and France Strains Their Relationship,0.947221
"Yet another car bomb in Beirut, Lebanon. Kills 5 and wounds 20.",0.946118


In [26]:
n = 50
dis_similarity = [x for x in pdist(df_control)]

avg_dissim_control = (sum(dis_similarity))/((n/2)*(n-1))
print('Average Dissimilarity: ' + str(avg_dissim_control))

sim_control = 1 - avg_dissim_control
print('Average Similarity: ' + str(sim_control))

Average Dissimilarity: 0.009069996002679812
Average Similarity: 0.9909300039973202


In [27]:
def similar_subreddits(target, num_subs_to_reccomend):
    similarities = []
    sub_name_vector = dfnew['vector'][dfnew['Labels'] == target].to_numpy()[0]
    sub_name_vector_reshaped = np.array(sub_name_vector).reshape(1, -1)
    for vector in dfnew['vector'].tolist():
        vector_reshaped = np.array(vector).reshape(1, -1)
        similarities.append(cosine_similarity(sub_name_vector_reshaped, vector_reshaped))

    pairs = list(zip(dfnew['Labels'], similarities, dfnew['vector']))
    closest_subs = sorted(pairs, key=lambda item: item[1], reverse=True)[1:num_subs_to_reccomend+1]
    recommend_frame = []
    for val in closest_subs:
        recommend_frame.append({'Subreddit':val[0],'Similarity':val[1].item(0), 'Vector':val[2]})

    df = pd.DataFrame(recommend_frame)
    return df

In [29]:
C_prime = similar_subreddits("Islamic State militants seize four more foreign hostages in Syria", 500)
C_prime

Unnamed: 0,Subreddit,Similarity,Vector
0,WASHINGTON: White House ‘deeply troubled’ by mass death sentence in Egypt | Washington Watch,0.962461,"[-0.3665853, -0.11563657, -0.07611073, -0.084455766, -0.17419690000000002, 0.5842148, -0.26442319999999997, -1.2426617, 0.25350976, 0.09010925, -0.15256064, 0.623948, 0.22719468, -0.65645236, -0.5546348000000001]"
1,"Wikileaks: US says Berlusconi is feckless, vain, and ineffective as a modern European leader . Surely not",0.956526,"[-0.44039518, -0.02031419, -0.164461, -0.07391902, 0.013270809, 0.15957429, -0.21719266, -0.8084123000000001, 0.17470276, 0.22412555, -0.14487092, 0.51704735, 0.069587834, -0.43996695, -0.33071095]"
2,British Satire about HAMAS.,0.956043,"[-0.17371182, -0.09478968, -0.17720862, 0.03685069999999999, 0.06094157, 0.27336940000000004, -0.14928705, -0.6838038000000001, 0.083238736, 0.09908101, 0.040339686, 0.36723459999999997, 0.15388239999999997, -0.36158714, -0.12807369]"
3,"Alvin Lee, British Blues-Rock Guitarist, Dies at 68",0.953181,"[-0.13392618, -0.13929589, -0.004260821, 0.029919688, 0.019777732, 0.1018062, -0.11365489, -0.44662434, 0.04866963, 0.054034933, 0.05632587, 0.23786151, 0.031271797000000004, -0.18412304, -0.16177207]"
4,Iran says it foiled foreign nuclear plots,0.950540,"[0.03184352, -0.11072876, -0.18270668, 0.12273393, -0.03272787, 0.110179015, -0.21431048, -0.57902145, 0.08633184, 0.06967174, -0.06103272, 0.33030578, 0.15076056, -0.17087886, -0.25136518]"
...,...,...,...
495,"1.8 Ton World War II Bomb Forces Mass Evacuation in Dortmund, Germany",0.894315,"[-0.14447770000000001, -0.18025593, 0.00787979, -0.06372242, -0.109494984, 0.48688105, -0.23198141, -0.8854325000000001, 0.022223992, -0.13542172, 0.11118783, 0.42894432, 0.21061347, -0.41402390000000006, -0.12799029]"
496,"Central Asia, Caucasus To See Further Declines In U.S. Aid",0.894241,"[-0.4547777, -0.09679145, -0.108397014, 0.02825251, 0.16546825, 0.39732373, -0.09481466, -0.81275046, 0.17906295, -0.08761135, 0.050445825, 0.5343899999999999, 0.14958522, -0.55392903, -0.31946963]"
497,Taliban Beheads Two Pakistani Officials In Swat,0.894241,"[-0.28464198, -0.07906422, -0.015669396000000002, -0.10370101, 0.17344904, 0.21483894, -0.18819077, -0.5810654000000001, 0.05307459, 0.015040963999999999, 0.14848751, 0.31024665, 0.10658315, -0.27321455, -0.26742172]"
498,"French officials said Sunday that they will continue to press ahead with plans to host a multilateral Middle East peace conference later this year, despite hearing, in blunt language, that Israel doesn’t really like the idea.",0.894238,"[-0.5696529, -0.35868904, -0.103533335, -0.41926253, -0.15290184, 1.0346507999999999, -0.49387327, -1.8503922, -0.11798186599999999, -0.1576995, 0.22904292, 1.1971644, 0.65905404, -1.0684897, -0.30063856]"


In [30]:
df_temp = C_prime
recommendations = ['dummy']
recommendations[0] = C_prime["Subreddit"][0]  # first item is always the one with the highest similarity

index = df_temp[(df_temp.Subreddit == recommendations[0])].index

df_temp = df_temp.drop(index)

In [31]:
def calculate_quality(c, R, df, df_sim):
    quality = 0
    rel_diversity = 0
    
    if len(R) == 0:
        rel_diversity = 1
        
    vector = np.array(df['Vector'][df['Subreddit'] == c].to_numpy()[0]).reshape(1, -1)
    diversity = []
    for item in R:
        diversity.append(1 - cosine_similarity(vector, np.array(df_sim['Vector'][df_sim['Subreddit'] == item].to_numpy()[0]).reshape(1, -1)))
        
    rel_diversity = sum(diversity)/len(R) # relative diversity
    
    similarity = df['Similarity'][df['Subreddit'] == c].to_numpy()[0] # similarity
    
    quality = rel_diversity[0][0] * similarity # quality
    return quality

In [32]:
# set k = 50 to get top 50 recommendations
k = 50
for i in range(k):
    qualities = {}
    # Calculate the quality of each subreddit
    for item in df_temp['Subreddit']:
        qualities[item] = calculate_quality(item, recommendations, df_temp, C_prime)

    highest_quality = max(qualities.values())
    highest_quality_subreddit = max(qualities, key= lambda x: qualities[x])
    recommendations.append(highest_quality_subreddit)
    
    index = df_temp[(df_temp.Subreddit == recommendations[-1])].index
    df_temp = df_temp.drop(index)

In [33]:
similarities = []
for item in recommendations:
    sim = C_prime['Similarity'][C_prime['Subreddit'] == item].to_numpy()[0]
    similarities.append(sim)

pairs = list(zip(recommendations, similarities))
recommend_frame = []
for val in pairs:
    recommend_frame.append({'Subreddit':val[0],'Similarity':val[1].item(0)})    

df_sim = pd.DataFrame(recommend_frame)
df_sim = df_sim.set_index(['Subreddit'])
df_sim

Unnamed: 0_level_0,Similarity
Subreddit,Unnamed: 1_level_1
WASHINGTON: White House ‘deeply troubled’ by mass death sentence in Egypt | Washington Watch,0.962461
"Cameron s Problems with the EU Are Just Starting\r\n\r\nmost EU partners have no interest in making concessions to the British. Abroad, Cameron s rhetoric has been met with head shaking. He has been warned repeatedly by Berlin not to blackmail his partners.",0.913691
"Russian, Iranian Ministers at Nuclear Talks: Don’t Threaten Us. Zarif retorted: “Never try to threaten the Iranians,” while Russian Foreign Minister Sergei Lavrov chimed in: “Nor the Russians.” Russia and Iran are both pressing to end the arms ban.",0.900306
Canada sending 6 CF-18s for NATO operation in Eastern Europe,0.896488
Evo Morales to John Kerry: Latin America Is Not Your Backyard.Morales said he was considering withdrawing Bolivia from the OAS if the regional body continues to interfere in the internal affairs of member countries.,0.918999
Turkey and Qatar s role in Palestinian affairs behind Israel s latest spat with Kerry,0.896356
"New York, once the financial capital of the world, is no longer even the financial capital of the U.S. That honor [now] falls on Washington D.C. -Nouriel Roubini",0.913469
Boko Haram beheads 7 prominent businessmen,0.902629
Kofi Annan and Jimmy Carter barred from visiting Zimbabwe to assess the humanitarian crisis,0.906119
EU given six weeks to protect itself against inevitable \r\nGreek default,0.902672


In [34]:
# Find the Diversity
n = 50
dis_similarity = [x for x in pdist(df_sim)]
avg_dissim_greedy = (sum(dis_similarity))/((n/2)*(n-1))
avg_dissim_greedy

0.011544163960410725

In [35]:
percent_change = ((avg_dissim_greedy - avg_dissim_control)/avg_dissim_control)*100
round(percent_change, 2)

27.28

In [None]:
### END OF NOTEBOOK ###