# Improving recommendation lists through topic diversification

[Source](https://www.researchgate.net/publication/200110416_Improving_recommendation_lists_through_topic_diversification)

In [1]:
import pandas as pd
import numpy as np
import torch
import io
import csv
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Sarcasm dataset (pre-processed CSV).
# The relative path is resolved against the current working directory.
sarcasm_csv_path = '../../civility/recommender/train-balanced-sarcasm-processed.csv'
df_m = pd.read_csv(sarcasm_csv_path)

In [3]:
# Plain Python list holding every comment text, in the same order as the rows of df_m
corpus = df_m['comment'].tolist()

In [4]:
from sentence_transformers import SentenceTransformer, util
import torch
import time

# Sentence-embedding model shared by the corpus and every query
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed each comment in a single call and report the wall-clock duration.
# This is the expensive cell of the notebook (the recorded run took ~371 s).
start_time = time.time()
sarcasm_embeddings = embedder.encode(corpus, convert_to_tensor=True)
end_time = time.time()
elapsed_seconds = end_time - start_time
print("Time for computing embeddings:" + str(elapsed_seconds))

Time for computing embeddings:371.1001994609833


In [5]:
# Add vector embeddings as a column in df (one list of floats per comment).
# The tensor is moved to host memory once and then split into rows; calling
# .cpu() on every row separately repeats that transfer for each comment.
embedding_matrix = sarcasm_embeddings.cpu().numpy()
vectors = [list(row) for row in embedding_matrix]

df_m['vector'] = vectors

### Step 1: Generate predictions (at least 5N for a final top-N recommendation list).

In [6]:
# Define Method for getting top-n similar comments
def get_similar_posts(query, n):
    """
    Return the corpus comments that are most similar to `query`.

    query (string): the text of the post
    n (int): number of posts to recommend

    Returns a DataFrame with one row per match and the columns 'Comment'
    (the comment text) and 'Similarity' (the cosine score, stored as the
    NumPy value produced by `.cpu().numpy()`), in the order torch.topk
    returns them. Also prints the query and a header line.
    """
    # Never request more results than the corpus contains
    top_k = min(n, len(corpus))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding,
    # then keep the top_k highest scores together with their positions
    cos_scores = util.pytorch_cos_sim(query_embedding, sarcasm_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    print("\nTop {n} most similar sentences in corpus:".format(n=n))

    recommend_frame = [
        {'Comment': corpus[idx], 'Similarity': score.cpu().numpy()}
        for score, idx in zip(top_results[0], top_results[1])
    ]
    return pd.DataFrame(recommend_frame)

In [7]:
n = 10  # Select how many recommendations you want
N = 5 * n  # candidate pool: five times the final list length (Step 1)
# Retrieve the N most similar comments, then attach the metadata columns of
# df_m by matching on the comment text.
# NOTE(review): if a comment text occurs more than once in df_m, the join
# presumably produces one row per occurrence — confirm C_prime keeps N rows.
C_prime = (
    get_similar_posts('Xbox is much better than PS4', N)
    .join(df_m.set_index('comment'), on='Comment')
)
C_prime.head()

Query: Xbox is much better than PS4

Top 50 most similar sentences in corpus:


Unnamed: 0,Comment,Similarity,label,author,subreddit,score,ups,downs,date,created_utc,parent_comment,vector
0,"No, because the xbox one is soooo much better ...",0.9228309,1,awesome7332,gaming,1,1,0,2015-06,2015-06-08 03:38:26,How about we stop arguing about the platforms.,"[0.30111814, -0.6113374, 0.116583675, -0.63111..."
1,"but xbox has the highest quality pixels, so it...",0.88953227,1,its_high_knut,pcmasterrace,2,2,0,2016-08,2016-08-09 20:20:45,They look even better if you play them on your...,"[0.41256794, -0.464011, -0.2718223, -0.6063775..."
2,Xbox or PS4,0.8632204,0,DogblockBernie,Rainbow6,1,1,0,2016-09,2016-09-19 01:26:24,Same here!,"[0.51289624, -0.74949, 0.08234843, -0.94248724..."
3,"Ayy, better than Xbox 7 and PS5",0.8621861,1,DarkShadow1253,pcmasterrace,1,1,0,2015-06,2015-06-15 17:59:29,To make you happy with your specs: YOUR SPECS ...,"[0.083633505, -0.6993774, -0.01547823, -0.6487..."
4,Ps4 or Xbox,0.8430183,1,Karma_y0,AskReddit,1,1,0,2016-05,2016-05-21 21:37:44,If you could ask anyone on the internet someth...,"[0.48774123, -0.7246228, 0.0145492, -0.9653652..."


### Step 2: For each N+1 position item, calculate the intra-list similarity (ILS, the diversity measure) that the list would have if this item were part of the top-N list.

For every list entry $z \in [2, N]$, we collect the items from the candidate set $B_i$ that do not occur in positions $o < z$ of $P_{w_i*}$ and compute their similarity with the set $\{P_{w_i*}(k) \mid k \in [1, z[\}$ (that is, $1 \le k < z$), which contains all new recommendations preceding rank $z$.

In [87]:
# Prepare df for pairwise distance: work on a copy keyed by the comment text
df_ils = C_prime.copy().set_index(['Comment'])

In [88]:
# Intra-list similarity (ILS) value per candidate comment, keyed by comment text.
ils = {}
# set ILS for first item: it has no predecessors, so its own similarity to the
# query is stored instead of a pairwise value.
first_row = df_ils.head(1)['Similarity']
ils[first_row.index.values.item(0)] = first_row.values[0].item(0)

# Derive the loop bound from the candidate frame instead of hard-coding 51,
# so the cell keeps working when N (the candidate pool size) changes.
n_candidates = len(df_ils)
for i in range(2, n_candidates + 1):
    # Items already placed (positions 1 .. i-1) and the remaining candidates
    top_n = df_ils.head(i - 1)[['Similarity']]
    bottom = df_ils.tail(n_candidates - i + 1)[['Similarity']]
    for item in bottom.index:
        rowData = bottom.loc[[item], :]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent way to add the candidate row temporarily.
        top_n = pd.concat([top_n, rowData])
        # ILS Calculation: sum of pairwise distances over the trial list,
        # divided by the trial list length. Later iterations overwrite earlier
        # values, so (with unique comment labels) the value that survives for
        # the item at position p is the one computed when i == p.
        ils[item] = sum(pdist(top_n)) / len(top_n)
        # Remove the candidate again before trying the next one
        top_n = top_n.drop(index=item)

In [89]:
# Sanity check: number of comments that received an ILS value
len(ils)

50

### Step 3: Sort the remaining items in reverse (according to ILS rank) to get their dissimilarity rank.

In [90]:
# Order the comments by ILS value, largest first (dicts preserve insertion order)
dissimilarity_rank = dict(sorted(ils.items(), key=lambda pair: pair[1], reverse=True))

In [91]:
# Display the ordered mapping of comment text -> ILS value
dissimilarity_rank

{'Is it xbox, console, or all?': 1.4611378622055053,
 '"xbox and ps4 only"': 1.4317524043881162,
 'Still, more exclusives than PS4!': 1.4013376869261265,
 "I'll help, are you on PC, PS4 or Xbox One?": 1.3705945344681436,
 'But everyone knows the PS4 is more powerful!': 1.3387863998827727,
 "Well, the ps4 isn't as broken as the Xbox one so it DOESNT need updating.": 1.305928683280945,
 'out of curiosity is there anything you prefer about the PS4 compared to the PC?': 1.2726867713711478,
 'What about an xBox?': 1.2402155205260876,
 'It\'s just good that xbox owners have a thought of "I wish i had a PS3 right now" just before the next generation begins.': 1.2064498648757027,
 'The reason its much more expensive is cuz the ps3 is clearly the better platform': 1.171227728448263,
 'Switch from ps3 to xbox360': 1.1376433089375495,
 'This would be so much better sitting with an Xbox controller!': 1.1033319173715053,
 "But do you really think that PC is better than PS4, these people really seem

### Step 4: Calculate the new rank for each item as $r = a \cdot P + b \cdot P_d$, with $P$ being the original rank, $P_d$ being the dissimilarity rank, and $a, b$ being constants in the range $[0, 1]$

In [92]:
from collections import OrderedDict

# Weights of the two components, a, b ∈ [0, 1]
a = 0.5
b = 0.5

ordered_dissimilarity_rank = OrderedDict(dissimilarity_rank)
new_rank = {}
# NOTE(review): P and Pd below are the raw similarity score and the raw ILS
# value, not ordinal rank positions as the Step 4 heading describes. A
# position-based variant would use the row position in C_prime and the key
# position in ordered_dissimilarity_rank — confirm which one is intended.
for item in df_ils.index:
    matching_scores = C_prime.loc[C_prime['Comment'] == item, 'Similarity']
    P = matching_scores.values[0]  # first match if the comment text repeats
    Pd = ordered_dissimilarity_rank[item]
    new_rank[item] = (a * P) + (b * Pd)

### Step 5: Select the top-N items according to the newly calculated rank

In [94]:
# Order the comments by combined score, highest first
final_ranks = dict(sorted(new_rank.items(), key=lambda pair: pair[1], reverse=True))

In [96]:
# One record per comment, already in final-rank order
data = [{'Comment': comment, 'Rank': score} for comment, score in final_ranks.items()]

df_sim = pd.DataFrame(data).set_index(['Comment'])
# The 'Similarity' column is filled with each comment's ILS value, looked up
# from ordered_dissimilarity_rank (not with the cosine score against the query).
similarities = [ordered_dissimilarity_rank[item] for item in df_sim.index]

df_sim['Similarity'] = similarities
# Keep the ten best-ranked comments as the diversified list
df_sim = df_sim.head(10)
df_sim.sort_values(by=['Rank'], ascending=False)

Unnamed: 0_level_0,Rank,Similarity
Comment,Unnamed: 1_level_1,Unnamed: 2_level_1
"Is it xbox, console, or all?",1.077644,1.461138
"""xbox and ps4 only""",1.063047,1.431752
"Still, more exclusives than PS4!",1.048312,1.401338
"I'll help, are you on PC, PS4 or Xbox One?",1.033073,1.370595
But everyone knows the PS4 is more powerful!,1.017348,1.338786
"Well, the ps4 isn't as broken as the Xbox one so it DOESNT need updating.",1.001461,1.305929
out of curiosity is there anything you prefer about the PS4 compared to the PC?,0.985972,1.272687
What about an xBox?,0.969859,1.240216
"It's just good that xbox owners have a thought of ""I wish i had a PS3 right now"" just before the next generation begins.",0.95307,1.20645
The reason its much more expensive is cuz the ps3 is clearly the better platform,0.937117,1.171228


In [97]:
# Find the Diversity
n = 10
df_copy = df_sim.copy()
df_copy = df_copy.drop(columns=['Rank'])
dis_similarity = [x for x in pdist(df_copy)]
avg_dissim_greedy = (sum(dis_similarity))/((n/2)*(n-1))
avg_dissim_greedy

0.11818890707322412

In [98]:
# Define Method for getting top-n similar comments/titles
# NOTE: this re-defines get_similar_posts from earlier in the notebook. From
# this cell on, the returned frame is indexed by 'Comment' instead of carrying
# the comment text as a regular column.
def get_similar_posts(query, n):
    """
    Look up the corpus comments closest to `query` by cosine similarity.

    query (string): the text of the post
    n (int): number of posts to recommend

    Returns a DataFrame indexed by 'Comment' with a single 'Similarity'
    column, rows in the order torch.topk returns them. Also prints the
    query and a header line.
    """
    # Cap the request at the corpus size
    top_k = min(n, len(corpus))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Score the query against all corpus embeddings and keep the top_k best
    cos_scores = util.pytorch_cos_sim(query_embedding, sarcasm_embeddings)[0]
    scores, indices = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    print("\nTop {n} most similar sentences in corpus:".format(n=n))

    recommend_frame = []
    for score, idx in zip(scores, indices):
        recommend_frame.append({'Comment': corpus[idx], 'Similarity': score.cpu().numpy()})

    return pd.DataFrame(recommend_frame).set_index(['Comment'])

In [99]:
# Control list: the plain top-10 by similarity, with no diversification step
df_control = get_similar_posts(query='Xbox is much better than PS4', n=10)
df_control.head()

Query: Xbox is much better than PS4

Top 10 most similar sentences in corpus:


Unnamed: 0_level_0,Similarity
Comment,Unnamed: 1_level_1
"No, because the xbox one is soooo much better than the ps4",0.9228309
"but xbox has the highest quality pixels, so it should be better than ps4, right?",0.88953227
Xbox or PS4,0.8632204
"Ayy, better than Xbox 7 and PS5",0.8621861
Ps4 or Xbox,0.8430183


In [100]:
# Diversity of the control list: mean pairwise distance between its values
n = 10
dis_similarity = list(pdist(df_control))

pair_count = (n / 2) * (n - 1)  # number of unordered pairs among n items
avg_dissim_control = sum(dis_similarity) / pair_count
avg_dissim_control

0.05261304908328586

In [101]:
# Relative change of the diversified list's diversity versus the control, in percent
dissim_delta = avg_dissim_greedy - avg_dissim_control
percent_change = (dissim_delta / avg_dissim_control) * 100
round(percent_change, 2)

124.64

In [None]:
### END OF NOTEBOOK ###