# Total Diversity Effect (TDE) Ranking Recommendations

This notebook implements the Total Diversity Effect (TDE) ranking algorithm described in the research paper [Enhancing Diversity-Accuracy Technique on User-Based Top-N Recommendation Algorithms](https://sci-hub.se/https://ieeexplore.ieee.org/document/6605824/references#references). The dataset can be found [here](https://www.kaggle.com/timschaum/subreddit-recommender).


In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the processed comments CSV into the DataFrame used by the rest of the notebook.
comments_csv_path = '../../civility/recommender/train-balanced-sarcasm-processed.csv'
df_m = pd.read_csv(comments_csv_path)

In [3]:
# Collect the text of every comment into a plain Python list, one entry per row of df_m
comment_column = df_m['comment']
corpus = comment_column.tolist()

In [4]:
import time

import torch
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model; the same embedder encodes the corpus here and the
# query text inside get_similar_posts.
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed each comment and report how long the encoding took.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
sarcasm_embeddings = embedder.encode(corpus, convert_to_tensor=True)
end_time = time.perf_counter()
print("Time for computing embeddings:" + str(end_time - start_time))

Time for computing embeddings:339.6262834072113


In [5]:
# Add vector embeddings as a column in df
# Convert the whole embedding tensor to a NumPy array once (instead of once per
# row), then store each row as a list of its element values so that every
# DataFrame cell holds one comment's embedding.
embedding_matrix = sarcasm_embeddings.cpu().numpy()
vectors = [list(row) for row in embedding_matrix]

df_m['vector'] = vectors

In [None]:
#S-TDE Rank

In [6]:
# Step 1: Generate a list of Top N+S recommendations (N between 3 and 10; S between 1 and 10)

In [84]:
# Define method for getting the top-n most similar comments
def get_similar_posts(query, n):
    """
    Return the corpus comments most similar to ``query``.

    The query is embedded with the notebook-level ``embedder`` and scored
    against every row of ``sarcasm_embeddings`` using cosine similarity.

    query (string): the text of the post
    n (int): number of posts to recommend (capped at ``len(corpus)``)

    Returns a DataFrame with one row per recommendation, in the order
    produced by ``torch.topk`` (highest score first), with columns
    ``Comment`` (the comment text) and ``Similarity`` (the cosine score as a
    plain float). Also prints the query and a header line.
    """
    # Never ask topk for more results than there are corpus entries.
    top_k = min(n, len(corpus))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding; [0]
    # selects the score row belonging to the single query.
    cos_scores = util.pytorch_cos_sim(query_embedding, sarcasm_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    # Report the number of rows actually returned, which may be fewer than n.
    print("\nTop {n} most similar sentences in corpus:".format(n=top_k))

    # float() unwraps each single-element score tensor so the Similarity
    # column holds real numbers rather than 0-d arrays (object dtype).
    recommend_frame = [
        {'Comment': corpus[idx], 'Similarity': float(score)}
        for score, idx in zip(top_results[0], top_results[1])
    ]

    # Naming the columns keeps them present even when no rows are returned.
    return pd.DataFrame(recommend_frame, columns=['Comment', 'Similarity'])

In [85]:
N = 10  # size of the final recommendation list
S = 5   # extra candidates retrieved beyond N, to be discarded by TDE ranking

# Retrieve the N+S comments most similar to the query.
candidates = get_similar_posts('Obama is NOT american', N + S)

# Attach each candidate's df_m columns (including its embedding) by matching on
# the comment text. The lookup keeps one df_m row per distinct text: joining
# against a non-unique index would emit one output row per match and silently
# grow the candidate list, which later cells address by row position.
comment_lookup = df_m.drop_duplicates(subset='comment').set_index('comment')
C_prime = candidates.join(comment_lookup, on='Comment')
assert len(C_prime) == len(candidates), "join must not add or drop candidates"
C_prime.head()

Query: Obama is NOT american

Top 15 most similar sentences in corpus:


Unnamed: 0,Comment,Similarity,label,author,subreddit,score,ups,downs,date,created_utc,parent_comment,vector
0,Let's not forget Obama's not American.,0.797965,1,wiseaus_stunt_double,hockey,5,5,0,2014-07,2014-07-06 19:00:47,"Look, not to be a homer, but I legitimately wo...","[0.43582183, 0.07268912, -0.31050923, -0.63595..."
1,He isn't American,0.76655585,0,justafanpassingby,soccer,1,1,0,2016-06,2016-06-14 05:32:20,You sound like someone who believes the moon l...,"[0.54248154, -0.18107486, 0.119683914, -0.6661..."
2,Not Obama?,0.7615357,1,ijustwantanfingname,technology,1,1,0,2013-04,2013-04-21 05:06:27,Because politicians never lie... Right?,"[0.3969663, 0.40757543, 0.10303464, -0.6554833..."
3,Well we got Obama and he's not american born,0.75621384,1,pajepper_kepper,Futurology,58,-1,-1,2016-11,2016-11-23 18:28:19,can't. Not born in the US South African-born,"[0.41099304, 0.31186214, -0.24328573, -0.22447..."
4,That's not Obama,0.7377236,1,Chuew12345,pics,1,1,0,2015-10,2015-10-15 02:53:33,Selfie with the president,"[0.13465954, -0.08795648, -0.069007166, -0.425..."


In [59]:
# Step 2: Calculate the TDE of each item as the sum of distances to all other (N+S-1) items on the list

In [86]:
# Embedding of every candidate, in C_prime row order.
vectors = C_prime['vector'].to_list()

# TDE of a candidate = sum of its Euclidean distances to all other candidates.
# Keys are row positions in C_prime. The candidate itself is skipped by
# position (j != i) rather than by searching the list for an equal vector.
TDE = {
    i: sum(
        distance.euclidean(vector, other)
        for j, other in enumerate(vectors)
        if j != i
    )
    for i, vector in enumerate(vectors)
}

In [87]:
# Step 3: Remove S items with the lowest TDE score and so generate the Top N recommendations for the current user.

In [88]:
# Rank candidates from highest to lowest TDE and keep the N best, which drops
# the S lowest-scoring ones when N+S candidates were scored.
# Slicing to N (rather than popping S entries) makes this cell safe to re-run:
# running it again on the already-trimmed dict leaves it unchanged.
ranked_by_tde = sorted(TDE.items(), key=lambda item: item[1], reverse=True)
TDE = dict(ranked_by_tde[:N])

In [89]:
# Remaining candidates: row position in C_prime -> TDE score, highest score first
TDE

{9: 104.47456359863281,
 14: 103.1512565612793,
 5: 101.95061302185059,
 7: 96.00982570648193,
 12: 92.7237057685852,
 8: 92.31631851196289,
 13: 91.66197776794434,
 2: 91.14478778839111,
 6: 88.89259362220764,
 4: 87.38392210006714}

In [90]:
# TDE keys are row positions in C_prime (they come from enumerate), so the
# surviving rows are selected positionally. Sorting the positions keeps the
# rows in their original C_prime order.
kept_positions = sorted(TDE)
df = C_prime.iloc[kept_positions][['Comment', 'Similarity']].set_index('Comment')
df

Unnamed: 0_level_0,Similarity
Comment,Unnamed: 1_level_1
Not Obama?,0.7615357
That's not Obama,0.7377236
Obama is American..?,0.7302852
He must not be American,0.72961265
Obama is not a real human being,0.72588533
Not according to Obama...,0.70943683
American is not a race,0.69935656
Not being American.,0.6946306
Didn't Obama say that too,0.69011813
Why does Obama hate America?,0.6882608


In [91]:
# Mean pairwise distance between the rows of df. df's only column is
# Similarity, so each pairwise (Euclidean) distance is the absolute difference
# between two similarity scores.
# NOTE(review): this measures the spread of query-similarity scores, not the
# distance between the items' embeddings -- confirm this is the intended metric.
n = len(df)  # derived from the list itself rather than hard-coded
if n > 1:
    dis_similarity = list(pdist(df))
    # n * (n - 1) / 2 is the number of unordered pairs returned by pdist.
    total_dissim_TDE = sum(dis_similarity) / (n * (n - 1) / 2)
else:
    # Fewer than two items: there are no pairs to compare.
    dis_similarity = []
    total_dissim_TDE = 0.0
total_dissim_TDE

0.028404492802090116

In [92]:
# Define method for getting the top-n most similar comments/titles
def get_similar_posts(query, n):
    """
    Return the corpus comments most similar to ``query``, indexed by comment.

    NOTE: this redefines the ``get_similar_posts`` declared earlier in the
    notebook. The only difference in the result is that ``Comment`` is the
    index here instead of a regular column, so cells written against the
    earlier definition should not be re-run after this one.

    The query is embedded with the notebook-level ``embedder`` and scored
    against every row of ``sarcasm_embeddings`` using cosine similarity.

    query (string): the text of the post
    n (int): number of posts to recommend (capped at ``len(corpus)``)

    Returns a DataFrame indexed by ``Comment`` with a single ``Similarity``
    column (the cosine score as a plain float), in the order produced by
    ``torch.topk`` (highest score first). Also prints the query and a header
    line.
    """
    # Never ask topk for more results than there are corpus entries.
    top_k = min(n, len(corpus))
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding; [0]
    # selects the score row belonging to the single query.
    cos_scores = util.pytorch_cos_sim(query_embedding, sarcasm_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("Query:", query)
    # Report the number of rows actually returned, which may be fewer than n.
    print("\nTop {n} most similar sentences in corpus:".format(n=top_k))

    # float() unwraps each single-element score tensor so the Similarity
    # column holds real numbers rather than 0-d arrays (object dtype).
    recommend_frame = [
        {'Comment': corpus[idx], 'Similarity': float(score)}
        for score, idx in zip(top_results[0], top_results[1])
    ]

    # Naming the columns keeps set_index working even when no rows are returned.
    df = pd.DataFrame(recommend_frame, columns=['Comment', 'Similarity'])
    return df.set_index(['Comment'])

In [93]:
# Control list: the plain top-N most similar comments, with no TDE re-ranking.
# Using N instead of a literal 10 keeps it the same length as the TDE list.
df_control = get_similar_posts('Obama is NOT american', N)
df_control

Query: Obama is NOT american

Top 10 most similar sentences in corpus:


Unnamed: 0_level_0,Similarity
Comment,Unnamed: 1_level_1
Let's not forget Obama's not American.,0.797965
He isn't American,0.76655585
Not Obama?,0.7615357
Well we got Obama and he's not american born,0.75621384
That's not Obama,0.7377236
Obama is American..?,0.7302852
He must not be American,0.72961265
Obama is not a real human being,0.72588533
Not according to Obama...,0.70943683
American is not a race,0.69935656


In [94]:
# Mean pairwise distance between the rows of df_control. Its only column is
# Similarity, so each pairwise (Euclidean) distance is the absolute difference
# between two similarity scores.
# NOTE(review): this measures the spread of query-similarity scores, not the
# distance between the items' embeddings -- confirm this is the intended metric.
n = len(df_control)  # derived from the list itself rather than hard-coded
if n > 1:
    dis_similarity = list(pdist(df_control))
    # n * (n - 1) / 2 is the number of unordered pairs returned by pdist.
    total_dissim_control = sum(dis_similarity) / (n * (n - 1) / 2)
else:
    # Fewer than two items: there are no pairs to compare.
    dis_similarity = []
    total_dissim_control = 0.0
total_dissim_control

0.034506728914048934

In [95]:
# Relative change of the TDE list's mean dissimilarity versus the control
# list, expressed as a percentage (negative means the TDE list scored lower).
dissim_delta = total_dissim_TDE - total_dissim_control
percent_change = (dissim_delta / total_dissim_control) * 100
round(percent_change, 2)

-17.68

In [None]:
### END OF NOTEBOOK ###