### Install Sentence Transformers

- q : Quiet installation (no output)
- U : Update if already installed

In [2]:
!pip install -qU sentence-transformers

## Import packages

- pandas : to work with tables
- json : to work with JSON files (tweet timeline data)
- datetime : to work with dates (process tweet timestamps)
- math : for the exponential-decay weight formula
- tqdm : for progress bars
- seaborn : plotting
- matplotlib : plotting
- sentence_transformers : embedding model
- sklearn : pairwise metrics

In [3]:
import json
import math
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm
# Kept last, as in the original cell, so the names it brings in resolve exactly as before.
from sklearn.metrics.pairwise import *

## Utility code for dynamic embedding generation (with 'all-MiniLM-L12-v2')

In [4]:
!pwd

/kaggle/working


In [5]:
# %%capture
# Load the user table and split it into the text columns used for profile
# embeddings (user_data_1) and the numeric/meta columns (meta_data).
user_data = pd.read_csv('/kaggle/input/twitter-timeline-data/user_data.csv')
user_columns = ['name','screen_name','location','description']
meta_columns = ['followers_count','friends_count','created_at']
user_data_1 = user_data[user_columns]
meta_data = user_data[meta_columns]

# Read the raw timelines through a context manager so the file handle is
# closed as soon as parsing finishes (the bare open() call never closed it).
with open('/kaggle/input/twitter-timeline-data/timelines/timelines.json', encoding='utf-8') as timeline_file:
    timeline_data = json.load(timeline_file)

# Tweet fields of interest; kept for reference, get_timeline_data reads these
# fields directly from each tweet dict.
tweet_columns = ['created_at','text','entities["hashtags","user_mentions"]','source','user["screen_name"]','place']
def _source_label(source):
    """Return the text between the first '>' and the next '<' of a tweet source.

    Falls back gracefully instead of raising or truncating: a missing/empty
    source yields '', a source without any '>' is returned unchanged, and an
    unterminated tag keeps everything after the '>'.
    """
    if not source:
        return ''
    start = source.find('>')
    if start == -1:
        return source
    end = source.find('<', start)
    if end == -1:
        return source[start + 1:]
    return source[start + 1:end]


def get_timeline_data(screen_name, timelines=None):
    """Collect the tweets of one user into a DataFrame.

    Parameters
    ----------
    screen_name : str
        Screen name to look up. A timeline belongs to the user when the
        ``user.screen_name`` of its first tweet equals this value.
    timelines : list of list of dict, optional
        Timelines to search. Defaults to the notebook-level ``timeline_data``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet of the first matching timeline, in stored order,
        with columns ``created_at`` (parsed datetime), ``text``, ``hashtags``
        and ``mentions`` (space-joined strings), ``source`` (label extracted
        from the source markup) and ``place``. The frame is empty, with no
        columns, when no timeline matches.
    """
    if timelines is None:
        timelines = timeline_data
    date_format = '%a %b %d %H:%M:%S %z %Y'

    rows = []
    for timeline in timelines:
        # Empty timelines carry no user information, so they can never match.
        if not timeline or timeline[0]['user']['screen_name'] != screen_name:
            continue
        for tweet in timeline:
            entities = tweet['entities']
            rows.append({
                "created_at": datetime.strptime(tweet['created_at'], date_format),
                "text": tweet['text'],
                "hashtags": " ".join(tag['text'] for tag in entities['hashtags']),
                "mentions": " ".join(user['screen_name'] for user in entities['user_mentions']),
                "source": _source_label(tweet['source']),
                "place": tweet['place'],
            })
        # Only the first matching timeline is used.
        break
    return pd.DataFrame(rows)
def exponential_decay_weights(datetime_list, alpha):
    """Weight each timestamp by its age relative to the newest one.

    The weight of a timestamp is ``exp(-alpha * age_in_whole_days)``, where
    the age is measured from the most recent timestamp in ``datetime_list``.
    The newest entry therefore always gets weight 1.0 and, for a positive
    ``alpha``, no weight exceeds 1.0, whatever the input order.

    Parameters
    ----------
    datetime_list : sequence of datetime-like
        Timestamps (a list or a pandas Series); may be empty.
    alpha : float
        Decay rate per day.

    Returns
    -------
    list of float
        One weight per timestamp, in input order; ``[]`` for empty input.
    """
    # len() rather than truthiness: a pandas Series has no unambiguous bool value.
    if len(datetime_list) == 0:
        return []
    # Take the true maximum rather than element 0, so the result does not
    # depend on the input being sorted newest-first or on a label 0 existing.
    latest_datetime = max(datetime_list)
    return [math.exp(-alpha * (latest_datetime - dt).days) for dt in datetime_list]
# Shared sentence-embedding model; the profile, tweet and query embeddings in this notebook all come from model.encode.
model = SentenceTransformer('all-MiniLM-L12-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Generate Tweet Embeddings of all the tweets

In [6]:
# Gather the text of every tweet of every distinct user, then embed the whole
# corpus with a single encode call. Row i of all_tweet_embeddings corresponds
# to all_tweets[i].
unique_screen_names = user_data['screen_name'].drop_duplicates().tolist()
all_tweets = []
for user in tqdm(unique_screen_names):
    d = get_timeline_data(user)
    # Users without a timeline come back as an empty frame with no 'text' column.
    if not d.empty:
        all_tweets += d['text'].tolist()
all_tweet_embeddings = model.encode(all_tweets)

  0%|          | 0/1058 [00:00<?, ?it/s]

Batches:   0%|          | 0/5902 [00:00<?, ?it/s]

## Static Profile Embedding
- not weighted
- concatenation of the user embedding with the unweighted sum of K tweet embeddings (the last K rows of the user's timeline)

In [7]:
def make_static_embeddings(screen_name, k = 20):
    """Build the unweighted ("static") profile vector of a user.

    The vector is the embedding of the user's profile text (all columns of
    ``user_data_1`` joined with spaces, missing values as '') concatenated
    with the plain sum of the embeddings of ``k`` tweets, each encoded as
    ``text + ' ' + source``. When the user has no timeline, the embedding of
    the literal string 'NULL' stands in for the tweet part.

    Parameters
    ----------
    screen_name : str
        Screen name present in ``user_data_1``.
    k : int, default 20
        Number of timeline rows to aggregate.

    Returns
    -------
    numpy.ndarray
        1-D vector: profile embedding followed by the aggregated tweet embedding.

    Raises
    ------
    IndexError
        If ``screen_name`` does not occur in ``user_data_1``.
    """
    user_df = user_data_1[user_data_1['screen_name']==screen_name].iloc[0].fillna('')
    # The profile part is the same whether or not a timeline exists, so it is encoded once.
    user_embedding = model.encode(' '.join(user_df.tolist()), show_progress_bar = False)

    timeline_df = get_timeline_data(screen_name)
    if len(timeline_df) == 0:
        # Placeholder so users without tweets still get a vector of the same length.
        agg_embedding = model.encode(['NULL'], show_progress_bar = False)[0]
    else:
        # [-k:] keeps the LAST k rows of the timeline frame.
        # NOTE(review): if timelines are stored newest-first this selects the k
        # oldest tweets - confirm that is the intended selection.
        tweet_texts = (timeline_df['text']+' '+timeline_df['source']).tolist()[-k:]
        timeline_embedding = model.encode(tweet_texts, show_progress_bar = False, batch_size = 128)
        # Column-wise sum over the tweets, done in NumPy instead of a Python-level sum().
        agg_embedding = np.asarray(timeline_embedding).sum(axis=0)
    return np.concatenate([user_embedding, agg_embedding])

In [8]:
# One static profile vector per distinct screen name, stacked row-wise in the
# order the screen names first appear in user_data.
all_static_embeddings = np.array([
    make_static_embeddings(user)
    for user in tqdm(user_data['screen_name'].drop_duplicates().tolist())
])

  0%|          | 0/1058 [00:00<?, ?it/s]

## Dynamic Profile Embedding
- Weighted
- concatenation of user embedding + weighted sum of user timeline tweets

In [9]:
def make_dynamic_embeddings(screen_name, alpha = 0.05):
    """Build the time-weighted ("dynamic") profile vector of a user.

    The vector is the embedding of the user's profile text (all columns of
    ``user_data_1`` joined with spaces, missing values as '') concatenated
    with a weighted sum of the embeddings of every tweet in the timeline,
    each encoded as ``text + ' ' + source``. The weights come from
    ``exponential_decay_weights`` applied to the tweets' ``created_at``
    values. When the user has no timeline, the embedding of the literal
    string 'NULL' stands in for the tweet part.

    Parameters
    ----------
    screen_name : str
        Screen name present in ``user_data_1``.
    alpha : float, default 0.05
        Decay rate passed to ``exponential_decay_weights``.

    Returns
    -------
    numpy.ndarray
        1-D vector: profile embedding followed by the weighted tweet embedding.

    Raises
    ------
    IndexError
        If ``screen_name`` does not occur in ``user_data_1``.
    """
    user_df = user_data_1[user_data_1['screen_name']==screen_name].iloc[0].fillna('')
    # The profile part is the same whether or not a timeline exists, so it is encoded once.
    user_embedding = model.encode(' '.join(user_df.tolist()), show_progress_bar = False)

    timeline_df = get_timeline_data(screen_name)
    if len(timeline_df) == 0:
        # Placeholder so users without tweets still get a vector of the same length.
        placeholder = model.encode(['NULL'], show_progress_bar = False)[0]
        return np.concatenate([user_embedding, placeholder])

    # Pass a plain list of strings (as make_static_embeddings does) instead of
    # a pandas Series, so encoding does not depend on the Series' index labels.
    tweet_texts = (timeline_df['text']+' '+timeline_df['source']).tolist()
    timeline_embedding = np.asarray(
        model.encode(tweet_texts, show_progress_bar = False, batch_size = 128)
    )
    # Cast the weights to the embeddings' dtype so the result keeps that dtype
    # whatever scalar-promotion rules the installed NumPy applies.
    weights = np.asarray(
        exponential_decay_weights(timeline_df['created_at'], alpha),
        dtype=timeline_embedding.dtype,
    )
    weighted_embedding = (weights[:, None] * timeline_embedding).sum(axis=0)
    return np.concatenate([user_embedding, weighted_embedding])

In [10]:
# One dynamic profile vector per distinct screen name, stacked row-wise in the
# order the screen names first appear in user_data.
all_dynamic_embeddings = np.array([
    make_dynamic_embeddings(user)
    for user in tqdm(user_data['screen_name'].drop_duplicates().tolist())
])

  0%|          | 0/1058 [00:00<?, ?it/s]

In [11]:
# Persist embeddings as .npy files (relative paths, so they land in the current
# working directory). Only the static matrix is written at the moment; the
# tweet and dynamic saves are left disabled.
# np.save('all_tweet_embeddings.npy', all_tweet_embeddings)
np.save('all_static_embeddings_20.npy', all_static_embeddings)
# np.save('all_dynamic_embeddings.npy', all_dynamic_embeddings)

## Recommendation System

- Takes the username and the number of tweets to recommend as input
- Outputs the tweet recommendations based on the
    - Static Embeddings
    - Dynamic Embeddings

In [14]:
def recommend_tweets(username, top_k = 5):
    """Rank the tweet corpus against a user's static and dynamic profiles.

    Parameters
    ----------
    username : str
        Screen name passed to make_static_embeddings / make_dynamic_embeddings.
    top_k : int, default 5
        Number of hits requested from each search.

    Returns
    -------
    tuple
        ``(static_recommendations, dynamic_recommendations)``, each being the
        raw result of ``util.semantic_search`` for the respective profile.
    """
    profiles = (
        make_static_embeddings(username),
        make_dynamic_embeddings(username),
    )

    # A profile vector is two embeddings laid end to end, so every tweet
    # embedding is placed next to a copy of itself to get the same width.
    # NOTE(review): this doubled corpus is rebuilt on every call.
    doubled_corpus = np.hstack((all_tweet_embeddings, all_tweet_embeddings))

    static_hits, dynamic_hits = (
        util.semantic_search(profile, doubled_corpus, top_k = top_k)
        for profile in profiles
    )
    return static_hits, dynamic_hits

In [35]:
# Run both recommenders for the last 12 screen names in the user table.
# (To try a single random user instead: user_data_1['screen_name'].sample(1).tolist()[0])
users = user_data_1['screen_name'].iloc[-12:].tolist()

# s[n] / r[n] hold the static / dynamic search results for users[n].
s = []
r = []
for i in users:
    print(i)
    ts, tr = recommend_tweets(i, top_k = 5)
    s.append(ts)
    r.append(tr)

Abhishek88242
RahulGandhi
elonmusk
Cristiano
BillGates
SrBachchan
imVkohli
BarackObama
EmmaWatson
jk_rowling
serenawilliams
narendramodi


In [36]:
# Show the recommended tweets and their similarity scores for the first user
# in `users`, first from the static profile and then from the dynamic one.
for heading, hits in (
    ("Static Recommendations", s[0][0]),
    ("Dynamic Recommendations", r[0][0]),
):
    print(heading)
    for i in hits:
        print("Tweet:"+all_tweets[i['corpus_id']])
        print("Score:",i['score'])
        print()
    print()

Static Recommendations
Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students Deserves Equal opportunity.
#Justice_For_Horticulture_Students
#Listen_…
Score: 0.5924944281578064

Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students desreves equal opportunity. #Justice_For_Horticulture_Students
#Listen_…
Score: 0.5895190238952637

Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students Deserves Equal Opportunity. Revise the Eligibility for the post of SDAO…
Score: 0.5583329796791077

Tweet:RT @Komal52828678: Justice for b.ed students 🙏🙏🙏🙏🙏#BEDprotestforPRT
@Aamitabh2 @dpradhanbjp
@ASTHAKAUSHIK
@bstvLive
@EduMinOfIndia https://…
Score: 0.48357242345809937

Tweet:RT @AnkitYa26500481: We want justice #WeWantBEDInPRT
Score: 0.478013813495636


Dynamic Recommendations
Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students Deserves Equal opportunity.
#Justice_For_Horticulture_Students
#Listen_…
Score: 0.59249442815780

In [37]:
# # print("Cumulative Score for static Embeddings:", sum([i['score'] for i in k[0]])/len(s[0]))
# print("Cumulative Score for static Embeddings:", sum([sum([i['score'] for i in k[0]])/len(k[0]) for k in s])/len(s))
# # print("Cumulative Score for Dynamic Embeddings:", sum([i['score'] for i in r[0]])/len(r[0]))
# print("Cumulative Score for Dynamic Embeddings:", sum([sum([i['score'] for i in k[0]])/len(k[0]) for k in r])/len(r))

In [38]:
# Compare the mean similarity score of the static and dynamic recommendations
# per user. Only users whose dynamic mean is strictly higher are printed; the
# values labelled "Cumulative Score" are these per-user means.
for i, user_name in enumerate(users):
    static_scores = [hit['score'] for hit in s[i][0]]
    dynamic_scores = [hit['score'] for hit in r[i][0]]
    s_score = sum(static_scores)/len(static_scores)
    r_score = sum(dynamic_scores)/len(dynamic_scores)
    if not r_score > s_score:
        continue
    print("Score for,", user_name)
    print("Cumulative Score for Static Embeddings:", s_score)
    print("Cumulative Score for Dynamic Embeddings:", r_score)
    print()

Score for, RahulGandhi
Cumulative Score for Static Embeddings: 0.5371214151382446
Cumulative Score for Dynamic Embeddings: 0.582498562335968

Score for, BillGates
Cumulative Score for Static Embeddings: 0.4641103744506836
Cumulative Score for Dynamic Embeddings: 0.4756736159324646

Score for, SrBachchan
Cumulative Score for Static Embeddings: 0.5411620259284973
Cumulative Score for Dynamic Embeddings: 0.5603666663169861

Score for, BarackObama
Cumulative Score for Static Embeddings: 0.4793795824050903
Cumulative Score for Dynamic Embeddings: 0.505518639087677

Score for, jk_rowling
Cumulative Score for Static Embeddings: 0.4502240717411041
Cumulative Score for Dynamic Embeddings: 0.4620263695716858

Score for, narendramodi
Cumulative Score for Static Embeddings: 0.4856266677379608
Cumulative Score for Dynamic Embeddings: 0.5588220596313477



In [198]:
# NOTE(review): static_similarity_matrix, ground_truth_embeddings and min_samples
# are not defined in any cell visible in this notebook; this cell appears to rely
# on state from cells that are not shown - confirm it survives Restart & Run All.
print("static_similarity_matrix shape:", static_similarity_matrix.shape)
print("ground_truth_embeddings[:min_samples, :] shape:", ground_truth_embeddings[:min_samples, :].shape)



static_similarity_matrix shape: (1058, 1058)
ground_truth_embeddings[:min_samples, :] shape: (1058, 768)


## Inference

- we can see the recommended tweets using dynamic profile embeddings are more relevant 
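- the score comparison above prints only the users whose mean dynamic-embedding score is higher than their static one (6 of the 12 users tested); for those users the cumulative score for dynamic embedding recommendations is higher than for static embedding recommendations, which suggests, but does not prove, that the recommended tweets are more appropriate when dynamic embedding is used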

## Engagement and Reach Prediction Framework

- currently uses the pool of 1058 users, but can be scaled to the whole platform
- takes as input the content of a tweet whose engagement metrics need to be analysed
- outputs the predicted number of impressions and engagements
- comparison for both Static and Dynamic Profiles

In [158]:
def engagement_reach_pred(tweet_content, threshold = 0):
    """Estimate reach and engagement of a tweet from profile similarity.

    The tweet is embedded, and a user counts as "engaging" when the cosine
    similarity between the tweet and that user's profile embedding is
    strictly greater than ``threshold``. Reach is the sum of
    ``followers_count`` over the engaging users. The estimate is made once
    with ``all_static_embeddings`` and once with ``all_dynamic_embeddings``.

    Parameters
    ----------
    tweet_content : str
        Text of the tweet to analyse.
    threshold : float, default 0
        Similarity a profile must exceed to count as engaging.

    Returns
    -------
    tuple
        ``((static_reach, static_count), (dynamic_reach, dynamic_count))``.
    """
    tweet_embedding = model.encode(tweet_content, show_progress_bar = False)
    # A profile vector is two embeddings laid end to end, so the tweet vector
    # is repeated to get the same width.
    tweet_embedding = np.concatenate([tweet_embedding, tweet_embedding])

    # Followers of the first row per screen name, in first-appearance order.
    # This is the order in which the profile matrices were built, so position
    # i here belongs to row i of all_static_embeddings / all_dynamic_embeddings.
    followers = user_data.drop_duplicates('screen_name')['followers_count'].to_numpy()

    def _reach_and_count(profile_embeddings):
        """Return (summed followers, number) of the profiles above the threshold."""
        similarities = util.cos_sim(tweet_embedding, profile_embeddings).numpy()[0]
        engaged = similarities > threshold
        # One vectorised lookup instead of filtering user_data once per user.
        return followers[engaged].sum().item(), int(engaged.sum())

    return _reach_and_count(all_static_embeddings), _reach_and_count(all_dynamic_embeddings)

**Features**
- This function will predict the number of engagements with the given tweet and number of accounts reached
- It works by predicting the engagement with a user by setting a threshold value for the similarity score between the user's profile embedding and the tweet content embedding
- The users passing the threshold scores are predicted as the ones engaging with the tweet
- The sum of followers of the users will be predicted as the total reach of the tweet
- Added functionality to compare static and dynamic profile embeddings

**Blockers**
- Very little data, with no information about the followers
- Unavailability of user activity data (posts liked/retweeted by a user)