### Install Sentence Transformers

- `-q` : quiet installation (reduced output)
- `-U` : upgrade the package if it is already installed

In [1]:
!pip install -qU sentence-transformers

## Import packages

- pandas : to work with tables
- json : to work with JSON files (tweet timeline data)
- datetime : to work with dates (process tweet timestamps)
- math : to compute formulas (exponential decay weights)
- numpy : array operations on the embeddings
- tqdm : for progress bars
- seaborn : plotting
- matplotlib : plotting
- sentence_transformers : embedding model
- sklearn : pairwise metrics

In [2]:
import json
import math
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm

# Star import kept last, as in the original cell, so name shadowing is unchanged.
from sklearn.metrics.pairwise import *



## Utility code for dynamic embedding generation

In [3]:
%%capture
# Locations of the Kaggle input files read by this cell.
USER_DATA_PATH = '/kaggle/input/twitter-timeline-data/user_data.csv'
TIMELINES_PATH = '/kaggle/input/twitter-timeline-data/timelines/timelines.json'

# Profile table, split into the text columns used for embeddings and the metadata columns.
user_data = pd.read_csv(USER_DATA_PATH)
user_columns = ['name','screen_name','location','description']
meta_columns = ['followers_count','friends_count','created_at']
user_data_1 = user_data[user_columns]
meta_data = user_data[meta_columns]

# Use a context manager so the file handle is closed deterministically;
# JSON is read as UTF-8 regardless of the platform's default locale encoding.
with open(TIMELINES_PATH, encoding='utf-8') as timelines_file:
    timeline_data = json.load(timelines_file)

# Tweet fields of interest (kept for reference; not referenced elsewhere in the visible notebook).
tweet_columns = ['created_at','text','entities["hashtags","user_mentions"]','source','user["screen_name"]','place']
def get_timeline_data(screen_name, timelines=None):
    """Return the tweets of one user as a DataFrame.

    Each entry of ``timelines`` is a list of tweet dicts. A timeline is
    attributed to the user found in its FIRST tweet (``tweet['user']['screen_name']``);
    the first timeline whose owner equals ``screen_name`` is used and the search stops.

    Parameters
    ----------
    screen_name : str
        Screen name to look up.
    timelines : list of list of dict, optional
        Timelines to search. Defaults to the notebook-level ``timeline_data``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``created_at`` (parsed datetime),
        ``text``, ``hashtags`` and ``mentions`` (space-joined strings),
        ``source`` (client label without HTML markup) and ``place``.
        An empty DataFrame is returned when no timeline matches.
    """
    if timelines is None:
        timelines = timeline_data
    date_format = '%a %b %d %H:%M:%S %z %Y'

    def _source_label(raw_source):
        # The source field is normally an HTML anchor; keep the text between the
        # first '>' and the following '<'. Unlike str.index, this does not raise
        # when the markup is absent, and it does not truncate the label when the
        # closing tag is missing.
        if not raw_source:
            return ''
        start = raw_source.find('>')
        if start == -1:
            return raw_source
        end = raw_source.find('<', start)
        if end == -1:
            return raw_source[start+1:]
        return raw_source[start+1:end]

    for timeline in timelines:
        if len(timeline)==0:
            continue
        if timeline[0]['user']['screen_name']!=screen_name:
            continue
        rows = [
            {
                "created_at":datetime.strptime(tweet['created_at'], date_format),
                "text":tweet['text'],
                "hashtags":" ".join([h['text'] for h in tweet['entities']['hashtags']]),
                "mentions":" ".join([m['screen_name'] for m in tweet['entities']['user_mentions']]),
                "source":_source_label(tweet['source']),
                "place":tweet['place']
            }
            for tweet in timeline
        ]
        # Only the first matching timeline is used.
        return pd.DataFrame(rows)
    return pd.DataFrame([])
def exponential_decay_weights(datetime_list, alpha):
    """Exponential time-decay weights relative to the most recent timestamp.

    Each weight is ``exp(-alpha * age_in_whole_days)`` where the age is measured
    from the latest timestamp in ``datetime_list``, so the newest item gets
    weight 1.0 and older items get smaller weights.

    Parameters
    ----------
    datetime_list : iterable of datetime-like
        Timestamps in any order (a list or a pandas Series both work).
    alpha : float
        Decay rate per day.

    Returns
    -------
    list of float
        One weight per input timestamp, in input order; ``[]`` for empty input.
    """
    # Materialise first: this works for a Series with any index and allows the emptiness check.
    timestamps = list(datetime_list)
    if not timestamps:
        return []
    # Use the true maximum instead of assuming element 0 is the newest; otherwise
    # an unsorted input would produce negative ages and weights above 1.
    latest_datetime = max(timestamps)
    return [math.exp(-alpha * (latest_datetime - ts).days) for ts in timestamps]
# Sentence-embedding model shared by every model.encode() call in this notebook.
model = SentenceTransformer('distiluse-base-multilingual-cased')

## Generate Tweet Embeddings of all the tweets

In [4]:
# Gather the text of every tweet of every distinct user, then embed the whole
# corpus in one call. Index i of all_tweets corresponds to row i of all_tweet_embeddings.
unique_screen_names = user_data['screen_name'].drop_duplicates().tolist()
all_tweets = []
for account in tqdm(unique_screen_names):
    account_timeline = get_timeline_data(account)
    if len(account_timeline) == 0:
        # Users without a timeline contribute no tweets.
        continue
    all_tweets += account_timeline['text'].tolist()
all_tweet_embeddings = model.encode(all_tweets)

  0%|          | 0/1058 [00:00<?, ?it/s]

Batches:   0%|          | 0/5902 [00:00<?, ?it/s]

## Static Profile Embedding
- not weighted
- concatenation of the user embedding with the unweighted sum of K tweet embeddings (K = 10 by default; the last K rows of the user's timeline table are used)

In [5]:
def make_static_embeddings(screen_name, k = 10):
    """Build the static (unweighted) profile embedding of a user.

    The result is the concatenation of
    1. the embedding of the user's profile text (name, screen name, location and
       description joined by spaces, missing values replaced by ''), and
    2. the plain sum of the embeddings of ``text + ' ' + source`` for the last
       ``k`` rows of the user's timeline frame.

    When the user has no timeline, or when ``k`` is not positive, part 2 is the
    embedding of the placeholder string 'NULL'.

    Parameters
    ----------
    screen_name : str
        Must be present in ``user_data_1['screen_name']`` (IndexError otherwise).
    k : int, default 10
        Number of timeline rows to aggregate.

    Returns
    -------
    numpy.ndarray
        1-D vector: profile embedding followed by the aggregated tweet embedding.
    """
    profile_row = user_data_1[user_data_1['screen_name']==screen_name].iloc[0].fillna('')
    user_embedding = model.encode(' '.join(profile_row.tolist()), show_progress_bar = False)

    timeline_df = get_timeline_data(screen_name)
    # k <= 0 must not reach the slice below: list[-0:] is the WHOLE list, not an empty one.
    if len(timeline_df) == 0 or k <= 0:
        placeholder = model.encode(['NULL'], show_progress_bar = False)
        return np.concatenate([user_embedding, placeholder[0]])

    tweet_texts = (timeline_df['text']+' '+timeline_df['source']).tolist()[-k:]
    tweet_embeddings = np.asarray(model.encode(tweet_texts, show_progress_bar = False, batch_size = 128))
    # Unweighted aggregation: element-wise sum over the selected tweets.
    agg_embedding = tweet_embeddings.sum(axis = 0)
    return np.concatenate([user_embedding, agg_embedding])

In [6]:
# One static profile vector per distinct screen name, stacked in the order in
# which the screen names first appear in user_data.
all_static_embeddings = np.array([
    make_static_embeddings(account)
    for account in tqdm(user_data['screen_name'].drop_duplicates().tolist())
])

  0%|          | 0/1058 [00:00<?, ?it/s]

## Dynamic Profile Embedding
- Weighted
- concatenation of user embedding + weighted sum of user timeline tweets

In [7]:
def make_dynamic_embeddings(screen_name, alpha = 0.05):
    """Build the dynamic (time-weighted) profile embedding of a user.

    The result is the concatenation of
    1. the embedding of the user's profile text (name, screen name, location and
       description joined by spaces, missing values replaced by ''), and
    2. the weighted sum of the embeddings of ``text + ' ' + source`` for every
       tweet in the user's timeline, weighted by ``exponential_decay_weights``
       applied to the tweets' ``created_at`` values.

    When the user has no timeline, part 2 is the embedding of the placeholder
    string 'NULL'.

    Parameters
    ----------
    screen_name : str
        Must be present in ``user_data_1['screen_name']`` (IndexError otherwise).
    alpha : float, default 0.05
        Decay rate forwarded to ``exponential_decay_weights``.

    Returns
    -------
    numpy.ndarray
        1-D vector with the same dtype as the encoder output: profile embedding
        followed by the weighted tweet embedding.
    """
    profile_row = user_data_1[user_data_1['screen_name']==screen_name].iloc[0].fillna('')
    user_embedding = model.encode(' '.join(profile_row.tolist()), show_progress_bar = False)

    timeline_df = get_timeline_data(screen_name)
    if len(timeline_df) == 0:
        placeholder = model.encode(['NULL'], show_progress_bar = False)
        return np.concatenate([user_embedding, placeholder[0]])

    # Pass a plain list (as make_static_embeddings does) rather than a pandas Series.
    tweet_texts = (timeline_df['text']+' '+timeline_df['source']).tolist()
    timeline_embedding = np.asarray(model.encode(tweet_texts, show_progress_bar = False, batch_size = 128))

    # Keep the weights in the encoder's dtype. With NumPy >= 2 promotion rules a
    # float64 weight times a float32 embedding yields float64, which would make the
    # profile vectors disagree in dtype with the tweet embeddings they are compared to.
    weights = np.asarray(
        exponential_decay_weights(timeline_df['created_at'], alpha),
        dtype = timeline_embedding.dtype,
    )
    weighted_embedding = (weights[:, None] * timeline_embedding).sum(axis = 0)
    return np.concatenate([user_embedding, weighted_embedding])

In [8]:
# One dynamic profile vector per distinct screen name, stacked in the order in
# which the screen names first appear in user_data.
all_dynamic_embeddings = np.array([
    make_dynamic_embeddings(account)
    for account in tqdm(user_data['screen_name'].drop_duplicates().tolist())
])

  0%|          | 0/1058 [00:00<?, ?it/s]

## Recommendation System

- Takes as input the username and the number of tweets to recommend
- Outputs the tweet recommendations based on:
    - Static Embeddings
    - Dynamic Embeddings

In [9]:
def recommend_tweets(username, top_k = 5):
    """Rank the tweet corpus against a user's static and dynamic profiles.

    Every tweet embedding is duplicated side by side so that it has the same
    length as a profile vector (profile part + timeline part); the profiles are
    then used as queries for ``util.semantic_search`` over that corpus.

    Parameters
    ----------
    username : str
        Screen name whose profiles are built with ``make_static_embeddings``
        and ``make_dynamic_embeddings``.
    top_k : int, default 5
        Number of hits requested per profile.

    Returns
    -------
    tuple
        ``(static_recommendations, dynamic_recommendations)`` exactly as
        returned by ``util.semantic_search``.
    """
    profile_queries = (
        make_static_embeddings(username),
        make_dynamic_embeddings(username),
    )

    # NOTE(review): this doubled copy of the full corpus is rebuilt on every call.
    doubled_corpus = np.concatenate([all_tweet_embeddings, all_tweet_embeddings], axis = 1)

    static_recommendations, dynamic_recommendations = [
        util.semantic_search(query, doubled_corpus, top_k = top_k)
        for query in profile_queries
    ]
    return static_recommendations, dynamic_recommendations

In [10]:
# Example run with the default top_k of 5: s holds the static results, r the dynamic results.
s,r = recommend_tweets("RahulGandhi")

In [11]:
# Print both hit lists for the single query: tweet text, similarity score, blank line.
for heading, hits in (("Static Recommendations", s[0]), ("Dynamic Recommendations", r[0])):
    print(heading)
    for hit in hits:
        print("Tweet:"+all_tweets[hit['corpus_id']])
        print("Score:",hit['score'])
        print()
    print()

Static Recommendations
Tweet:RT @ajitanjum: राहुल गांधी ने आज मोदी सरकार के लिए वो सब कहा, जो आज तक किसी ने नहीं कहा था ...
Link 
https://t.co/uZ3x2iZl7w https://t.co/X…
Score: 0.4711124897003174

Tweet:RT @NewsDayZimbabwe: 🔴Zimbabwe Republic Police have stopped Harare mayor Jacob Mafume from commissioning Rufaro this morning.
https://t.co/…
Score: 0.4709045886993408

Tweet:RT @INCBANSAL: शंकराचार्य जी ने हिला डाला सरकार के झूठे प्रचार को https://t.co/SJ1HJRnHgF
Score: 0.4690735638141632

Tweet:PM removed the CBI Director to stop him from investigating Rafale. 

Mr 56 broke the law when he bypassed CJI &amp; LOP… https://t.co/eTbYFYD5Y0
Score: 0.4684559106826782

Tweet:RT @INCBANSAL: चंद्रयान-3 में बहुत बड़ा घोटाला भी  सामने लाकर रखा है? 

राहुल गांधी जी सही बोलते हैं पूंजीपतियों की सरकार है https://t.co/p…
Score: 0.46752190589904785


Dynamic Recommendations
Tweet:RT @INCIndia: जननायक @RahulGandhi जी आज हरियाणा में पहलवानों के बीच पहुंचे। https://t.co/O3QqZFO2lA
Score: 0.5537458062171

In [12]:
# The value printed as "Cumulative Score" is the arithmetic mean of the hit scores.
static_scores = [hit['score'] for hit in s[0]]
print("Cumulative Score for static Embeddings:", sum(static_scores)/len(static_scores))
dynamic_scores = [hit['score'] for hit in r[0]]
print("Cumulative Score for Dynamic Embeddings:", sum(dynamic_scores)/len(dynamic_scores))

Cumulative Score for static Embeddings: 0.4694136917591095
Cumulative Score for Dynamic Embeddings: 0.5392309904098511


In [16]:
# Run the recommender for the last 12 screen names of the profile table.
# After this cell s[j] / r[j] hold the static / dynamic results for users[j].
users = user_data_1['screen_name'].tail(12).tolist()
s, r = [], []
for account in users:
    print(account)
    static_result, dynamic_result = recommend_tweets(account, top_k = 5)
    s.append(static_result)
    r.append(dynamic_result)

Abhishek88242
RahulGandhi
elonmusk
Cristiano
BillGates
SrBachchan
imVkohli
BarackObama
EmmaWatson
jk_rowling
serenawilliams
narendramodi


In [17]:
# Print both hit lists for the first user of the batch (users[0]).
for heading, hits in (("Static Recommendations", s[0][0]), ("Dynamic Recommendations", r[0][0])):
    print(heading)
    for hit in hits:
        print("Tweet:"+all_tweets[hit['corpus_id']])
        print("Score:",hit['score'])
        print()
    print()

Static Recommendations
Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students desreves equal opportunity. #Justice_For_Horticulture_Students
#Listen_…
Score: 0.6258661150932312

Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students Deserves Equal opportunity.
#Justice_For_Horticulture_Students
#Listen_…
Score: 0.6225931644439697

Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students Deserves Equal Opportunity. Revise the Eligibility for the post of SDAO…
Score: 0.5585254430770874

Tweet:RT @Nikhilr48915275: #Justicefor_horticulturist 
@officecmbihar  @NitishKumar
@TejYadav14  @yadavtejashwi
Nitish kumar के ड्रीम कॉलेज से प…
Score: 0.5180680155754089

Tweet:RT @MalawatBk: @kaankit Ye post ncte ki h to students ncte pr bharosa kre ya nhi kre https://t.co/g6Njgl8QGi
Score: 0.4934823513031006


Dynamic Recommendations
Tweet:RT @Prafull08038221: https://t.co/QiOccipzw1. Horticulture Students desreves equal opportunity. #Justice_For_

In [18]:
# Compare mean hit scores per user. Only users whose dynamic mean is strictly
# higher than their static mean are printed; all other users are skipped silently.
for idx, account in enumerate(users):
    static_hits = s[idx][0]
    dynamic_hits = r[idx][0]
    static_mean = sum([hit['score'] for hit in static_hits])/len(static_hits)
    dynamic_mean = sum([hit['score'] for hit in dynamic_hits])/len(dynamic_hits)
    if dynamic_mean > static_mean:
        print("Score for,", account)
        print("Cumulative Score for Static Embeddings:", static_mean)
        print("Cumulative Score for Dynamic Embeddings:", dynamic_mean)
        print()

Score for, RahulGandhi
Cumulative Score for Static Embeddings: 0.4694136917591095
Cumulative Score for Dynamic Embeddings: 0.5392309904098511

Score for, elonmusk
Cumulative Score for Static Embeddings: 0.5163802564144134
Cumulative Score for Dynamic Embeddings: 0.5286215901374817

Score for, BillGates
Cumulative Score for Static Embeddings: 0.45171797275543213
Cumulative Score for Dynamic Embeddings: 0.4849822521209717

Score for, SrBachchan
Cumulative Score for Static Embeddings: 0.513542890548706
Cumulative Score for Dynamic Embeddings: 0.5409840822219849

Score for, jk_rowling
Cumulative Score for Static Embeddings: 0.46547062397003175
Cumulative Score for Dynamic Embeddings: 0.5355152606964111

Score for, serenawilliams
Cumulative Score for Static Embeddings: 0.43730775117874143
Cumulative Score for Dynamic Embeddings: 0.46299226880073546



## Inference

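- For the users printed above, the tweets recommended using the dynamic profile embeddings appear more relevant.
- For these users, the mean similarity score (labelled "Cumulative Score" in the output) of the dynamic-embedding recommendations is higher than that of the static-embedding recommendations, which suggests that the recommended tweets are more appropriate when dynamic embeddings are used. Note that the comparison cell prints only the users for which the dynamic score is higher (6 of the 12 users evaluated), so this result does not hold for every user.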

## Engagement and Reach Prediction Framework

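- currently uses the pool of 1058 users, but can be scaled to the whole platform
- takes as input the content of a tweet whose engagement metrics need to be analysed
- outputs the predicted reach (the total follower count of the users whose profile embedding has a positive cosine similarity with the tweet) and the predicted engagement (the number of such users)
- compares the predictions obtained from the Static and the Dynamic Profiles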

In [13]:
def engagement_reach_pred(tweet_content, threshold = 0):
    """Estimate reach and engagement of a tweet against the stored user profiles.

    The tweet embedding is duplicated so that it has the same length as a
    profile vector, and its cosine similarity to every static and every dynamic
    profile is computed. A user counts as matched when the similarity is
    strictly greater than ``threshold``.

    Parameters
    ----------
    tweet_content : str
        Text of the tweet to analyse.
    threshold : float, default 0
        Similarity above which a user is counted.

    Returns
    -------
    tuple
        ``((static_reach, static_count), (dynamic_reach, dynamic_count))`` where
        ``reach`` is the sum of ``followers_count`` over the matched users and
        ``count`` is the number of matched users.
    """
    tweet_embedding = model.encode(tweet_content, show_progress_bar = False)
    tweet_embedding = np.concatenate([tweet_embedding, tweet_embedding])

    # Row i of the profile matrices was built for the i-th distinct screen name of
    # user_data, so one follower array in that same order replaces a DataFrame
    # filter per matched user. The first row per screen name is used, as before.
    followers = user_data.drop_duplicates('screen_name')['followers_count'].to_numpy()

    def _reach_and_count(profile_embeddings):
        # Align dtypes before the similarity call: torch matrix products require
        # both operands to share a dtype. This is a no-op when they already match.
        profiles = np.asarray(profile_embeddings, dtype = tweet_embedding.dtype)
        similarities = util.cos_sim(tweet_embedding, profiles).numpy()[0]
        matched = np.where(similarities > threshold)[0]
        # .item() converts the NumPy scalar to a plain Python number.
        return followers[matched].sum().item(), len(matched)

    return _reach_and_count(all_static_embeddings), _reach_and_count(all_dynamic_embeddings)

In [14]:
# Example 1: predicted reach and engagement under both profile types.
tweet = "Who will win 2025 elections?"
static_d, dynamic_d = engagement_reach_pred(tweet)
static_reach, static_engagement = static_d
dynamic_reach, dynamic_engagement = dynamic_d

print(tweet)
print()
print("Static Reach Prediction:",static_reach)
print("Static Engagement Prediction:",static_engagement)
print("Dynamic Reach Prediction:",dynamic_reach)
print("Dynamic Engagement Prediction:",dynamic_engagement)

Who will win 2025 elections?

Static Reach Prediction: 729894011
Static Engagement Prediction: 1038
Dynamic Reach Prediction: 757323667
Dynamic Engagement Prediction: 1044


In [15]:
# Example 2: predicted reach and engagement under both profile types.
tweet = "I like playing golf"
static_d, dynamic_d = engagement_reach_pred(tweet)
static_reach, static_engagement = static_d
dynamic_reach, dynamic_engagement = dynamic_d

print(tweet)
print()
print("Static Reach Prediction:",static_reach)
print("Static Engagement Prediction:",static_engagement)
print("Dynamic Reach Prediction:",dynamic_reach)
print("Dynamic Engagement Prediction:",dynamic_engagement)

I like playing golf

Static Reach Prediction: 732498050
Static Engagement Prediction: 984
Dynamic Reach Prediction: 732498070
Dynamic Engagement Prediction: 977
