In [1]:
import src.dependencies.injector as sdi
from src.shared.logger_factory import LoggerFactory
from src.process.data_cleaning.data_cleaning_distributions import jaccard_similarity
from src.shared.utils import get_project_root
from src.model.local_neighbourhood import LocalNeighbourhood
# Just for type signatures
from typing import List
from src.model.user import User
from src.model.social_graph.social_graph import SocialGraph
from src.model.cluster import Cluster
import argparse

# Logger shared by every cell in this notebook.
log = LoggerFactory.logger(__name__)

# Location of the YAML config handed to the dependency injector, built from
# the project root so it does not depend on the current working directory.
DEFAULT_PATH = (
    str(get_project_root())
    + "/src/scripts/config/create_social_graph_and_cluster_config.yaml"
)

In [4]:
# Build the injector from the YAML config and pull out the modules it exposes.
injector = sdi.Injector.get_injector_from_file(DEFAULT_PATH)
process_module = injector.get_process_module()
dao_module = injector.get_dao_module()

# One activity getter per activity name; get_user_activities() dispatches on
# these keys.
user_activity_getters = {
    activity: dao_module.get_user_activity_getter(user_activity=activity)
    for activity in ("friends", "user retweets", "user retweets ids")
}

# Getters for user records and for a user's friend ids.
user_getter = dao_module.get_user_getter()
user_friend_getter = dao_module.get_user_friend_getter()

def get_user_activities(user_id: str, user_activity: str):
    """Fetch one kind of activity for one user.

    Args:
        user_id: Id of the user whose activity is requested.
        user_activity: Key into ``user_activity_getters`` selecting which
            getter to use.

    Returns:
        Whatever the selected getter's ``get_user_activity(user_id)`` returns.

    Raises:
        KeyError: If ``user_activity`` is not a key of
            ``user_activity_getters``.
    """
    activity_getter = user_activity_getters[user_activity]
    return activity_getter.get_user_activity(user_id)

In [3]:
def compute_activity_similarity(user1: str, user2: str, user_activity: str):
    """Compare two users on a single kind of activity.

    Args:
        user1: Id of the first user.
        user2: Id of the second user.
        user_activity: Activity name understood by ``get_user_activities``.

    Returns:
        The result of ``jaccard_similarity`` applied to the two users'
        activity for ``user_activity``.
    """
    # Fetch in (user1, user2) order, then hand both to the similarity helper.
    first_activity, second_activity = (
        get_user_activities(uid, user_activity) for uid in (user1, user2)
    )
    return jaccard_similarity(first_activity, second_activity)

def compute_retweeted_friends_similarity(user1: str, user2: str):
    """Compare two users on the friends they have retweeted.

    For each user, the "user retweets" activity is intersected with that
    user's friend ids; the two resulting lists are then passed to
    ``jaccard_similarity``.

    Args:
        user1: Id of the first user.
        user2: Id of the second user.

    Returns:
        The result of ``jaccard_similarity`` on the two retweeted-friends
        lists.
    """
    def retweeted_friends(uid: str) -> list:
        # Entries of the user's "user retweets" activity that are also
        # among the user's friend ids.
        retweeted = get_user_activities(uid, "user retweets")
        friends = user_friend_getter.get_user_friends_ids(uid)
        return list(set(retweeted).intersection(set(friends)))

    return jaccard_similarity(retweeted_friends(user1), retweeted_friends(user2))

In [5]:
# Clean users function from local_neighbourhood_downloader.py
def clean_user_friends_global(self, user_id, friends_list=None):
    """Shrink a large friends list by dropping low-follower/low-friend accounts.

    While more than 1000 friends remain, the original ``friends_list`` is
    re-filtered, keeping only friends whose ``followers_count`` and
    ``friends_count`` both exceed ``t`` times the seed user's own counts.
    ``t`` starts at 0.1 and is raised by 0.05 after every pass. Lists of 1000
    or fewer entries are returned untouched.

    The function was lifted out of a class, so its first parameter is still
    named ``self``. Both calling conventions are accepted:

    * ``clean_user_friends_global(obj, user_id, friends_list)`` -- users are
      looked up through ``obj.user_getter``.
    * ``clean_user_friends_global(user_id, friends_list)`` -- users are looked
      up through the notebook's module-level ``user_getter``.

    Args:
        self: Object exposing ``user_getter`` (three-argument form), or the
            seed user id (two-argument form).
        user_id: Seed user id (three-argument form), or the friends list
            (two-argument form).
        friends_list: Friend ids to clean; omitted in the two-argument form.

    Returns:
        tuple: ``(clean_friends_list, t)`` where ``t`` is the strength that
        the next pass would have used (0.1 if no pass ran).
    """
    if friends_list is None:
        # Two-argument call: the positional values arrived one slot early.
        user_id, friends_list = self, user_id
        getter = user_getter
    else:
        getter = self.user_getter

    user = getter.get_user_by_id(str(user_id))
    log.info("Cleaning Friends List by Follower and Friend")
    t = 0.1

    num_users = len(friends_list)
    clean_friends_list = friends_list
    if num_users <= 1000:
        return clean_friends_list, t

    if user is None:
        # Thresholds are derived from the seed user's counts; without the
        # seed user there is nothing to compare against.
        log.info(
            f"Seed user {user_id} not found, returning friends list uncleaned")
        return clean_friends_list, t

    # Look every friend up once instead of once per pass; friends that cannot
    # be found never pass the filter, so they are dropped here.
    friend_users = []
    for friend_id in friends_list:
        friend = getter.get_user_by_id(friend_id)
        if friend is not None:
            friend_users.append((friend_id, friend))

    while num_users > 1000:
        follower_thresh = t * user.followers_count
        friend_thresh = t * user.friends_count
        print(
            f"Data cleaning with thresholds {follower_thresh, friend_thresh}")
        clean_friends_list = [
            friend_id
            for friend_id, friend in friend_users
            if friend.followers_count > follower_thresh
            and friend.friends_count > friend_thresh
        ]
        num_users = len(clean_friends_list)
        log.info(
            f"Increasing Data Cleaning Strength {t}, {num_users} remaining users")
        t += 0.05
        if follower_thresh <= 0 and friend_thresh <= 0:
            # Both seed counts are zero, so raising t cannot tighten the
            # filter; another pass would give the same list forever.
            log.info("Thresholds cannot increase, stopping data cleaning")
            break
    return clean_friends_list, t

In [6]:
def compare_user_friends(seed_user: str, activity1: str, activity2: str, activity_1_thresh: float, clean: bool = True):
    """Check whether friends that look alike on one activity also look alike on another.

    For every friend ``u`` of ``seed_user``, the other friends whose
    ``activity1`` Jaccard similarity to ``u`` is at least
    ``activity_1_thresh`` are selected, and the ``activity2`` similarities
    between ``u`` and those friends are averaged.

    Args:
        seed_user: Id of the user whose friends are compared.
        activity1: Activity name used to select similar friends.
        activity2: Activity name whose similarity is averaged.
        activity_1_thresh: Minimum ``activity1`` similarity for a pair to count.
        clean: If true, the friends list is first reduced with
            ``clean_user_friends_global``.

    Returns:
        dict: friend id -> mean ``activity2`` similarity over the selected
        friends, or ``0.0`` when no other friend reaches the threshold.
    """
    from types import SimpleNamespace

    user_friends = user_friend_getter.get_user_friends_ids(seed_user)
    if clean:
        # clean_user_friends_global takes a leading `self` and reads
        # `self.user_getter`; hand it a minimal stand-in carrying the
        # notebook's getter.
        getter_holder = SimpleNamespace(user_getter=user_getter)
        user_friends, _ = clean_user_friends_global(
            getter_holder, seed_user, user_friends)

    # Each (user, activity) is fetched once rather than once per pair.
    # NOTE(review): assumes get_user_activities returns a reusable container
    # (not a one-shot iterator) and that jaccard_similarity does not mutate
    # its arguments -- confirm.
    activity_cache = {}

    def cached_activity(user, activity):
        key = (user, activity)
        if key not in activity_cache:
            activity_cache[key] = get_user_activities(user, activity)
        return activity_cache[key]

    sim_map = {}
    for user1 in user_friends:
        match_count = 0
        activity2_total = 0
        for user2 in user_friends:
            if user1 == user2:
                continue
            sim = jaccard_similarity(
                cached_activity(user1, activity1),
                cached_activity(user2, activity1))
            if sim >= activity_1_thresh:
                # Pair is similar on activity1, so score it on activity2.
                match_count += 1
                activity2_total += jaccard_similarity(
                    cached_activity(user1, activity2),
                    cached_activity(user2, activity2))

        # Average over the selected friends; with none selected there is
        # nothing to average, so report 0.0 instead of dividing by zero.
        sim_map[user1] = activity2_total / match_count if match_count else 0.0

    return sim_map

In [7]:
def compare_retweeted_and_friends(user_id: str):
    """Measure the overlap between a user's friends and the users they retweeted.

    Args:
        user_id: Id of the user to inspect.

    Returns:
        tuple: ``(friends_ratio, retweeted_ratio)`` where

        * ``friends_ratio`` is the number of distinct retweeted friends
          divided by ``len(friends)``;
        * ``retweeted_ratio`` is the same count divided by
          ``len(retweeted users)``.

        A ratio is ``0.0`` when its denominator is empty, rather than raising
        ``ZeroDivisionError``.
    """
    user_friends = user_friend_getter.get_user_friends_ids(user_id)
    user_retweeted_users = get_user_activities(user_id, "user retweets")
    retweeted_friends_count = len(
        set(user_retweeted_users).intersection(set(user_friends)))

    # NOTE(review): the numerator counts distinct ids while the denominators
    # are raw lengths; if either source can contain duplicates the ratios are
    # deflated -- confirm against the getters.
    num_friends = len(user_friends)
    num_retweeted = len(user_retweeted_users)
    friends_ratio = retweeted_friends_count / num_friends if num_friends else 0.0
    retweeted_ratio = retweeted_friends_count / num_retweeted if num_retweeted else 0.0

    # Return ratios
    return friends_ratio, retweeted_ratio