In [None]:
%load_ext autoreload 
%autoreload 2


import pandas as pd
import glob
import nltk
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import expit
import os
import os, json, openai, warnings, random
import numpy as np
import pandas as pd
from IPython.display import Image, display


from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
import networkx
from networkx.drawing.nx_agraph import graphviz_layout 
import pylab

In [None]:
# Static snapshot of the response from GET https://mastodon.social/api/v2/instance,
# pasted in as a literal (nothing is fetched at runtime).
# NOTE(review): presumably trimmed to the fields of interest - confirm against the live endpoint.
instance_stats = {
  "domain": "mastodon.social",
  "title": "Mastodon",
  "version": "4.1.2+nightly-20230627",
  "source_url": "https://github.com/mastodon/mastodon",
  "description": "The original server operated by the Mastodon gGmbH non-profit",
  "usage": {
    "users": {
      "active_month": 221664
    }
  }
}

In [None]:
# Data source: s3://ml-team-pocket/social_hackweek_2023/topic_discoveries/
# The archive replied_toots_2023_05_27.tar.gz was extracted into the data folder.
data_path = "../../data"
base_path = f"{data_path}/replied_toots_2023_05_27"
datasets = glob.glob(f"{base_path}/toots_mastodon*.parquet")

# Stack every toot shard into one frame with a fresh 0..n-1 index.
toots_df = pd.concat(
    [pd.read_parquet(shard_path) for shard_path in datasets],
    axis=0,
    ignore_index=True,
)
print(len(toots_df))  # row count before de-duplication

# The same toot id can show up in more than one shard; keep its first occurrence.
toots_df = toots_df.drop_duplicates(subset=['id'])
display(toots_df.describe())

In [None]:
datasets = glob.glob(f"{base_path}/status_mastodon*.parquet")
statuses_df = pd.concat(
    [pd.read_parquet(shard_path) for shard_path in datasets],
    axis=0,
    ignore_index=True,
)

# Keep only statuses whose content is non-empty and shorter than 256, tagged as
# English, then drop repeated status ids.
statuses_df = (
    statuses_df
    .loc[lambda df: (df['content'].map(len) < 256)
                    & (df['language'] == 'en')
                    & (df['content'] != '')]
    .drop_duplicates(subset=['id'])
)
len(statuses_df)

In [None]:
# 'acct' field of the account attached to every toot.
accts = toots_df['account'].map(lambda account: account['acct'])

# Frequency table for each of the three engagement counters.
display(*(
    toots_df[counter].value_counts()
    for counter in ('replies_count', 'reblogs_count', 'favourites_count')
))

# Toots that received at least one reply.
replies_only_df = toots_df[toots_df['replies_count'].gt(0)]
# Original author's note: out of 167K, only 20K have replies, and 99% are less than 1.
# sns.histplot(data=replies_only_df, y="replies_count")

display(toots_df.describe())
display(replies_only_df.describe())

# Histogram restricted to toots with more than 5 replies.
replies_only_df_pruned = toots_df[toots_df['replies_count'].gt(5)]
sns.histplot(data=replies_only_df_pruned, x="replies_count", binwidth=3)



In [None]:
from collections import defaultdict
import networkx as nx

def create_graph_from_status(_statuses_df, min_degree=0, max_degree=2**32):
    """Build an undirected account-to-account reply graph.

    Every status contributes an edge between its author
    (``status['account']['id']``) and the account it replies to
    (``status['parent_account_id']``).  Statuses with a missing parent account
    and self-replies are skipped.

    Args:
        _statuses_df: DataFrame with a ``parent_account_id`` column and an
            ``account`` column whose values can be indexed with ``'id'``.
        min_degree: Parent accounts with fewer distinct repliers than this are
            dropped.  Any falsy value (``0``, ``False``, ``None``) disables the
            lower bound.
        max_degree: Parent accounts with more distinct repliers than this are
            dropped.

    Returns:
        A ``networkx.Graph`` with one edge per (replier, parent account) pair
        that survived pruning.
    """
    # Collate distinct repliers by the account they replied to.
    influencers = defaultdict(set)
    parent_ids = _statuses_df['parent_account_id']
    accounts = _statuses_df['account']
    for parent_id, account in zip(parent_ids, accounts):
        # pd.isna catches both None and the NaN that pandas uses for missing
        # values; a plain `== None` comparison lets NaN parents through.
        if pd.isna(parent_id):
            continue
        author_id = account['id']
        if parent_id == author_id:  # self-reply
            continue
        influencers[parent_id].add(author_id)

    G = nx.Graph()
    for parent_id, repliers in influencers.items():
        replier_count = len(repliers)
        # Pruning: skip parents outside the requested degree window.
        if (min_degree and replier_count < min_degree) or replier_count > max_degree:
            continue
        G.add_edges_from((replier_id, parent_id) for replier_id in repliers)

    return G

In [None]:
# Degree distribution of the full reply graph.
status_gph = create_graph_from_status(statuses_df)
deg = nx.degree(status_gph)

# Node degrees, largest first.
degree_sequence = sorted((degree for _node, degree in status_gph.degree()), reverse=True)
dmax = max(degree_sequence)

fig = plt.figure("Degree of a random graph", figsize=(8, 8))
# 5x4 gridspec; the histogram occupies only rows 3+ / columns 2+ of it.
axgrid = fig.add_gridspec(5, 4)
ax2 = fig.add_subplot(axgrid[3:, 2:])

# One bar per distinct degree, bar height = number of nodes with that degree.
degree_values, node_counts = np.unique(degree_sequence, return_counts=True)
ax2.bar(degree_values, node_counts)
ax2.set_title("Degree histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")

fig.tight_layout()
plt.show()

In [None]:
# Stats
import math

# Share of toots with at least one reply, and the median reply count among them.
total_toots = len(toots_df)
replies_count_df = toots_df[toots_df['replies_count'].gt(0)]
with_replies_pct = int(len(replies_count_df)/total_toots * 100)
median = replies_count_df['replies_count'].median()

toot_report = """
Total Toots: {}
With Replies: {}%
Median Replies: {}"""
print(toot_report.format(total_toots, with_replies_pct, median))

# degree_sequence is sorted, so its middle element stands in for the median.
median_degree = degree_sequence[len(degree_sequence) // 2]
avg_degree = sum(degree_sequence)/len(degree_sequence)

# Counts the rows whose parent_account_id differs from the row directly before it.
parent_ids = statuses_df['parent_account_id']
total_threads = parent_ids.ne(parent_ids.shift()).sum()
# closeness = nx.closeness_centrality(G) # This is slow
# Average closeness {} hops - would need converting to hops, otherwise it doesn't make sense
cluster_coefficient = nx.average_clustering(status_gph)

graph_report = """
Total Threads: {} 
Median Degree: {} connn/pp
Avg Degree: {}
Cluster Coefficient {}%"""
print(graph_report.format(total_threads, median_degree, avg_degree, round(cluster_coefficient * 100, 2)))


In [None]:
def draw_kamada(G):
    """Draw ``G`` with a Kamada-Kawai layout on a dark background.

    Nodes are rendered as small white dots and edges as thin yellow lines;
    node labels are hidden.  The figure is shown immediately via ``plt.show()``
    and nothing is returned.

    Args:
        G: The networkx graph to draw.
    """
    pos = nx.kamada_kawai_layout(G)
    # Alternative layout: pos = nx.spring_layout(G, k=0.2)

    fig = plt.figure(figsize=(15, 9), dpi=100)

    # The degree-based colour scale that used to be computed here was never
    # passed to nx.draw (node_color is a fixed 'w'), and computing min()/max()
    # over it crashed on an empty graph, so it has been dropped.
    nx.draw(
        G,
        pos,
        alpha=0.8,
        nodelist=G.nodes(),
        node_color='w',
        node_size=10,
        with_labels=False,
        font_size=6,
        width=0.2,
        cmap=plt.cm.Greys,
        edge_color='yellow',
    )
    fig.set_facecolor('#0B243B')

    # No plt.legend(): none of the drawn artists carries a label, so there is
    # nothing for a legend to show.
    plt.show()


def _metric_frame(values, column):
    """Turn a ``{node: value}`` mapping into a two-column frame: ``ID`` and ``column``."""
    return pd.DataFrame({'ID': list(values.keys()), column: list(values.values())})


def analyze_centrality_1(G, fast=False):
    """Compute per-node centrality metrics for ``G`` as one DataFrame.

    Args:
        G: The networkx graph to analyse.
        fast: When true, the betweenness and closeness centralities are
            skipped and their columns are absent from the result.

    Returns:
        DataFrame with one row per node and the columns ``ID``, ``degree``,
        ``betweenness_centrality`` (unless ``fast``), ``clust_coefficient``,
        ``closeness_centrality`` (unless ``fast``) and
        ``eigenvector_centrality``.
    """
    # Degrees are taken from G itself.  This used to read the notebook-level
    # `status_gph`, which gave wrong degrees for any other graph passed in.
    metric_frames = [_metric_frame(dict(G.degree()), 'degree')]

    # Betweenness centrality
    if not fast:
        metric_frames.append(
            _metric_frame(nx.betweenness_centrality(G), 'betweenness_centrality'))

    # Clustering coefficient
    metric_frames.append(_metric_frame(nx.clustering(G), 'clust_coefficient'))

    # Closeness centrality
    if not fast:
        metric_frames.append(
            _metric_frame(nx.closeness_centrality(G), 'closeness_centrality'))

    # Eigenvector centrality
    # Alternatives tried: nx.eigenvector_centrality(G), nx.katz_centrality(G)
    metric_frames.append(
        _metric_frame(nx.eigenvector_centrality_numpy(G), 'eigenvector_centrality'))

    # Join all metrics on the node ID, keeping the column order above.
    analyse = metric_frames[0]
    for metric_frame in metric_frames[1:]:
        analyse = pd.merge(analyse, metric_frame, on=['ID'])
    return analyse

In [None]:
# Sort by betweenness and by degree.  A node can have a lot of connections, but
# its influence varies by WHO it is connected to.
# Unless should_regenerate is True, centrality_15K.parquet must already exist
# under {data_path}/results.  (The original note on where it was downloaded
# from is truncated.)
TOP_N = 25
centrality_path = "{}/results/centrality_15K.parquet".format(data_path)

should_regenerate = False
if should_regenerate:
    status_gph = create_graph_from_status(statuses_df)
    centrality = analyze_centrality_1(status_gph)
    centrality.to_parquet(centrality_path)
else:
    centrality = pd.read_parquet(centrality_path)

# Report the size of the table being ranked.  On the cached path `status_gph`
# is whatever an earlier cell built and need not match the loaded file.
print('Total Results {}'.format(len(centrality)))
# A scalar median, matching how `median_degree` is used elsewhere, rather than
# the whole degree column.
median_degree = centrality['degree'].median()

print('Top {} Sorted by Betweeness'.format(TOP_N))
display(centrality.sort_values('betweenness_centrality', ascending=False).head(TOP_N))

# The row count now matches the label; this table used to show 50 rows under a
# "Top 25" heading.
print('Top {} Sorted by  Degree'.format(TOP_N))
display(centrality.sort_values('degree', ascending=False).head(TOP_N))


# Note: to look up an ID we need to use `/api/v1/accounts` and look up the URL

In [None]:
def get_graph_from_user(user_id, degrees_to_search = 2):
    """Collect the reply neighbourhood of ``user_id`` and build its graph.

    Reads the notebook-level ``statuses_df``.  Degree 1 is every status whose
    ``parent_account_id`` equals ``user_id``.  Each further hop adds the
    statuses that reply to the authors found in the previous hop.

    Every status is kept once and labelled with the first hop at which it is
    reached.  Earlier versions re-expanded all accumulated rows on every hop,
    so the same statuses were appended again under higher degree labels.

    Args:
        user_id: Account id to start from, compared against
            ``parent_account_id``.
        degrees_to_search: Number of expansion hops performed after the direct
            replies, so ``degree`` labels run from 1 up to
            ``degrees_to_search + 1``.

    Returns:
        Tuple ``(nodes_df, graph)``: the collected statuses with an added
        ``degree`` column, and the graph that ``create_graph_from_status``
        builds from them.
    """
    _connector_df = statuses_df

    frontier = _connector_df.loc[_connector_df['parent_account_id'] == user_id].copy()
    frontier['degree'] = 1
    layers = [frontier]
    seen_status_ids = set(frontier['id'])

    for deg in range(degrees_to_search):
        # Expand only from the authors discovered in the previous hop.
        frontier_ids = frontier['account'].apply(lambda acc: acc['id']).unique()
        replies_to_frontier = _connector_df['parent_account_id'].isin(frontier_ids)
        already_seen = _connector_df['id'].isin(seen_status_ids)
        frontier = _connector_df.loc[replies_to_frontier & ~already_seen].copy()
        if frontier.empty:
            # Nothing new was reached, so further hops cannot add anything.
            break
        frontier['degree'] = deg + 2
        seen_status_ids.update(frontier['id'])
        layers.append(frontier)

    _connector_nodes = pd.concat(layers, axis=0)
    _connector_gph = create_graph_from_status(_connector_nodes, False)
    return _connector_nodes, _connector_gph

In [None]:
# Can we see how this account connects the two groups?
# Collect its direct replies plus three further hops and draw the result.
# NOTE(review): presumably this ID was picked from the betweenness ranking - confirm.
high_betweeness = '109447331150259202'
connector_nodes, connector_gph = get_graph_from_user(
    user_id=high_betweeness,
    degrees_to_search=3,
)
draw_kamada(G=connector_gph)


In [None]:
# Same neighbourhood view for a comparison account: direct replies plus three
# further hops.
# NOTE(review): presumably this ID was picked for its low betweenness - confirm.
# Previous one-hop version:
# less_connector_nodes = statuses_df.loc[statuses_df['parent_account_id'] == "30437"]

low_betweeness = '30437'
# low_betweeness = '38659'
less_connector_nodes, less_connector_gph = get_graph_from_user(
    user_id=low_betweeness,
    degrees_to_search=3,
)
draw_kamada(G=less_connector_gph)

In [None]:
# TBD: topic discovery was run on 35K reply threads.
topics_threads = pd.read_parquet(f'{data_path}/results/df_for_discovery_35k_full.parquet')
print(len(topics_threads.index))

In [None]:
# First entry of each row's topic list, or '' when that list is empty.
topics_threads['topics_single'] = topics_threads['topics'].map(
    lambda topic_list: topic_list[0] if len(topic_list) > 0 else ''
)

# Attach topics to the statuses collected around the high-betweenness account.
merged_topics = pd.merge(
    connector_nodes,
    topics_threads,
    left_on='id',
    right_on='id',
    how='inner',
)
display(merged_topics.describe(), merged_topics.head(10))


In [None]:
# NOTE: `notes` comes from the account. `content` is the main content, stemmed and with stopwords removed.
# Here we can join the centrality matrix and give the top eigenvector.


def get_topics_by_degree(_connector_nodes, degrees=(1, 2, 3)):
    """Display topic frequencies of the collected statuses, one table per degree.

    Joins ``_connector_nodes`` with the notebook-level ``topics_threads`` on
    ``id``, drops rows whose ``topics_single`` is the empty string, and shows
    the ``topics_single`` value counts for each requested degree.

    Args:
        _connector_nodes: DataFrame with ``id`` and ``degree`` columns, such as
            the first value returned by ``get_graph_from_user``.
        degrees: Degree labels to report, in display order.  The default keeps
            the original 1-2-3 breakdown; pass e.g. ``(1, 2, 3, 4)`` to include
            rows produced by a deeper search.

    Returns:
        None.  The tables are rendered with ``display``.
    """
    merged_topics = _connector_nodes.merge(topics_threads, left_on='id', right_on='id', how='inner')
    merged_topics = merged_topics.loc[merged_topics['topics_single'] != ""]

    for degree in degrees:
        at_degree = merged_topics.loc[merged_topics['degree'] == degree]
        # Label each table; otherwise the outputs cannot be told apart.
        print('Degree {}'.format(degree))
        display(at_degree['topics_single'].value_counts())

# For the connector nodes above, walk through each of the connections:
# (1) look at the first-degree connections and group them by subject.

# Alternative input (the high-betweenness neighbourhood):
# get_topics_by_degree(connector_nodes)
get_topics_by_degree(less_connector_nodes)