In [None]:
%load_ext autoreload 
%autoreload 2

import glob
import os
import os, json, openai, warnings, random

import matplotlib.pyplot as plt
import networkx
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image, display
from scipy.special import expit
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

In [None]:
# Hard-coded metadata for the mastodon.social instance (server version and
# monthly active user count) kept alongside the analysis for reference.
instance_stats = dict(
    domain="mastodon.social",
    title="Mastodon",
    version="4.1.2+nightly-20230627",
    source_url="https://github.com/mastodon/mastodon",
    description="The original server operated by the Mastodon gGmbH non-profit",
    usage=dict(
        users=dict(active_month=221664),
    ),
)

In [None]:
# Load every toot shard of the snapshot into a single DataFrame.
base_path = "../../data/replied_toots_2023_05_27/"

# glob returns matches in arbitrary order; sorting keeps the concat order (and
# therefore which copy of a duplicated id survives drop_duplicates) stable
# across runs and machines.
datasets = sorted(glob.glob(os.path.join(base_path, "toots_mastodon*.parquet")))
if not datasets:
    # Without this, pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        "No toots_mastodon*.parquet files found under {}".format(base_path)
    )

toots_df = pd.concat([pd.read_parquet(data) for data in datasets], axis=0).reset_index(drop=True)
print(len(toots_df))  # row count before de-duplication

# The same toot can appear in several shards; keep the first occurrence per id.
toots_df = toots_df.drop_duplicates(subset=['id'])
toots_df.describe()
# Possible follow-up filter (currently disabled): keep only English toots with
# non-null content shorter than 256 characters.

In [None]:
# Load every status (reply) shard of the snapshot into a single DataFrame.
# Relies on `base_path` set by the toot-loading cell.
# glob returns matches in arbitrary order; sort so de-duplication is reproducible.
datasets = sorted(glob.glob(os.path.join(base_path, "status_mastodon*.parquet")))
if not datasets:
    # Without this, pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(
        "No status_mastodon*.parquet files found under {}".format(base_path)
    )

statuses_df = pd.concat([pd.read_parquet(data) for data in datasets], axis=0).reset_index(drop=True)
# The same status can appear in several shards; keep the first occurrence per id.
statuses_df = statuses_df.drop_duplicates(subset=['id'])

# NOTE(review): this mask tests a column against itself, so it selects every
# row and the cell simply displays the whole frame. A different column was
# presumably intended on one side -- confirm before relying on this output.
statuses_df.loc[statuses_df['parent_reply_id'].isin(statuses_df['parent_reply_id'])]

In [None]:
# Read the user dump in fixed-size record batches and assemble one DataFrame.
# `import pyarrow` alone does not load the `parquet` submodule; importing it
# explicitly binds `pyarrow` and makes `pyarrow.parquet` available even when
# no earlier cell has imported it as a side effect.
import pyarrow.parquet

users_path = '../../data/2023-06-27-10kusers-dump.parquet'
batch_size = 1000  # rows per record batch pulled from the parquet file

user_parquet = pyarrow.parquet.ParquetFile(users_path)
pq_iter = user_parquet.iter_batches(batch_size=batch_size)

# Convert each batch to pandas and concatenate once at the end, which avoids
# growing a DataFrame inside the loop.
user_df_batches = [batch.to_pandas() for batch in pq_iter]
user_df = pd.concat(user_df_batches, axis=0)

In [None]:
# Author handle ("acct" field of the nested account record) for every toot.
accts = toots_df['account'].apply(lambda account: account['acct'])
# sns.histplot(accts, kde=True)

# Show the frequency table of each engagement counter, in this order.
display(*[
    toots_df[column].value_counts()
    for column in ('replies_count', 'reblogs_count', 'favourites_count')
])

# Keep only toots with more than two replies for the histogram below.
# Author's note from an earlier run: out of ~167K toots only ~20K have any
# replies, and the vast majority of counts are very small.
replies_only_df = toots_df[toots_df['replies_count'].gt(2)]
# sns.histplot(data=replies_only_df, y="replies_count")
sns.histplot(x="replies_count", data=replies_only_df, binwidth=3)



In [None]:
# Stats to dump 
# Number of nodes
# Average REALY connection per person (Degree)
# Average closeness
# Clustering coefficient

# import matplotlib.pyplot as plt
# import networkx as nx

# First we have to add edges using the replies - and each reply may not exist?

# toots_df.loc[toots_df['in_reply_to_id'] & toots_df['in_reply_to_account_id']
# toots_reply = toots_df.loc[~toots_df['in_reply_to_account_id'].isnull()].copy()
# toots_reply['account_id_source'] = toots_reply['account'].apply(lambda acc: acc['id'])
# toots_reply = toots_reply.loc[toots_reply['in_reply_to_account_id'] == toots_reply['account_id_source']]

# toots_reply = toots_df.loc[toots_df['replies_count'] > 0]


# a=nx.Graph()

# len(toots_reply)
# for index, tr in toots_reply.iterrows():
#     a.add_edge(tr['in_reply_to_account_id'], tr['account']['id'])
#     break

# nx.draw(a, with_labels=True, font_weight='light')
# toots_reply['in_reply_to_account_id'] != toots_reply['account_id_source']]
# pd.set_option('max_colwidth', 800)
# toots_reply.iloc[5].T

In [None]:
from collections import defaultdict

# Collate statuses by "influencer": map each parent account id to the set of
# account ids that posted a status under it.
influencers = defaultdict(set)
for parent_id, account in zip(statuses_df['parent_account_id'], statuses_df['account']):
    # A few rows have no parent account. pd.isna catches both None and NaN;
    # a plain `== None` check lets NaN through and would turn it into a node.
    if pd.isna(parent_id):
        continue
    influencers[parent_id].add(account['id'])

# from pprint import pprint
# pprint(influencers)

# Undirected interaction graph: one edge per (replying account, parent account).
G = nx.Graph()

for dest, edges in influencers.items():
    for src in edges:
        G.add_edge(src, dest)

In [None]:
# Degree view of the interaction graph. This previously referenced `a`, a
# graph that only exists in commented-out code, so a fresh kernel hit NameError.
deg = nx.degree(G)

# Node degrees from highest to lowest; also consumed by the stats cell below.
degree_sequence = sorted((d for n, d in deg), reverse=True)
dmax = max(degree_sequence)

fig = plt.figure("Degree of a random graph", figsize=(8, 8))
# Create a gridspec for adding subplots of different sizes
# (the top three rows are reserved for the disabled component drawing below).
axgrid = fig.add_gridspec(5, 4)

# Too slow
# ax0 = fig.add_subplot(axgrid[0:3, :])
# Gcc = G.subgraph(sorted(nx.connected_components(G), key=len, reverse=True)[0])
# pos = nx.spring_layout(Gcc, seed=10396953)
# nx.draw_networkx_nodes(Gcc, pos, ax=ax0, node_size=20)
# nx.draw_networkx_edges(Gcc, pos, ax=ax0, alpha=0.4)
# ax0.set_title("Connected components of G")
# ax0.set_axis_off()

# Bottom-left panel: degree of each node against its rank.
ax1 = fig.add_subplot(axgrid[3:, :2])
ax1.plot(degree_sequence, "b-", marker="o")
ax1.set_title("Degree Rank Plot")
ax1.set_ylabel("Degree")
ax1.set_xlabel("Rank")

# Bottom-right panel: number of nodes for each distinct degree value.
ax2 = fig.add_subplot(axgrid[3:, 2:])
ax2.bar(*np.unique(degree_sequence, return_counts=True))
ax2.set_title("Degree histogram")
ax2.set_xlabel("Degree")
ax2.set_ylabel("# of Nodes")

fig.tight_layout()
plt.show()

In [None]:
# Stats
import math

# --- Toot-level engagement ---
total_toots = len(toots_df)
replies_count_df = toots_df.loc[toots_df['replies_count'] > 0]
# Whole-number percentage (truncated); an empty frame reports 0 instead of
# raising ZeroDivisionError.
with_replies_pct = int(len(replies_count_df) / total_toots * 100) if total_toots else 0
median = replies_count_df['replies_count'].median()

print("""
Total Toots: {}
With Replies: {}%
Median Replies: {}""".format(total_toots, with_replies_pct, median))

# --- Graph-level statistics (uses `degree_sequence` and `G` from earlier cells) ---
# Use a real median: indexing the middle element is off for even-length
# sequences and raises IndexError when the sequence is empty.
median_degree = pd.Series(degree_sequence).median()
# Counts runs of consecutive rows that share the same parent account.
# NOTE(review): this equals the number of threads only if rows of one thread
# are contiguous and neighbouring threads have different parent accounts;
# null parent ids also each count as a new run -- confirm this is intended.
total_threads = (statuses_df['parent_account_id'] != statuses_df['parent_account_id'].shift(axis=0)).sum(axis=0)
# closeness = nx.closeness_centrality(G) # This is slow
# Average Closeness {} hops - need to convert to hops - otherwise doesn't make sense
cluster_coefficient = nx.average_clustering(G)


print("""
Total Threads: {} 
Median Degree: {} conn/p
Cluster Coefficient {}%""".format(total_threads, median_degree, round(cluster_coefficient * 100, 2)))
