In [8]:
import pandas as pd
import praw
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite
import math
import numpy as np
import igraph as ig
import leidenalg
import pickle
import matplotlib.pyplot as plt

reddit = praw.Reddit("project")
commentFrame = pd.read_pickle("comments.pkl")

variance = commentFrame["sentimentLabel"].var()
print(variance)

0.5405076377351656


In [9]:
graph = nx.Graph()
graph.add_nodes_from(commentFrame["original post"].unique(), bipartite = 0)
graph.add_nodes_from(commentFrame["author"].unique(), bipartite = 1)#average weighted upvotes, average sentiment

author_upvotes = commentFrame.groupby("author")["upvoteScale"].mean().to_dict()

author_sentiment = (
    commentFrame.groupby("author")["AverageUserSentiment"].first().to_dict()
)

nx.set_node_attributes(graph, author_upvotes, "avg_upvotes")
nx.set_node_attributes(graph, author_sentiment, "avg_sentiment")
edges = zip(
    commentFrame["author"],
    commentFrame["original post"],
    commentFrame["body"],
    commentFrame["upvoteScale"]
)
#attributes arnt carried over
graph.add_edges_from((a, b, {"body": c, "upvotes": d}) for a, b, c, d in edges)

uniqueUsers = [node for node, dic in graph.nodes(data = True) if dic["bipartite"] == 1]
biGraph = bipartite.weighted_projected_graph(graph, uniqueUsers)


In [10]:
print(biGraph.number_of_nodes())
print(biGraph.number_of_edges())
userGraph = biGraph.copy()
edges_to_remove = [(u,v) for u, v, d in biGraph.edges(data = True) if d["weight"] <= 1]
userGraph.remove_edges_from(edges_to_remove)

isolates = list(nx.isolates(userGraph))
userGraph.remove_nodes_from(isolates)
print(f"isolates:{len(list(nx.isolates(userGraph)))}")
print(f"nodes:{userGraph.number_of_nodes()}")
print(f"edges:{userGraph.number_of_edges()}")
#userGraph = biGraph.copy() #CHANGE THIS LINE BACK

4441
458234
isolates:0
nodes:608
edges:2344


In [11]:
fullGraph = nx.Graph()
fullGraph.add_nodes_from(userGraph)

for u, v, data in userGraph.edges(data = True):
    raw_weight = data.get("weight", 1)

    deg_u = userGraph.degree(u, weight = "weight")
    deg_v = userGraph.degree(v, weight  = "weight")
    normalized = raw_weight / math.sqrt(deg_u * deg_v) if deg_u > 0 and deg_v > 0 else 0
    fullGraph.add_edge(u, v, weight = normalized)
user_sentiment = commentFrame.groupby("author")['sentimentLabel'].mean().to_dict()
nx.set_node_attributes(fullGraph, user_sentiment, "sentiment")

sentiments = [d['sentiment'] for _, d in fullGraph.nodes(data=True) if "sentiment" in d]
homogenity = 1 - np.var(sentiments)
print(f"homogenity: {homogenity}")
assort = nx.attribute_assortativity_coefficient(fullGraph, "sentiment")
print(f"assortivity: {assort}")



isolates = list(nx.isolates(fullGraph))
print("fullgraph stats")
print(f"nodes: {fullGraph.number_of_nodes()}")
print(f"isolates: {len(isolates)}")
print(f"edges: {userGraph.number_of_edges()}")

components = nx.connected_components(fullGraph)
giant_component = max(components, key = len)
G_Giant = fullGraph.subgraph(giant_component)
print("G_Giant stats")
print(G_Giant.number_of_nodes())
print(G_Giant.number_of_edges())

homogenity: 0.7844978458279479
assortivity: -0.008620384616553295
fullgraph stats
nodes: 608
isolates: 0
edges: 2344
G_Giant stats
567
2320


In [12]:
#fullgraph = G_Giant.copy()
#with open("user_graph.pkl", "wb") as f:
#    pickle.dump(fullgraph, f)

In [None]:
fullgraph = G_Giant.copy()
nodeList = list(fullgraph.nodes())
edges = [(nodeList.index(u), nodeList.index(v)) for u, v in fullgraph.edges()]

g = ig.Graph(edges = edges, directed = False)

if nx.get_edge_attributes(fullgraph, "weight"):
    g.es["weight"] = [fullgraph[u][v].get("weight", 1.0) for u, v in fullgraph.edges()]

partition = leidenalg.find_partition(g, leidenalg.ModularityVertexPartition)

# Number of communities
print("Num communities:", len(partition))

# List nodes per community
for i, comm in enumerate(partition):
    print(f"Community {i}: {comm}")

# Modularity score
print("Modularity:", partition.quality())

with open("user_graph.pkl", "wb") as f:
    pickle.dump(fullGraph, f)
#nx.draw(fullGraph)

Num communities: 10
Community 0: [0, 4, 12, 19, 23, 58, 112, 118, 126, 129, 130, 134, 139, 140, 141, 142, 152, 205, 207, 210, 211, 214, 216, 219, 220, 224, 226, 227, 232, 238, 258, 263, 266, 268, 271, 274, 287, 316, 343, 344, 345, 348, 351, 355, 356, 357, 358, 433, 434, 435, 448, 449, 451, 453, 457, 458, 475, 478, 480, 484, 485, 489, 500, 501, 539, 542, 545, 547, 548, 560, 561]
Community 1: [20, 27, 29, 32, 54, 75, 84, 86, 88, 92, 158, 161, 184, 195, 370, 372, 386, 413, 415, 416, 417, 419, 424, 425, 428, 430, 431, 432, 437, 438, 440, 444, 445, 447, 455, 456, 468, 482, 487, 493, 494, 495, 496, 497, 498, 499, 504, 505, 506, 507, 508, 509, 512, 513, 515, 518, 519, 521, 523, 524, 525, 526, 527, 536, 537, 538, 540, 546, 556, 565]
Community 2: [3, 7, 9, 10, 11, 17, 28, 143, 144, 146, 149, 150, 153, 154, 155, 156, 159, 169, 171, 172, 173, 178, 183, 186, 187, 189, 191, 192, 194, 228, 252, 253, 257, 265, 267, 273, 275, 279, 281, 288, 289, 296, 297, 298, 304, 305, 306, 310, 312, 313, 314, 317, 3

In [14]:
G = fullGraph.copy()
n = G.number_of_nodes()
percent = n/biGraph.number_of_nodes() * 100
m = G.number_of_edges()
density = nx.density(G)
components = list(nx.connected_components(G)) if not G.is_directed() else list(nx.weakly_connected_components(G))
num_components = len(components)
giant_size = max(len(c) for c in components) if components else 0
avg_degree = sum(dict(G.degree()).values())/n

print(f"nodes: {n}, edges: {m}, density: {density:.6f}, percent of nodes left after pruning: {percent:.2f}%")
print(f"components: {num_components}, giant component size: {giant_size}, avg degree: {avg_degree:.2f}")
# degree distribution quick view
deg_seq = sorted([d for _, d in G.degree()], reverse=True)
print("top degrees:", deg_seq[:10])

nodes: 608, edges: 2344, density: 0.012703, percent of nodes left after pruning: 13.69%
components: 20, giant component size: 567, avg degree: 7.71
top degrees: [184, 94, 87, 72, 72, 61, 55, 54, 49, 45]
