In [1]:
import datetime
import os
import random
from collections import Counter

import networkx as nx
import nltk
import pandas as pd
from networkx.algorithms.community import greedy_modularity_communities
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [2]:
# Fetch the NLTK resources used by this notebook; the stored output shows each
# package reported as already up-to-date when it is cached locally.
# 'stopwords' is read by stopwords.words('english') in the next cell.
# NOTE(review): 'punkt' is presumably there for word_tokenize, and 'wordnet' /
# 'omw-1.4' are not referenced by any code visible in this notebook — confirm
# they are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/iasamori/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/iasamori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/users/iasamori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/users/iasamori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# English stop words held in a set; preprocess_text reads this module-level
# name to drop stop words via membership tests.
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    """Lower-case ``text``, tokenize it and drop English stop words.

    Tokens are compared against the module-level ``stop_words`` set. No
    alphabetic filter is applied, so any non-word tokens the tokenizer emits
    are kept.

    Args:
        text: Raw document string.

    Returns:
        list: The surviving tokens, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower(), language='english', preserve_line=True):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [6]:
def preprocess_text_with_custom_removal(text, frequent_words):
    """Preprocess ``text`` and additionally strip corpus-specific frequent words.

    Args:
        text: Raw document string, passed through ``preprocess_text``.
        frequent_words: Collection of tokens to remove on top of the stop words.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    remaining = (
        token for token in preprocess_text(text) if token not in frequent_words
    )
    return ' '.join(remaining)

In [7]:
def filter_corpus(corpus, threshold=100000):
    """Preprocess every document and remove tokens that are too frequent corpus-wide.

    Each document is tokenized once with ``preprocess_text``; the resulting
    token lists are used both to count occurrences and to build the output, so
    the tokenizer is not run a second time per document.

    Args:
        corpus: Iterable of raw document strings. It is consumed once, so a
            one-shot iterable (e.g. a generator) is also supported.
        threshold: A token is dropped from every document when its total count
            across the corpus is strictly greater than this value.

    Returns:
        list: One space-joined string of surviving tokens per input document,
        in input order.
    """
    preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

    word_counts = Counter(word for doc in preprocessed_corpus for word in doc)
    frequent_words = {word for word, count in word_counts.items() if count > threshold}

    # Reuse the token lists computed above instead of re-tokenizing each document.
    return [
        ' '.join(word for word in doc if word not in frequent_words)
        for doc in preprocessed_corpus
    ]

In [8]:
def get_graph(corpus, weight_threshold=1):
    """Build a weighted term co-occurrence graph from a corpus.

    The corpus is cleaned with ``filter_corpus`` and vectorized with
    ``CountVectorizer``. The edge weight between two terms is the
    corresponding entry of ``X.T @ X``, i.e. the sum over documents of the
    product of the two terms' counts.

    The co-occurrence matrix is kept sparse and only its stored entries above
    the diagonal are visited. The vocabulary-squared dense array is never
    materialized, and each undirected edge is added once.

    Args:
        corpus: Iterable of raw document strings.
        weight_threshold: An edge is created only when its weight is strictly
            greater than this value. The default of 1 drops pairs whose
            weight is 0 or 1.

    Returns:
        networkx.Graph: Undirected graph whose nodes are vocabulary terms and
        whose edges carry an integer ``weight`` attribute.
    """
    filtered_corpus = filter_corpus(corpus)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(filtered_corpus)
    words = vectorizer.get_feature_names_out()

    # CSR with sorted indices, then COO, so entries are visited row by row
    # with ascending column index (same order as a dense row-major scan).
    co_occurrence = (X.T @ X).tocsr()
    co_occurrence.sort_indices()
    co_occurrence = co_occurrence.tocoo()

    G = nx.Graph()
    entries = zip(
        co_occurrence.row.tolist(),
        co_occurrence.col.tolist(),
        co_occurrence.data.tolist(),
    )
    for i, j, weight in entries:
        # i < j skips self-loops and the mirrored lower-triangle duplicates.
        if i < j and weight > weight_threshold:
            # Plain int weight to avoid JSON serialization errors.
            G.add_edge(words[i], words[j], weight=int(weight))

    return G

In [9]:
def generate_community_centrality_df(communities, degree_centrality):
    """Tabulate every community's terms with their degree centrality.

    Communities are labelled ``community_1``, ``community_2``, ... in input
    order. Within each community, terms are listed by descending centrality.

    Args:
        communities: Iterable of term collections (e.g. the sets returned by
            ``greedy_modularity_communities``).
        degree_centrality: Mapping from term to its centrality value. It must
            contain every term that appears in ``communities``.

    Returns:
        pandas.DataFrame: Columns ``community``, ``term`` and ``value``, one
        row per term. The columns are present even when ``communities`` is
        empty, so a CSV written from the result always has a header.

    Raises:
        KeyError: If a term is missing from ``degree_centrality``.
    """
    rows = []
    for index, community in enumerate(communities, start=1):
        label = f"community_{index}"
        ranked_terms = sorted(
            community, key=lambda term: degree_centrality[term], reverse=True
        )
        rows.extend(
            {'community': label, 'term': term, 'value': degree_centrality[term]}
            for term in ranked_terms
        )

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=['community', 'term', 'value'])

In [10]:
# Load the comments and keep only rows whose create_time is a positive epoch
# value below 1e10 (interpreted as seconds in the conversion below).
data = pd.read_csv('./combined_comments_2.csv')
data = data.dropna(subset=['create_time'])
data = data[(data['create_time'] < 1e10) & (data['create_time'] > 0)]

# errors='coerce' turns unconvertible values into NaT; drop those rows so they
# do not end up in a bogus "nan_nan" bucket.
datetime_list = pd.to_datetime(data['create_time'], unit='s', errors='coerce')
is_parsed = datetime_list.notna()
datetime_list = datetime_list[is_parsed]
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the filtered frame (avoids SettingWithCopyWarning).
data = data[is_parsed].copy()

# "<month>_<year>" bucket label (month not zero-padded), built with the
# vectorized .dt accessor instead of a per-row Python loop.
month_year = datetime_list.dt.month.astype(str) + "_" + datetime_list.dt.year.astype(str)
data['month_year'] = month_year


In [11]:
# Number of comments per month_year bucket (sanity check of the column built above).
data['month_year'].value_counts()

month_year
4_2024     51322
7_2023     44678
8_2023      6267
5_2024      3487
9_2023      1468
6_2024      1377
8_2024       955
10_2023      592
7_2024       466
11_2023      322
1_2024       264
9_2024       237
12_2023      187
2_2024       184
10_2024       87
3_2024        20
Name: count, dtype: int64

In [38]:
# NOTE(review): duplicate of the previous cell. Its execution count (38) is out
# of sequence and the stored output lists only 2024 buckets with different
# counts, so it was presumably produced from a differently filtered `data`
# that this notebook does not reproduce — re-run or remove this cell.
data['month_year'].value_counts()

month_year
4_2024     28671
5_2024      1853
8_2024       559
6_2024       509
7_2024       287
9_2024       101
10_2024       24
Name: count, dtype: int64

In [12]:
# Per-month community detection: for every month_year bucket, build the term
# co-occurrence graph, detect communities and export them together with the
# degree centrality of every term.
OUTPUT_DIR = './network_analysis_data'
# Create the export directory up front so a fresh checkout does not fail on open().
os.makedirs(OUTPUT_DIR, exist_ok=True)

month_year_list = list(data['month_year'].unique())

for month_year in tqdm(month_year_list):
    month_year_df = data[data['month_year'] == month_year]
    month_text = month_year_df['text'].dropna().tolist()

    G = get_graph(month_text)

    communities = list(greedy_modularity_communities(G))
    degree_centrality = nx.degree_centrality(G)

    # Explicit UTF-8 so non-ASCII terms are written the same way on every platform.
    with open(f'{OUTPUT_DIR}/{month_year}_communities.txt', 'w', encoding='utf-8') as f:
        for i, community in enumerate(communities):
            # Sort the terms: set iteration order is not stable across runs.
            f.write(f"Community {i+1}: {', '.join(sorted(community))}\n\n")

    community_degree_centrality_df = generate_community_centrality_df(communities, degree_centrality)
    community_degree_centrality_df.to_csv(f"{OUTPUT_DIR}/{month_year}_community_degree_centrality.csv", index=False)

100%|██████████| 16/16 [11:40<00:00, 43.80s/it]


In [16]:
# Community detection over the whole corpus (all months together), exported
# under the "all_data" label.
OUTPUT_DIR = './network_analysis_data'
# Create the export directory up front so a fresh checkout does not fail on open().
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Pass a plain list, as the per-month loop does.
G = get_graph(data['text'].dropna().tolist())

communities = list(greedy_modularity_communities(G))
degree_centrality = nx.degree_centrality(G)

# Explicit UTF-8 so non-ASCII terms are written the same way on every platform.
with open(f'{OUTPUT_DIR}/all_data_communities.txt', 'w', encoding='utf-8') as f:
    for i, community in enumerate(communities):
        # Sort the terms: set iteration order is not stable across runs.
        f.write(f"Community {i+1}: {', '.join(sorted(community))}\n\n")

community_degree_centrality_df = generate_community_centrality_df(communities, degree_centrality)
community_degree_centrality_df.to_csv(f"{OUTPUT_DIR}/all_data_community_degree_centrality.csv", index=False)

In [18]:
# NOTE(review): this cell cannot run as written — `texts` is never assigned in
# the visible notebook, and the stored output is the resulting NameError.
# NOTE(review): `month_year` is not set in this cell either; it is presumably
# the last value left behind by the per-month loop, in which case the files
# written below would overwrite that month's exports. Confirm the intended
# input and output label, or remove the cell.
G = get_graph(texts)

# Detect communities and compute each term's degree centrality.
communities = list(greedy_modularity_communities(G))
degree_centrality = nx.degree_centrality(G)

# One line per community, listing its terms.
with open(f'./network_analysis_data/{month_year}_communities.txt', 'w') as f:
    for i, community in enumerate(communities):
        f.write(f"Community {i+1}: {', '.join(community)}\n\n")

# Tabulate the per-community centralities and export them as CSV.
community_degree_centrality_df = generate_community_centrality_df(communities, degree_centrality)
community_degree_centrality_df.to_csv(f"./network_analysis_data/{month_year}_community_degree_centrality.csv", index=False)

NameError: name 'texts' is not defined