In [None]:
import os
import json
import glob
import pickle
import statsmodels.api as sm
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import pandas as pd
import networkx as nx
import nltk
from textblob import TextBlob
from collections import Counter
import re
import textstat
import liwc
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import emoji
from tqdm import tqdm
from transformers import pipeline

In [None]:
# feature engineering

In [None]:
## text-based

In [None]:
# Download the NLTK data packages named below. sent_tokenize / word_tokenize
# are called throughout this notebook and presumably rely on 'punkt'; the
# other packages are not referenced by any code visible here - TODO confirm
# whether they are still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# textual_feature
def textual_features(text):
    """Compute surface, sentiment and readability statistics for one text.

    Args:
        text: The string to analyse.

    Returns:
        pd.Series holding the raw length, alphanumeric character count,
        whitespace-delimited word count, sentence count, words per sentence
        (0 when no sentence is found), the number of ``?``, ``!``, ``#`` and
        ``@`` characters, the uppercase character count, TextBlob polarity and
        subjectivity, and five textstat readability scores.
    """
    n_words = len(text.split())
    n_sentences = len(sent_tokenize(text))

    # Read the TextBlob sentiment object once and pull both scores from it.
    blob_sentiment = TextBlob(text).sentiment

    stats = {
        "text_length": len(text),
        "character_count": sum(1 for ch in text if ch.isalnum()),
        "word_count": n_words,
        "sentence_count": n_sentences,
        "words_per_sentence": n_words / n_sentences if n_sentences > 0 else 0,
        "special_symbols": len(re.findall(r'[?!#@]', text)),
        "uppercase_count": sum(ch.isupper() for ch in text),
        "sentiment_score": blob_sentiment.polarity,
        "subjectivity_score": blob_sentiment.subjectivity,
        "automated_readability_index": textstat.automated_readability_index(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "gunning_fog_index": textstat.gunning_fog(text),
    }
    return pd.Series(stats)

In [None]:
## dictionary-based

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import pandas as pd
import liwc

## lexicon_feature
# 'punkt' is already requested in the text-based cell above; it is requested
# again here, presumably so this cell can run on its own.
nltk.download('punkt')

def count_liwc_categories(text, parse, target_categories):
    """Count how many parsed LIWC categories of ``text`` hit each target group.

    Args:
        text: Text to analyse; it is lower-cased before tokenisation.
        parse: Callable mapping one token to an iterable of LIWC category
            names.
        target_categories: Mapping ``{output_name: [liwc_category, ...]}``.

    Returns:
        dict ``{output_name: count}`` with one entry per key of
        ``target_categories`` (0 when nothing matched), in that mapping's key
        order. A token adds one to a group for every one of its categories
        that belongs to the group.
    """
    # Invert the mapping once so each parsed category is resolved with a
    # single lookup instead of scanning every target group per token.
    targets_by_category = defaultdict(list)
    for target_category, liwc_categories in target_categories.items():
        for liwc_category in set(liwc_categories):
            targets_by_category[liwc_category].append(target_category)

    # Seeding from the mapping (not from a set) keeps the key order stable
    # from run to run.
    counts = dict.fromkeys(target_categories, 0)
    for token in word_tokenize(text.lower()):
        for category in parse(token):
            for target_category in targets_by_category.get(category, ()):
                counts[target_category] += 1

    return counts

def lexicon_feature(text, parse=None):
    """Count LIWC-based part-of-speech and emotion categories in one text.

    Args:
        text: The string to analyse.
        parse: Optional token parser (as returned by
            ``liwc.load_token_parser``). When omitted, the parser for
            'LIWC2015 Dictionary.dic' is loaded on first use and reused by
            later calls, so the dictionary file is not re-read for every text.

    Returns:
        pd.Series of integer counts indexed by 'noun', 'verb', 'adjective',
        'adverb', 'auxverb', 'positive', 'negative', 'anxiety', 'anger' and
        'sad', in that order.
    """
    if parse is None:
        # Cache the parser on the function object: this function is applied
        # row by row, and reloading the dictionary per row is wasted work.
        parse = getattr(lexicon_feature, '_liwc_parse', None)
        if parse is None:
            liwc_path = 'LIWC2015 Dictionary.dic'
            parse, _category_names = liwc.load_token_parser(liwc_path)
            lexicon_feature._liwc_parse = parse

    # NOTE(review): the 'noun' feature is fed by the LIWC 'pronoun' category -
    # confirm that this label is intended.
    target_categories = {
        'noun': ['pronoun'],
        'verb': ['verb'],
        'adjective': ['adj'],
        'adverb': ['adverb'],
        'auxverb': ['auxverb'],
        'positive': ['posemo'],
        'negative': ['negemo'],
        'anxiety': ['anx'],
        'anger': ['anger'],
        'sad': ['sad']
    }

    liwc_counts = count_liwc_categories(text, parse, target_categories)

    # Build the Series in one step with an explicit dtype instead of growing
    # an empty, dtype-less Series key by key.
    return pd.Series(
        {category: liwc_counts.get(category, 0) for category in target_categories},
        dtype='int64',
    )


In [None]:
## narrative-based

In [None]:
def temporal_embedding(text, parse):
    """Score temporal embedding from the LIWC categories found in ``text``.

    Args:
        text: Text to analyse; it is lower-cased before tokenisation.
        parse: Callable mapping one token to an iterable of category names.

    Returns:
        2 when both the 'time' and 'cause' categories occur, 1 when exactly
        one of them occurs, 0 when neither does.
    """
    present = set()
    for token in word_tokenize(text.lower()):
        present.update(parse(token))

    has_time = 'time' in present
    has_cause = 'cause' in present
    if has_time and has_cause:
        return 2
    if has_time or has_cause:
        return 1
    return 0


In [None]:
def spacial_embedding(text, parse):
    """Score spatial embedding from the LIWC categories found in ``text``.

    Args:
        text: Text to analyse; it is lower-cased before tokenisation.
        parse: Callable mapping one token to an iterable of category names.

    Returns:
        2 when both the 'space' and 'percept' categories occur, 1 when
        'space' occurs without 'percept', 0 otherwise (including 'percept'
        on its own).
    """
    # NOTE(review): temporal_embedding scores 1 when either of its two
    # categories appears alone, whereas here 'percept' alone scores 0 -
    # confirm the asymmetry is intended.
    present = {
        category
        for token in word_tokenize(text.lower())
        for category in parse(token)
    }
    if 'space' not in present:
        return 0
    return 2 if 'percept' in present else 1

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import liwc
import os

def calculate_features(text, parse, motion_categories, affective_categories, cognitive_categories):
    """Measure motion / inner-state / motion sentence patterns in ``text``.

    A window of three consecutive sentences counts as an affective trigram
    when the first and third contain a motion category and the middle one
    contains an affective category; a cognitive trigram is the same pattern
    with a cognitive category in the middle sentence.

    Args:
        text: Text to analyse.
        parse: Callable mapping one token to an iterable of category names.
        motion_categories: Category names treated as motion.
        affective_categories: Category names treated as affective.
        cognitive_categories: Category names treated as cognitive.

    Returns:
        Tuple ``(loa, loc)``: the affective and the cognitive trigram counts,
        each divided by the number of sentences. ``(0, 0)`` when the text has
        no sentences.
    """
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    if total_sentences == 0:
        return 0, 0

    has_motion = []
    has_affect = []
    has_insight = []
    for sentence in sentences:
        # The other LIWC helpers in this file lower-case text before calling
        # parse; do the same here so capitalised (e.g. sentence-initial) words
        # are matched consistently. Lower-casing happens after sentence
        # splitting so the sentence boundaries are unchanged.
        tokens = word_tokenize(sentence.lower())
        categories = [category for token in tokens for category in parse(token)]
        has_motion.append(any(category in motion_categories for category in categories))
        has_affect.append(any(category in affective_categories for category in categories))
        has_insight.append(any(category in cognitive_categories for category in categories))

    affective_trigram_count = 0
    cognitive_trigram_count = 0
    for i in range(total_sentences - 2):
        # Both patterns need motion in the outer two sentences.
        if not (has_motion[i] and has_motion[i + 2]):
            continue
        if has_affect[i + 1]:
            affective_trigram_count += 1
        if has_insight[i + 1]:
            cognitive_trigram_count += 1

    loa = affective_trigram_count / total_sentences
    loc = cognitive_trigram_count / total_sentences

    return loa, loc


In [None]:

def ratio_senti(text):
    """Tabulate the TextBlob polarity of every sentence in ``text``.

    Args:
        text: Text to analyse; it is lower-cased before sentence splitting.

    Returns:
        pd.DataFrame with one row per sentence and the columns 'Sentence',
        'Ratio' (1-based sentence position divided by the sentence count) and
        'Sentiment_score' (TextBlob polarity). A text without sentences gives
        an empty frame with no columns.
    """
    sentences = sent_tokenize(text.lower())
    n_sentences = len(sentences)

    rows = [
        {
            'Sentence': sentence,
            'Ratio': position / n_sentences,
            'Sentiment_score': TextBlob(sentence).sentiment.polarity,
        }
        for position, sentence in enumerate(sentences, start=1)
    ]
    return pd.DataFrame(rows)


In [None]:
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

def ratio_sen(text, parse, positive_categories, negative_categories):
    """Tabulate a lexicon-based sentiment intensity for every sentence.

    Args:
        text: Text to analyse; it is lower-cased before sentence splitting.
        parse: Callable mapping one token to an iterable of category names.
        positive_categories: Category names counted as positive.
        negative_categories: Category names counted as negative.

    Returns:
        pd.DataFrame with one row per sentence and the columns 'Sentence',
        'Ratio' (1-based sentence position divided by the sentence count) and
        'Sentiment_Intensity' (absolute difference between the positive and
        negative category counts). The three columns are present even when
        the text has no sentences.
    """
    sentences = sent_tokenize(text.lower())
    n_sentences = len(sentences)

    rows = []
    for position, sentence in enumerate(sentences, start=1):
        categories = [category for token in word_tokenize(sentence) for category in parse(token)]
        positive_count = sum(1 for category in categories if category in positive_categories)
        negative_count = sum(1 for category in categories if category in negative_categories)
        rows.append({
            'Sentence': sentence,
            'Ratio': position / n_sentences,
            'Sentiment_Intensity': abs(positive_count - negative_count),
        })

    # Naming the columns keeps the schema stable for empty input; otherwise a
    # column-less frame is returned and column access downstream raises
    # KeyError.
    return pd.DataFrame(rows, columns=['Sentence', 'Ratio', 'Sentiment_Intensity'])

In [None]:
def model_func(df):
    """Fit 'Sentiment_Intensity' on 'Ratio' and 'Ratio' squared by OLS.

    Args:
        df: Frame with 'Ratio' and 'Sentiment_Intensity' columns.

    Returns:
        Tuple ``(params, p_values)`` taken from the fitted statsmodels OLS
        result. The design matrix is ``sm.add_constant`` applied to the
        columns ``[Ratio, Ratio**2]``.
    """
    position = df['Ratio'].values
    intensity = df['Sentiment_Intensity'].values

    design = sm.add_constant(np.column_stack((position, position ** 2)))
    fitted = sm.OLS(intensity, design).fit()

    return fitted.params, fitted.pvalues


def classify_emotional_genre(df):
    """Label the shape of a text's sentiment-intensity curve.

    The label is derived from the linear (index 1) and quadratic (index 2)
    coefficients returned by ``model_func`` and their p-values, using 0.05 as
    the significance threshold.

    Args:
        df: Frame with 'Ratio' and 'Sentiment_Intensity' columns, one row per
            sentence.

    Returns:
        One of 'Comedy', 'Tragedy', 'Progressive', 'Regressive', 'Stable' or
        the string 'None'. Frames with fewer than 7 rows are not fitted and
        give 'None'.
    """
    # Guard on the row count before touching any column, so an empty or
    # column-less frame is handled instead of raising KeyError.
    if len(df) < 7:
        return 'None'

    params, pvalues = model_func(df)
    linear, quadratic = params[1], params[2]
    linear_significant = pvalues[1] < 0.05
    quadratic_significant = pvalues[2] < 0.05

    # The two-term shapes are tested first: the Progressive / Regressive
    # conditions accept any significant quadratic term of the matching sign,
    # so testing them first would make Comedy and Tragedy unreachable.
    if linear_significant and quadratic_significant and linear < 0 and quadratic > 0:
        genre = 'Comedy'
    elif linear_significant and quadratic_significant and linear > 0 and quadratic < 0:
        genre = 'Tragedy'
    elif (linear > 0 and linear_significant and pvalues[2] > 0.05) or (
            quadratic > 0 and quadratic_significant):
        genre = 'Progressive'
    elif (linear < 0 and linear_significant and pvalues[2] > 0.05) or (
            quadratic < 0 and quadratic_significant):
        genre = 'Regressive'
    elif pvalues[1] >= 0.05:
        genre = 'Stable'
    else:
        genre = 'None'

    return genre


In [None]:
def drama_feature(df):
    """Locate the largest sentence-to-sentence sentiment swing in a text.

    Args:
        df: Frame with one row per sentence and 'Sentiment_score' and 'Ratio'
            columns. The frame is not modified.

    Returns:
        ``1 - Ratio`` of the row whose absolute change in 'Sentiment_score'
        from the previous row is largest; 0 when the frame has two rows or
        fewer; 1 when no change can be computed (all differences are NaN).
    """
    if len(df) <= 2:
        return 0

    # Work on a detached Series so the caller's frame does not silently gain
    # an 'Emotional_change' column. The first difference is always NaN, so it
    # is skipped.
    swings = df['Sentiment_score'].diff().abs().iloc[1:]
    if not swings.notna().any():
        return 1

    return 1 - df.loc[swings.idxmax(), 'Ratio']


In [None]:
def sentiment_volatility(df):
    """Return the spread of consecutive 'Sentiment_score' differences.

    Args:
        df: Frame with a 'Sentiment_score' column, one row per sentence.

    Returns:
        The root of the mean squared deviation of the row-to-row differences
        from their own mean; 0 when the frame has one row or none.
    """
    if len(df) <= 1:
        return 0

    deltas = df['Sentiment_score'].diff().dropna().values
    centered = deltas - np.mean(deltas)
    return np.sqrt(np.mean(centered ** 2))


In [None]:
def narrative_feature(text, parse=None):
    """Assemble the narrative features of one text.

    Args:
        text: The string to analyse.
        parse: Optional token parser (as returned by
            ``liwc.load_token_parser``). When omitted, the parser for
            'LIWC2015 Dictionary.dic' is loaded on first use and reused by
            later calls, so the dictionary file is not re-read for every text.

    Returns:
        pd.Series with 'temporal_embedding', 'spacial_embedding',
        'loa_value', 'loc_value', 'drama', 'sentiment_volatility' and a 0/1
        indicator for each of 'Stable', 'Progressive', 'Regressive', 'Comedy'
        and 'Tragedy'. All indicators are 0 when the genre is none of these.
    """
    if parse is None:
        # Cache the parser on the function object: this function is applied
        # row by row, and reloading the dictionary per row is wasted work.
        parse = getattr(narrative_feature, '_liwc_parse', None)
        if parse is None:
            liwc_path = 'LIWC2015 Dictionary.dic'
            parse, _category_names = liwc.load_token_parser(liwc_path)
            narrative_feature._liwc_parse = parse

    tembed = temporal_embedding(text, parse)
    sembed = spacial_embedding(text, parse)

    motion_categories = ['motion']
    affective_process_categories = ['affect']
    cognitive_insight_categories = ['insight']
    positive_categories = ['posemo']
    negative_categories = ['negemo']
    loa_value, loc_value = calculate_features(
        text, parse, motion_categories, affective_process_categories, cognitive_insight_categories)

    # Two per-sentence tables: TextBlob polarity drives drama and volatility,
    # lexicon intensity drives the genre label.
    sentiment_change = ratio_senti(text)
    sentiment_genre = ratio_sen(text, parse, positive_categories, negative_categories)
    text_genre = classify_emotional_genre(sentiment_genre)
    drama = drama_feature(sentiment_change)
    sv = sentiment_volatility(sentiment_change)

    features = {
        "temporal_embedding": tembed,
        "spacial_embedding": sembed,
        "loa_value": loa_value,
        "loc_value": loc_value,
        "drama": drama,
        "sentiment_volatility": sv
    }

    # One-hot encode the genre; a label outside this list leaves every flag 0.
    for genre in ("Stable", "Progressive", "Regressive", "Comedy", "Tragedy"):
        features[genre] = 1 if text_genre == genre else 0

    return pd.Series(features)


In [None]:
## behavior-based

In [None]:
def average_time_between_post(G):
    """Return the mean gap between consecutive node timestamps.

    Args:
        G: Graph whose nodes may carry a 'time' attribute that ``int()`` can
            parse. Nodes whose 'time' is missing, empty, ``None`` or not
            parseable are ignored.

    Returns:
        The mean difference between consecutive timestamps once sorted, or 0
        when fewer than two usable timestamps exist.
    """
    times = []
    for _node, data in G.nodes(data=True):
        time_str = data.get('time', '')
        if time_str == '':
            continue
        try:
            times.append(int(time_str))
        except (TypeError, ValueError):
            # TypeError covers non-string values such as None, which would
            # otherwise abort the whole computation.
            continue

    if len(times) < 2:
        return 0

    # The consecutive gaps of a sorted sequence add up to (max - min), so the
    # mean gap needs neither the sort nor the list of differences.
    return (max(times) - min(times)) / (len(times) - 1)

In [None]:

def cascade_centrality(G):
    """Return the share of non-root posts made by repeat participants.

    Args:
        G: Directed graph whose nodes carry a 'user_id' attribute. The first
            node with in-degree 0 is treated as the root and excluded.

    Returns:
        ``1 - unique_users / posts`` computed over the non-root nodes, or 0
        when the graph has no root or no non-root nodes.
    """
    roots = [node for node in G.nodes if G.in_degree(node) == 0]
    if not roots:
        # Empty graph (or no zero-in-degree node): nothing to score, and
        # roots[0] would raise IndexError.
        return 0
    # NOTE(review): only the first root is excluded; with several roots the
    # others are counted as reposts - confirm this is intended.
    root_node = roots[0]

    repost_users = [attr['user_id'] for node, attr in G.nodes(data=True) if node != root_node]
    if not repost_users:
        return 0

    return 1 - len(set(repost_users)) / len(repost_users)



In [None]:

def paticipant_num(G):
    """Summarise how many non-root posts each participant made.

    Args:
        G: Directed graph whose nodes carry a 'user_id' attribute. The first
            node with in-degree 0 is treated as the root and excluded.

    Returns:
        Tuple ``(average_replies, max_replies_count)``: the mean number of
        non-root nodes per distinct user and the largest per-user count.
        ``(0, 0)`` when the graph has no root or no non-root nodes.
    """
    roots = [node for node in G.nodes if G.in_degree(node) == 0]
    if not roots:
        # Empty graph (or no zero-in-degree node): roots[0] would raise
        # IndexError.
        return 0, 0
    root_node = roots[0]

    user_counts = Counter(
        attr['user_id'] for node, attr in G.nodes(data=True) if node != root_node
    )
    if not user_counts:
        return 0, 0

    average_replies = sum(user_counts.values()) / len(user_counts)
    max_replies_count = max(user_counts.values())

    return average_replies, max_replies_count

In [None]:
def behavior_feature(G):
    """Collect the behaviour-based features of one graph into a dict.

    Args:
        G: Graph passed unchanged to average_time_between_post,
            cascade_centrality and paticipant_num.

    Returns:
        dict with the keys 'average_time_between_post',
        'cascade_centrality_score', 'avrage_replies' and 'max_replies'.
    """
    summary = {
        "average_time_between_post": average_time_between_post(G),
        "cascade_centrality_score": cascade_centrality(G),
    }
    # paticipant_num returns the (mean, maximum) pair in this order.
    summary["avrage_replies"], summary["max_replies"] = paticipant_num(G)
    return summary

In [None]:
## propagation-based

In [None]:
def depth_breadth(G, root):
    """Return the depth and the widest level of the nodes reachable from root.

    Args:
        G: Graph to traverse.
        root: Node from which shortest-path lengths are measured.

    Returns:
        Tuple ``(max_depth, breadth)``: the largest shortest-path length from
        ``root`` and the largest number of nodes sharing one path length.
    """
    level_of = dict(nx.single_source_shortest_path_length(G, root))
    # Tally the nodes per level once; keys are the levels, values the sizes.
    nodes_per_level = Counter(level_of.values())
    return max(nodes_per_level), max(nodes_per_level.values())


In [None]:

def structural_virality(G):
    """Return the summed shortest-path lengths divided by ``n * (n - 1)``.

    Args:
        G: Graph with ``n`` nodes.

    Returns:
        The sum of all shortest-path lengths reported by
        ``nx.all_pairs_shortest_path_length`` divided by ``n * (n - 1)``, as a
        float; 0 when the graph has at most one node. Ordered pairs with no
        path add nothing to the sum but are still part of the denominator.
    """
    n = len(G)
    if n <= 1:
        return 0

    # NOTE(review): on a directed graph only forward-reachable pairs have a
    # length, so the score is lower than on the undirected tree - confirm
    # which of the two is intended.
    #
    # Sum the lengths while streaming instead of filling a dense n x n
    # matrix; a node's distance to itself is 0 and needs no special case.
    total_length = 0
    for _source, lengths in nx.all_pairs_shortest_path_length(G):
        total_length += sum(lengths.values())

    return (1 / (n * (n - 1))) * total_length


In [None]:

def average_sentiment(G):
    """Return the mean TextBlob polarity over the non-root nodes' text.

    Args:
        G: Directed graph whose nodes may carry a 'text' attribute (a string
            or a list of strings, which is joined with spaces).

    Returns:
        The mean polarity over non-root nodes with non-empty text, or 0 when
        there is none. Nodes with in-degree 0 are treated as roots.
    """
    polarities = []
    for node in G.nodes:
        if G.in_degree(node) == 0:
            continue  # roots are left out of the average
        content = G.nodes[node].get('text', '')
        if isinstance(content, list):
            content = ' '.join(content)
        if content:
            polarities.append(TextBlob(content).sentiment.polarity)

    if not polarities:
        return 0
    return sum(polarities) / len(polarities)


In [None]:

def average_enoji(G):
    """Return the mean emoji count over the non-root nodes' text.

    Args:
        G: Directed graph whose nodes may carry a 'text' attribute (a string
            or a list of strings, which is joined with spaces).

    Returns:
        The mean of ``emoji.emoji_count`` over non-root nodes with non-empty
        text, or 0 when there is none. Nodes with in-degree 0 are treated as
        roots.
    """
    raw_texts = (
        G.nodes[node].get('text', '')
        for node in G.nodes
        if G.in_degree(node) != 0
    )
    flat_texts = (
        ' '.join(content) if isinstance(content, list) else content
        for content in raw_texts
    )
    counts = [emoji.emoji_count(content) for content in flat_texts if content]

    return sum(counts) / len(counts) if counts else 0


In [None]:

# Emotion labels that every emotion vector in this notebook is keyed by.
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
file_path = r'NRC-Emotion-Intensity-Lexicon-v1.txt'

# The lexicon is read as tab-separated text without a header row.
column_names = ['words', 'emotion', 'intensity']
emotion_intensity = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

# Preview a few rows instead of printing the whole lexicon.
emotion_intensity.head()

In [None]:
import pandas as pd
import re
from collections import defaultdict

# Nested lookup built from the lexicon rows: {word: {emotion: intensity}}
emotion_dict = {}
for word, emotion, intensity in zip(
    emotion_intensity['words'],
    emotion_intensity['emotion'],
    emotion_intensity['intensity'],
):
    # setdefault creates the inner dict the first time a word is seen.
    emotion_dict.setdefault(word, {})[emotion] = intensity

def calculate_emotion_vector(text, emotion_dict):
    """Build a normalised emotion-intensity vector for one text.

    Args:
        text: A string, or a list of strings that is joined with spaces. It
            is lower-cased before tokenisation.
        emotion_dict: Mapping ``{word: {emotion: intensity}}``.

    Returns:
        dict mapping each emotion to its share of the summed intensity of the
        matched words. Every label of the module-level ``emotions`` list is
        present; when nothing matched, all values are 0.0.
    """
    if isinstance(text, list):
        text = ' '.join(text)

    totals = defaultdict(float)
    for token in word_tokenize(text.lower()):
        for label, weight in emotion_dict.get(token, {}).items():
            totals[label] += weight

    # Make sure every known emotion has an entry, even if it never matched.
    for label in emotions:
        totals.setdefault(label, 0.0)

    overall = sum(totals.values())
    if overall > 0:
        return {label: value / overall for label, value in totals.items()}
    return dict(totals)


In [None]:

def average_itensity(G):
    """Average the emotion vectors of the non-root nodes' text.

    Args:
        G: Directed graph whose nodes may carry a 'text' attribute (a string
            or a list of strings, which is joined with spaces).

    Returns:
        dict mapping each emotion to the mean of the values returned by
        calculate_emotion_vector over non-root nodes with non-empty text.
        When no such node exists, every label of the module-level
        ``emotions`` list maps to 0. Nodes with in-degree 0 are treated as
        roots.
    """
    totals = defaultdict(float)
    scored_nodes = 0

    for node in G.nodes:
        if G.in_degree(node) == 0:
            continue  # roots are left out of the average
        content = G.nodes[node].get('text', '')
        if isinstance(content, list):
            content = ' '.join(content)
        if not content:
            continue
        for label, value in calculate_emotion_vector(content, emotion_dict).items():
            totals[label] += value
        scored_nodes += 1

    if scored_nodes == 0:
        return {emotion: 0 for emotion in emotions}
    return {label: total / scored_nodes for label, total in totals.items()}


In [None]:
def propagation_feature(G):
    """Collect the propagation-based features of one graph into a dict.

    Args:
        G: Directed graph whose nodes may carry 'user_id' and 'text'
            attributes.

    Returns:
        dict with the edge count, the largest depth and breadth found from
        any zero-in-degree node (0 when there is none), the number of
        distinct 'user_id' values, the structural virality score, the mean
        reply sentiment, the mean emoji count, and one '<emotion>_intensity'
        entry for each of the eight emotions.
    """
    edge_total = G.number_of_edges()

    # Depth and breadth are taken as the maximum over every root; the leading
    # 0 reproduces the result for a graph without roots.
    extents = [
        depth_breadth(G, root)
        for root in G.nodes
        if G.in_degree(root) == 0
    ]
    deepest = max([0] + [depth for depth, _ in extents])
    widest = max([0] + [breadth for _, breadth in extents])

    distinct_users = len(set(nx.get_node_attributes(G, 'user_id').values()))
    virality = structural_virality(G)
    mean_polarity = average_sentiment(G)
    mean_emoji = average_enoji(G)
    emotion_means = average_itensity(G)

    result = {
        "retweet_reply_num": edge_total,
        "max_depth": deepest,
        "max_breath": widest,
        "unique_users": distinct_users,
        "structural_virality_score": virality,
        "average_sentiment": mean_polarity,
        "avrtage_emoji": mean_emoji,
    }
    result["anger_intensity"] = emotion_means['anger']
    result["anticipation_intensity"] = emotion_means['anticipation']
    result["disgust_intensity"] = emotion_means['disgust']
    result["fear_intensity"] = emotion_means['fear']
    result["joy_intensity"] = emotion_means['joy']
    result["sadness_intensity"] = emotion_means['sadness']
    result["surprise_intensity"] = emotion_means['surprise']
    result["trust_intensity"] = emotion_means['trust']
    return result


In [None]:

def extract_content_feature(data):
    """Build the content feature table for every row of ``data``.

    Args:
        data: Frame with a 'Text' column.

    Returns:
        pd.DataFrame that places the outputs of textual_features,
        lexicon_feature and narrative_feature side by side, one row per
        input row.
    """
    texts = data['Text']
    feature_blocks = [
        texts.apply(extractor)
        for extractor in (textual_features, lexicon_feature, narrative_feature)
    ]
    return pd.concat(feature_blocks, axis=1)


In [None]:
def extract_beha_propa_feature(graph_data):
    """Build the behaviour and propagation feature table for many graphs.

    Args:
        graph_data: Mapping ``{graph_name: graph}``.

    Returns:
        pd.DataFrame with one row per graph: an 'id' column holding the graph
        name, followed by the entries of propagation_feature and then
        behavior_feature.
    """
    rows = [
        {'id': graph_name, **propagation_feature(G), **behavior_feature(G)}
        for graph_name, G in graph_data.items()
    ]
    return pd.DataFrame(rows)


In [None]:
## read data
# NOTE(review): no loading code is visible in this cell. `text_data` is only
# assigned in a later cell and `graph_data` is not assigned anywhere in the
# visible notebook, so on a fresh kernel these calls raise NameError unless
# both were loaded elsewhere - TODO confirm where they come from.

# Content features, one row per text; requires a 'Text' column.
textdata = extract_content_feature(text_data)
# Behaviour and propagation features, one row per graph in the mapping.
graphdata = extract_beha_propa_feature(graph_data)

In [None]:
from transformers import pipeline

# Zero-shot topic tagging with the facebook/bart-large-mnli model.
# NOTE(review): device=0 presumably pins the pipeline to the first GPU -
# confirm this is wanted on machines without one.
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0)
topics = ["Politics", "Finance & Business", "Military", "Culture & Sports & Entertainment", "Society & Life", "Disasters & Accidents", "Education & Examinations", "Science & Technology", "Health & Medicine"]

# NOTE(review): the CSV path is blank in the source - fill in the real file
# before running.
text_data = pd.read_csv('')

# Keep the first label returned for every text. The loop used to iterate over
# an undefined `data` frame; it now reads from the frame that was just loaded.
text_data['Topic'] = [
    classifier(text, topics, multi_label=False)['labels'][0]
    for text in text_data['Text']
]

# Drop the raw text from the same frame, so the new 'Topic' column is kept.
text_data = text_data.drop(columns=['Text'])