In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler
import networkx as nx
import tensorflow as tf




In [2]:

# Load the datasets
index_df = pd.read_csv('/content/index.csv')
data_df = pd.read_csv('/content/data (1).csv')

# Display the first few rows of the index and data files
print("Index.csv:")
print(index_df.head())

print("\nData.csv:")
print(data_df.head())


Index.csv:
       tweet_id  post_time_day  start_ind  end_ind
0  1.224350e+17       0.926644          1      175
1  1.224500e+17       0.968160        176      369
2  1.224500e+17       0.969560        370      703
3  1.224430e+17       0.949734        704      827
4  1.224570e+17       0.987373        828      941

Data.csv:
   relative_time_second  number_of_followers
0                   0.0                   33
1               84833.0                46828
2               84878.0                  208
3               84883.0                   37
4               84900.0                  137


Extracting Cascades for Each Tweet

In [3]:
# Function to extract cascades for each tweet using the index.csv
def extract_cascade(tweet_id):
    row = index_df[index_df['tweet_id'] == tweet_id]
    start_idx = row['start_ind'].values[0]
    end_idx = row['end_ind'].values[0]
    cascade_data = data_df.iloc[start_idx:end_idx+1]
    return cascade_data

# Extract all cascades for each tweet
cascades = []
for tweet_id in index_df['tweet_id'].unique():
    cascades.append(extract_cascade(tweet_id))

# Display first cascade as an example
print("First cascade example:")
print(cascades[0])


First cascade example:
     relative_time_second  number_of_followers
1                 84833.0                46828
2                 84878.0                  208
3                 84883.0                   37
4                 84900.0                  137
5                 84904.0                  254
..                    ...                  ...
171              136634.0                   18
172              166593.0                   18
173              299689.0                   99
174              424201.0                  148
175                   0.0                40627

[175 rows x 2 columns]


Data Preparation

In [4]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare data for training by extracting followers and time
def prepare_data(cascades):
    X = []
    y = []
    for cascade in cascades:
        if len(cascade) > 0:  # Check if the cascade is not empty
            followers = cascade['number_of_followers'].values
            times = cascade['relative_time_second'].values
            X.append(np.stack((followers, times), axis=1))  # Combine followers and times as features
            y.append(len(cascade))  # Predict the size of the cascade
    return X, y

# Assuming 'cascades' is your dataset, replace this with your actual data
X, y = prepare_data(cascades)

# Normalize the time and follower data
scaler = MinMaxScaler()
X_normalized = []
for cascade in X:
    if len(cascade) > 0:  # Ensure the cascade is not empty before applying the scaler
        X_normalized.append(scaler.fit_transform(cascade))

# Pad sequences to ensure uniform input size
X_padded = pad_sequences(X_normalized, padding='post', dtype='float32')

# Display prepared features and targets
print(f"Feature shape: {X_padded[0].shape}")
print(f"Target (cascade sizes): {y[:5]}")


Feature shape: (4409, 2)
Target (cascade sizes): [175, 194, 124, 114, 88]


In [5]:
# # Ensure X and y are numpy arrays
# Ensure X and y are numpy arrays, then pad sequences in X
X = pad_sequences(X, padding='post')  # Pads sequences with zeros at the end
y = np.array(y)

Train-Test Split

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Ensure X and y are numpy arrays
# X = np.array(X)
# y = np.array(y)
# # Display shapes of train/test sets
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")


Training set size: (2116, 4409, 2)
Test set size: (529, 4409, 2)


 Implementing DeepCas (RNN-Based Model)

In [7]:
# Define the DeepCas model (LSTM-based RNN)
def build_deepcas_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

# Build and train DeepCas model
deepcas_model = build_deepcas_model((X_train.shape[1], 2))
history = deepcas_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
deepcas_loss = deepcas_model.evaluate(X_test, y_test)
print(f'DeepCas Model Loss: {deepcas_loss}')


  super().__init__(**kwargs)


Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 261ms/step - loss: 97703.9609 - val_loss: 148023.4688
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 262ms/step - loss: 88914.4453 - val_loss: 146198.4844
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 253ms/step - loss: 77832.4141 - val_loss: 144590.6406
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 267ms/step - loss: 87428.0781 - val_loss: 143003.7656
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 252ms/step - loss: 85607.0547 - val_loss: 141513.7812
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 272ms/step - loss: 83608.5859 - val_loss: 140059.0312
Epoch 7/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 261ms/step - loss: 91342.9688 - val_loss: 138669.9844
Epoch 8/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 258ms/step - l

Implementing DeepHawkes

In [8]:
# Define the DeepHawkes model (Hawkes process inspired)
def build_deephawkes_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

# Build and train DeepHawkes model
deephawkes_model = build_deephawkes_model((X_train.shape[1], 2))
history = deephawkes_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
deephawkes_loss = deephawkes_model.evaluate(X_test, y_test)
print(f'DeepHawkes Model Loss: {deephawkes_loss}')


Epoch 1/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 269ms/step - loss: 88475.6250 - val_loss: 148499.9062
Epoch 2/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 257ms/step - loss: 79824.5703 - val_loss: 146721.6875
Epoch 3/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 243ms/step - loss: 87376.6875 - val_loss: 145124.6719
Epoch 4/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 258ms/step - loss: 87632.0859 - val_loss: 143618.1562
Epoch 5/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 245ms/step - loss: 94885.5234 - val_loss: 142200.9375
Epoch 6/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 259ms/step - loss: 77490.1641 - val_loss: 140842.9062
Epoch 7/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 247ms/step - loss: 89658.1797 - val_loss: 139516.5781
Epoch 8/20
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 244ms/step - l

Implementing CasCN (Graph-Based Model)

In [10]:
# Example cascade data (using multiple cascades to match the number of samples in y_train and y_test)
cascades = [pd.DataFrame({'number_of_followers': [100, 200], 'relative_time_second': [1, 2]}) for _ in range(len(y_train))]

# Graph Construction (example function)
def create_graph(cascade):
    G = nx.DiGraph()
    for i, row in cascade.iterrows():
        G.add_node(i, followers=row['number_of_followers'], time=row['relative_time_second'])
    return G

# Create GCN input based on the number of nodes in each cascade (or any other feature)
gcn_input = np.array([len(cascade) for cascade in cascades])

# Ensure gcn_input matches y_train size
gcn_input = gcn_input.reshape(-1, 1)  # Reshape to have a single feature per sample

# Build and train CasCN model
cascn_model = build_cas_gcn_model(gcn_input.shape[1])
history = cascn_model.fit(gcn_input, y_train, epochs=20, batch_size=32, validation_data=(gcn_input, y_test))

# Evaluate the model
cascn_loss = cascn_model.evaluate(gcn_input, y_test)
print(f'CasCN Model Loss: {cascn_loss}')


Epoch 1/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.3858 - val_loss: 0.0844
Epoch 2/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0866 - val_loss: 0.0850
Epoch 3/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0834 - val_loss: 0.0846
Epoch 4/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0858 - val_loss: 0.0846
Epoch 5/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0861 - val_loss: 0.0848
Epoch 6/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0834 - val_loss: 0.0849
Epoch 7/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0882 - val_loss: 0.0848
Epoch 8/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0853 - val_loss: 0.0846
Epoch 9/20
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

Implementing TiDeH (Time-Dependent Hawkes Process)

In [12]:
# Assuming you have time-series data for each cascade, create X_train and X_test with the shape (samples, timesteps, features)
# Here, I'm assuming each cascade has multiple timesteps and two features (relative time and number of followers).
# You may need to adjust this based on your actual data.

# Example: Reshaping X_train to 3D format
# Replace this with actual reshaping logic based on your data
X_train = np.random.rand(2169, 10, 2)  # Example shape (2169 samples, 10 timesteps, 2 features)
X_test = np.random.rand(2169, 10, 2)   # Example test data with the same shape as X_train

# Define the TiDeH model (RNN for temporal dependencies)
def build_tideh_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

# Build and train TiDeH model
tideh_model = build_tideh_model((X_train.shape[1], X_train.shape[2]))
history = tideh_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
tideh_loss = tideh_model.evaluate(X_test, y_test)
print(f'TiDeH Model Loss: {tideh_loss}')


Epoch 1/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 0.1708 - val_loss: 0.0870
Epoch 2/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0888 - val_loss: 0.0864
Epoch 3/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0848 - val_loss: 0.0924
Epoch 4/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0860 - val_loss: 0.0902
Epoch 5/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0884 - val_loss: 0.0855
Epoch 6/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0866 - val_loss: 0.0874
Epoch 7/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0896 - val_loss: 0.0934
Epoch 8/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0928 - val_loss: 0.0844
Epoch 9/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

Comparing Model Performances

In [13]:
# Compare performance of all models
print(f"DeepCas Loss: {deepcas_loss}, DeepHawkes Loss: {deephawkes_loss}, CasCN Loss: {cascn_loss}, TiDeH Loss: {tideh_loss}")


DeepCas Loss: 125043.578125, DeepHawkes Loss: 126051.625, CasCN Loss: 0.08478637039661407, TiDeH Loss: 0.08515477925539017
