In [3]:
import pandas as pd

# Load the edge list. The file is read as ';'-separated and must contain the
# columns 'FromNodeId' and 'ToNodeId' (both are used below).
file_path = 'Wiki-Vote.csv'
data = pd.read_csv(file_path, delimiter=';')  # Assuming the delimiter is ';'

# Out-degree = number of rows in which a node appears as 'FromNodeId';
# in-degree = number of rows in which it appears as 'ToNodeId'.
out_degree = data['FromNodeId'].value_counts()
in_degree = data['ToNodeId'].value_counts()

# Combine both counts into one table indexed by node id. A node that occurs in
# only one of the two columns gets NaN for the other count, which silently
# upcasts the column to float; fill with 0 and cast back so the degrees stay
# integer counts (and are written as "12", not "12.0").
total_activity = (
    pd.DataFrame({'OutDegree': out_degree, 'InDegree': in_degree})
    .fillna(0)
    .astype('int64')
)
total_activity['TotalActivity'] = total_activity['OutDegree'] + total_activity['InDegree']

# Step b: Rank nodes by total activity, most active first.
# A stable sort is requested explicitly: the default quicksort gives no
# guarantee about the relative order of tied nodes, and ties decide which nodes
# make it into the top/bottom selections below. With a stable sort, tied nodes
# keep the order they had before sorting.
total_activity = total_activity.sort_values(
    by='TotalActivity', ascending=False, kind='mergesort'
)

# Step c: Select the top 20 most active and bottom 20 least active nodes
top_20_nodes = total_activity.head(20)
bottom_20_nodes = total_activity.tail(20)
selected_nodes = pd.concat([top_20_nodes, bottom_20_nodes])
# With fewer than 40 nodes in total, head(20) and tail(20) overlap; keep each
# node only once so the output has no duplicated rows.
selected_nodes = selected_nodes[~selected_nodes.index.duplicated(keep='first')]

# Keep every edge that touches at least one selected node, as source OR target.
# Note that the other endpoint of such an edge need not be a selected node.
selected_node_ids = selected_nodes.index
filtered_data = data[(data['FromNodeId'].isin(selected_node_ids)) | (data['ToNodeId'].isin(selected_node_ids))]

# Add a Weight column to count the number of interactions between Source and Target
# (identical FromNodeId/ToNodeId rows are collapsed into one row with their count).
filtered_data = filtered_data.groupby(['FromNodeId', 'ToNodeId']).size().reset_index(name='Weight')
filtered_data.columns = ['Source', 'Target', 'Weight']

# Save the results: the selected nodes with their degrees (node id written as
# the 'NodeId' column) and the weighted edge list.
selected_nodes.to_csv('Top_Bottom_Nodes.csv', index_label='NodeId')
filtered_data.to_csv('Filtered_Wiki-Vote.csv', index=False)

print("Files have been successfully generated:")
print("- Top and Bottom Nodes: Top_Bottom_Nodes.csv")
print("- Filtered Dataset with Weights: Filtered_Wiki-Vote.csv")


Files have been successfully generated:
- Top and Bottom Nodes: Top_Bottom_Nodes.csv
- Filtered Dataset with Weights: Filtered_Wiki-Vote.csv
