# Twitter URL Network Analysis

This notebook provides a consolidated version of the Twitter URL Network Analysis application. It includes all the functionality for processing Twitter data, building networks, and creating interactive visualizations.

## Setup and Dependencies

In [None]:
import pandas as pd
import igraph as ig
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import json
from datetime import datetime, timedelta
import re
from collections import defaultdict
import numpy as np

## Data Processing Functions

These functions handle the processing of raw Twitter data.

In [None]:
def extract_urls(tweet):
    """Return the expanded URLs listed under a tweet's ``entities.urls``.

    Args:
        tweet: Mapping describing one raw tweet. URL entities are read from
            ``tweet['entities']['urls']``.

    Returns:
        A list of the ``expanded_url`` values found, in their original order.
        The list is empty when the ``entities`` or ``urls`` section is
        missing or ``None``. Entries without a non-empty ``expanded_url``
        are skipped instead of raising ``KeyError``.
    """
    # ``or`` guards against explicit nulls (e.g. {"entities": null} in JSON),
    # which a plain membership test would choke on.
    entities = tweet.get('entities') or {}
    url_entities = entities.get('urls') or []
    return [
        entry['expanded_url']
        for entry in url_entities
        if entry.get('expanded_url')
    ]

def _parse_created_at(value):
    """Parse a ``created_at`` string, with or without fractional seconds."""
    for fmt in ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ'):
        try:
            return datetime.strptime(value, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognized created_at timestamp: {value!r}")


def process_tweets(data):
    """Convert raw tweet dicts into a DataFrame of URL-bearing tweets.

    Args:
        data: Iterable of raw tweet mappings. Each tweet must contain
            ``author_id``, ``created_at`` and ``id`` to be considered.

    Returns:
        A ``pandas.DataFrame`` with the columns ``author_id``,
        ``created_at`` (naive ``datetime``), ``urls`` (list of strings) and
        ``id``, one row per tweet that carries at least one URL. The columns
        are present even when there are no rows, so callers can sort or
        select on them without special-casing empty input.

    Tweets that miss a required field, have no URLs, or carry a timestamp
    that cannot be parsed are skipped rather than raising.
    """
    columns = ['author_id', 'created_at', 'urls', 'id']
    if not data:
        return pd.DataFrame(columns=columns)

    required_fields = ['author_id', 'created_at', 'id']
    processed_tweets = []

    for tweet in data:
        if not all(field in tweet for field in required_fields):
            continue

        try:
            urls = extract_urls(tweet)
            if not urls:
                continue
            processed_tweets.append({
                'author_id': tweet['author_id'],
                'created_at': _parse_created_at(tweet['created_at']),
                'urls': urls,
                'id': tweet['id'],
            })
        except (ValueError, KeyError, TypeError):
            # Best-effort ingestion: a malformed tweet (bad timestamp, wrong
            # field type, missing nested key) is dropped, not fatal.
            continue

    return pd.DataFrame(processed_tweets, columns=columns)

def clean_url(url):
    """Normalize a URL so equivalent links compare equal.

    The URL is lowercased, then a leading ``http://``/``https://`` scheme, a
    leading ``www.`` and a single trailing slash are removed.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized, lowercase URL string.
    """
    # Lowercase first so the prefix patterns also match 'HTTP://' or 'WWW.'.
    normalized = url.lower()
    # Anchor the scheme so a URL embedded later (e.g. in a query string) is
    # left intact.
    normalized = re.sub(r'^https?://', '', normalized)
    normalized = re.sub(r'^www\.', '', normalized)
    normalized = re.sub(r'/$', '', normalized)
    return normalized

## Network Building Functions

These functions handle the creation and analysis of the network based on URL sharing patterns using iGraph.

In [None]:
def build_network(df_tweets, time_window_hours=6, min_cooccurrence=2):
    """Build a co-sharing network of authors from URL-bearing tweets using iGraph.

    Two authors are linked when they shared the same URL within
    ``time_window_hours`` of each other at least ``min_cooccurrence`` times.

    Args:
        df_tweets: DataFrame with the columns ``author_id``, ``created_at``
            and ``urls`` (an iterable of URLs per row).
        time_window_hours: Maximum gap, in hours, between two shares of the
            same URL for them to count as a co-occurrence.
        min_cooccurrence: Minimum number of co-occurrences required for an
            edge between two authors.

    Returns:
        A tuple ``(G, author_to_idx)``. ``G`` is an ``igraph.Graph`` without
        isolated vertices, with the vertex attributes ``name`` (author id),
        ``size`` and ``group`` (community id, renumbered from 0) and the edge
        attribute ``weight`` (co-occurrence count). ``author_to_idx`` maps
        each remaining author id to its vertex index in ``G``.
    """
    G = ig.Graph()

    if df_tweets.empty:
        # No tweets means no authors: return an empty graph that still
        # carries the attributes downstream code reads.
        G.vs["name"] = []
        G.vs["size"] = []
        G.vs["group"] = []
        G.es["weight"] = []
        return G, {}

    # Sort tweets by timestamp
    df_tweets = df_tweets.sort_values('created_at')
    time_window = timedelta(hours=time_window_hours)

    # Track URL sharing patterns
    url_shares = defaultdict(list)
    cooccurrence_count = defaultdict(int)
    # A dict keeps authors in first-seen order, so vertex indices are
    # reproducible across runs (set iteration order is not).
    author_to_idx = {}

    # Process tweets
    for _, tweet in df_tweets.iterrows():
        current_time = tweet['created_at']
        author = tweet['author_id']
        author_to_idx.setdefault(author, len(author_to_idx))

        for url in tweet['urls']:
            url_shares[url].append((author, current_time))

            for other_author, other_time in url_shares[url]:
                if other_author == author:
                    continue
                if abs(current_time - other_time) <= time_window:
                    # Sorted so (a, b) and (b, a) accumulate in the same key
                    pair = tuple(sorted([author, other_author]))
                    cooccurrence_count[pair] += 1

    # Add vertices to graph
    G.add_vertices(len(author_to_idx))
    G.vs["name"] = list(author_to_idx)

    # Build network based on minimum co-occurrence threshold
    edges = []
    weights = []
    for (author1, author2), count in cooccurrence_count.items():
        if count >= min_cooccurrence:
            edges.append((author_to_idx[author1], author_to_idx[author2]))
            weights.append(count)

    G.add_edges(edges)
    G.es["weight"] = weights

    # Node size: logarithmic in degree, with a minimum of 5
    G.vs["size"] = [max(5, 5 * np.log1p(deg)) for deg in G.degree()]

    # Add community detection using Louvain method
    communities = G.community_multilevel(weights="weight")
    G.vs["group"] = communities.membership

    # Filter out isolated vertices
    G.delete_vertices(G.vs.select(_degree=0))

    # Renumber community groups sequentially, since deleting vertices can
    # leave gaps in the community ids
    if len(G.vs) > 0:
        unique_groups = sorted(set(G.vs["group"]))
        group_map = {old: new for new, old in enumerate(unique_groups)}
        G.vs["group"] = [group_map[group] for group in G.vs["group"]]

    # Vertex indices shift after deletion, so rebuild the lookup
    author_to_idx = {author: idx for idx, author in enumerate(G.vs["name"])}

    return G, author_to_idx

## Network Visualization Functions

These functions create network visualizations using either Plotly (interactive, web-based, with hover tooltips) or Matplotlib (static).

### Visualization Options:
1. **Plotly**: Interactive web-based visualization with zoom, pan, and hover capabilities
2. **Matplotlib**: Static visualization (no hover or other interactivity), suitable for publication

In [None]:
def create_network_visualization(G, author_to_idx, plot_style='plotly'):
    """Create a network visualization using either plotly or matplotlib.

    Args:
        G: ``igraph.Graph`` with the vertex attributes ``name``, ``size`` and
            ``group`` and the edge attribute ``weight``.
        author_to_idx: Author-to-vertex-index mapping. Not used by the
            drawing code; kept so existing callers keep working.
        plot_style: ``'plotly'`` for an interactive figure with hover text,
            or ``'matplotlib'`` for a static figure.

    Returns:
        A ``plotly.graph_objects.Figure`` when ``plot_style`` is
        ``'plotly'``, otherwise a ``matplotlib.figure.Figure``.

    Raises:
        ValueError: If ``plot_style`` is neither ``'plotly'`` nor
            ``'matplotlib'``.
    """
    # Reject bad input before computing the layout.
    if plot_style not in ('plotly', 'matplotlib'):
        raise ValueError("plot_style must be either 'plotly' or 'matplotlib'")

    # Get the layout using Fruchterman-Reingold algorithm
    layout = G.layout_fruchterman_reingold(weights="weight")

    # Extract coordinates
    xs = [coord[0] for coord in layout]
    ys = [coord[1] for coord in layout]

    # Read vertex attributes once; each G.vs[...] access builds a new list.
    groups = G.vs["group"]
    sizes = G.vs["size"]

    # One colour per community, with at least 20 samples from 'tab20'
    num_colors = max(20, len(set(groups)))
    colors = plt.get_cmap('tab20')(np.linspace(0, 1, num_colors))[:, :3]  # RGB colors without alpha

    if plot_style == 'plotly':
        # Convert RGB colors to plotly format
        plotly_colors = [f'rgb({int(r*255)},{int(g*255)},{int(b*255)})' for r, g, b in colors]
        plotly_node_colors = [plotly_colors[group % len(plotly_colors)] for group in groups]

        # Create edges trace; None separates the individual line segments
        edge_x = []
        edge_y = []
        for edge in G.es:
            source, target = edge.tuple
            edge_x.extend([xs[source], xs[target], None])
            edge_y.extend([ys[source], ys[target], None])

        edges_trace = go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=0.5, color='#888'),
            hoverinfo='none',
            mode='lines',
            showlegend=False
        )

        # Compute names and degrees once, not once per node.
        names = G.vs["name"]
        degrees = G.degree()
        hover_text = [
            f"User: {name}<br>Degree: {degree}<br>Community: {group}"
            for name, degree, group in zip(names, degrees, groups)
        ]

        # Create nodes trace
        nodes_trace = go.Scatter(
            x=xs, y=ys,
            mode='markers',
            hoverinfo='text',
            marker=dict(
                size=sizes,
                color=plotly_node_colors,
                line=dict(width=2)
            ),
            text=hover_text,
            showlegend=False
        )

        fig = go.Figure(
            data=[edges_trace, nodes_trace],
            layout=go.Layout(
                title='Twitter URL Sharing Network (Plotly)',
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40),
                xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
            )
        )
        return fig

    # plot_style == 'matplotlib'
    node_colors = [colors[group % len(colors)] for group in groups]
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot edges
    for edge in G.es:
        source, target = edge.tuple
        ax.plot([xs[source], xs[target]], [ys[source], ys[target]],
                color='#888', linewidth=0.5, alpha=0.5)

    # Plot nodes with direct RGB colors
    ax.scatter(xs, ys,
               s=[s*100 for s in sizes],  # Increased scale factor for better visibility
               c=node_colors,
               linewidth=2,
               edgecolor='white')

    # Configure plot
    ax.set_title('Twitter URL Sharing Network (Matplotlib)')
    ax.axis('off')
    fig.tight_layout()

    return fig

## Example Usage

Let's demonstrate how to use these functions with sample data and show both visualization styles.

In [None]:
# Load sample data. The encoding is explicit so the read does not depend on
# the platform's default text encoding.
with open('active_user_graph_50.json', 'r', encoding='utf-8') as f:
    sample_data = json.load(f)

# Process tweets
df_tweets = process_tweets(sample_data)
print(f"Processed {len(df_tweets)} tweets with URLs")

# Build network
G, author_to_idx = build_network(df_tweets, time_window_hours=6, min_cooccurrence=2)
num_users = len(G.vs)
num_connections = len(G.es)
print("\nNetwork Statistics:")
print(f"Number of Users: {num_users}")
print(f"Number of Connections: {num_connections}")
# Density of an undirected simple graph: edges / possible vertex pairs.
# With fewer than two vertices it is reported as 0.
density = round(2 * num_connections / (num_users * (num_users - 1)), 4) if num_users > 1 else 0
print(f"Network Density: {density}")

# Create and display Plotly visualization
print("\nCreating Plotly visualization...")
fig_plotly = create_network_visualization(G, author_to_idx, plot_style='plotly')
fig_plotly.show()

# Create and display Matplotlib visualization
print("\nCreating Matplotlib visualization...")
fig_mpl = create_network_visualization(G, author_to_idx, plot_style='matplotlib')
plt.show()