## Comparative Road Network Analysis: DATA GENERATION

In [11]:
import pandas as pd
import numpy as np
import networkx as nx
from itertools import combinations
import random
                                                                                                     # Setting random seed for reproducibility
random.seed(0)
np.random.seed(0)

# Load the city names (one per line), ignoring blank lines so an empty
# string never becomes a city node.
with open('cityNames.txt', 'r') as file:
    stripped_names = (line.strip() for line in file)
    city_names = [name for name in stripped_names if name]

# Remove duplicates while keeping first-seen file order. A plain set() would
# order the names by string hash, which changes between interpreter runs
# (hash randomization), so the slice below would select a different subset
# each time and the seeds above could not make the output reproducible.
city_names = list(dict.fromkeys(city_names))
print(f"Total cities in 'cityNames.txt': {len(city_names)}")

# Limit to the first 200 cities for the graph
city_names = city_names[:200]
print(f"Using {len(city_names)} cities for the graph.")
                                                                                                     # Creating an undirected graph with 200 cities (nodes)
G = nx.Graph()
G.add_nodes_from(city_names)

# Every unordered pair of distinct cities is a candidate road; shuffling the
# candidates makes the first 3000 of them a random selection.
edges = list(combinations(city_names, 2))
random.shuffle(edges)

# One random distance in [100, 1000] per candidate road
distances = np.random.randint(100, 1001, size=len(edges))

# Attach each distance to its city pair, skipping any pair that would be a self-loop
edge_list = [
    (start, end, {'distance': length})
    for (start, end), length in zip(edges, distances)
    if start != end
]

# Only the first 3,000 candidate roads go into the graph
G.add_edges_from(edge_list[:3000])

# Sanity checks on those 3,000 roads: compare the number of roads with the
# number of distinct order-independent pairs, and look for self-loops.
first_3000_edges = [edge[:2] for edge in edge_list[:3000]]
first_3000_edges_set = {tuple(sorted(pair)) for pair in first_3000_edges}
first_3000_duplicates_check = len(first_3000_edges) == len(first_3000_edges_set)
first_3000_self_loops_check = not any(start == end for start, end in first_3000_edges)

print(f"Verification for first 3000 edges:")
print(f"1. No duplicates in the first 3000 edges: {first_3000_duplicates_check}")
print(f"2. No self-loops in the first 3000 edges: {first_3000_self_loops_check}")
                                                                                                      # Ensuring each city has at least 15 connections
# G is undirected, so a city's "incoming" and "outgoing" roads are the same
# neighbor set; its degree is the only number that needs checking.
min_connections = 15
if city_names and len(city_names) <= min_connections:
    # Without self-loops or parallel roads a city can have at most
    # len(city_names) - 1 neighbors, so the target could never be reached
    # (the previous retry loop would have spun forever in this case).
    raise ValueError(
        f"Need more than {min_connections} cities to give each one "
        f"{min_connections} connections, got {len(city_names)}."
    )

# A single pass is enough: adding a road only ever raises degrees, so a city
# that has been brought up to the minimum cannot drop below it again.
for city in city_names:
    missing = min_connections - G.degree(city)
    if missing <= 0:
        continue
    # Sample only from cities not yet connected to this one, so every draw
    # produces a new road and no duplicate edge can be created.
    candidates = [c for c in city_names if c != city and not G.has_edge(city, c)]
    for neighbor in random.sample(candidates, missing):
        G.add_edge(city, neighbor, distance=np.random.randint(100, 1001))
  
                                                                                                       # Calculating the average number of connections per city
# NOTE: this average is taken before the top-up below, so it describes the
# network at this point rather than the final 10,000-row dataset.
avg_connections = np.mean([len(list(G.neighbors(city))) for city in city_names])

# Ensuring dataset has 10,000 rows by adding more edges if necessary
target_rows = 10000
max_possible_edges = len(city_names) * (len(city_names) - 1) // 2
if max_possible_edges < target_rows:
    # With too few cities there are not enough distinct pairs, and the
    # top-up loop below could never finish.
    raise ValueError(
        f"{len(city_names)} cities allow at most {max_possible_edges} unique roads; "
        f"cannot build a dataset of {target_rows} rows."
    )

# Seed the tracker with the roads already in the graph. Starting from an empty
# set would let an existing road be drawn again, and G.add_edge would then
# silently overwrite its distance without adding a row.
unique_edges_set = {tuple(sorted(pair)) for pair in G.edges()}
while len(unique_edges_set) < target_rows:
    city1, city2 = random.sample(city_names, 2)   # two distinct cities, so never a self-loop
    edge_pair = tuple(sorted((city1, city2)))
    if edge_pair in unique_edges_set:
        continue
    G.add_edge(city1, city2, distance=np.random.randint(100, 1001))
    unique_edges_set.add(edge_pair)

# Converting the graph edges to a Pandas DataFrame once, after all edges exist
# (rebuilding it after every added edge is quadratic work for the same result).
df = pd.DataFrame([(u, v, d['distance']) for u, v, d in G.edges(data=True)], columns=['city1', 'city2', 'distance'])

                                                                                                        # Verifying dataset integrity
def _as_sorted_pair(row):
    # Order-independent key for a road, so A-B and B-A compare equal
    return tuple(sorted((row['city1'], row['city2'])))


# Temporary helper column used only for the duplicate check below
df['sorted_pair'] = df.apply(_as_sorted_pair, axis=1)

uniqueness_check = df['sorted_pair'].duplicated().sum() == 0           # no city pair appears twice
size_check = df.shape[0] == 10000                                      # exactly 10,000 rows
connectivity_check = nx.is_connected(G)                                # every city reachable from every other
min_connections_check = all(len(G[city]) >= 15 for city in city_names) # at least 15 neighbors per city
self_loops_check = not df['city1'].eq(df['city2']).any()               # no road from a city to itself

# Report the results in a fixed order
report_lines = [
    f"Average number of connections per city: {avg_connections:.2f}",
    "Verification:",
    f"1. Network Structure: Connected road network with distances between cities: {connectivity_check}",
    f"2. Uniqueness: No duplicate entries for city pairs: {uniqueness_check}",
    f"3. Size: Dataset has 10000 rows (200 nodes, 3000 unique edges): {size_check}",
    f"4. Connectivity: Each city has multiple incoming and outgoing paths: {min_connections_check}",
    f"5. No self-loops: No city is connected to itself: {self_loops_check}",
]
for report_line in report_lines:
    print(report_line)

# Number of distinct cities (nodes) in the graph
num_unique_cities = G.number_of_nodes()
print(f"Number of unique cities (nodes): {num_unique_cities}")

# The helper column must not end up in the exported file
del df['sorted_pair']

# Write the final edge table (city1, city2, distance) to disk
df.to_csv('road_network.csv', index=False)


Total cities in 'cityNames.txt': 366
Using 200 cities for the graph.
Verification for first 3000 edges:
1. No duplicates in the first 3000 edges: True
2. No self-loops in the first 3000 edges: True
Average number of connections per city: 30.00
Verification:
1. Network Structure: Connected road network with distances between cities: True
2. Uniqueness: No duplicate entries for city pairs: True
3. Size: Dataset has 10000 rows (200 nodes, 3000 unique edges): True
4. Connectivity: Each city has multiple incoming and outgoing paths: True
5. No self-loops: No city is connected to itself: True
Number of unique cities (nodes): 200


**ABOUT THE DATASET**

- **Graph Size and Structure:** The initial graph consists of 200 cities and 3,000 unique edges (later topped up to 10,000 unique edges for the exported dataset), forming a well-connected road network with a distance defined for every road between two cities.

- **Data Integrity:** There are no duplicates in the first 3000 edges, ensuring clean data input, and no self-loops, meaning no city is connected to itself.

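- **Network Characteristics:** The network is connected (every city can be reached from every other city), and every city has at least 15 roads. Because the graph is undirected, each road serves as both an incoming and an outgoing connection, which ensures robust connectivity.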

- **Dataset Completeness:** The dataset contains 10,000 rows **(NOTE: ONLY 3,000 WERE USED IN THE FINAL TASKS)**, each describing a unique city pair, confirming that the cities and edges are properly represented without redundancy.

- **Verification Accuracy:** All verifications (no duplicates, no self-loops, and connectedness) returned true, assuring the validity of the graph data.

## ROAD NETWORKS: Batch Import for Large Files and Dataset Verification

In [18]:
from neo4j import GraphDatabase
import pandas as pd

# Neo4j connection details.
# Values are read from the environment so credentials do not have to live in
# the notebook; the literals are only fallbacks that keep the existing local
# setup working.
# NOTE(review): a real password is committed here as the fallback. Rotate it
# and remove the literal once NEO4J_PASSWORD is configured.
import os

uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "#Jzee2019")

# Initialize the Neo4j driver
driver = GraphDatabase.driver(uri, auth=(username, password))

def batch_import(file_name, batch_size=1000, max_edges=3000, random_state=None):
    """
    Batch imports a CSV file into Neo4j as city nodes and road relationships.

    The rows are shuffled, cut into batches of ``batch_size`` rows and handed
    to ``process_batch`` one batch at a time until ``max_edges`` relationships
    have been created or the file is exhausted. Progress is printed after
    every batch.

    Args:
        file_name: Path of the CSV file to import; it is read with
            ``pandas.read_csv`` and its rows are passed to ``process_batch``.
        batch_size: Maximum number of rows sent to Neo4j per batch. Must be
            a positive integer.
        max_edges: Upper bound on the number of relationships created by
            this call.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the imported subset) can be reproduced.
            ``None`` keeps the previous unseeded behaviour.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_name)

    # Shuffle the rows so the imported subset is a random sample of the file
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Number of batches needed to cover every row (ceiling division)
    total_rows = len(df)
    total_batches = -(-total_rows // batch_size)
    print(f"Total rows: {total_rows}, Total batches: {total_batches}")

    edges_added_total = 0  # Counter to track total edges created

    for batch_num in range(total_batches):
        # Never send more rows than there are edges left to create. Without
        # this trim the limit is only checked after a full batch, so a
        # batch_size that does not divide max_edges overshoots the limit.
        remaining = max(max_edges - edges_added_total, 0)
        start = batch_num * batch_size
        batch = df.iloc[start:start + min(batch_size, remaining)]

        # Process the batch (nothing to send when the trimmed batch is empty)
        edges_added = process_batch(batch) if len(batch) else 0
        edges_added_total += edges_added

        print(f"Batch {batch_num + 1}/{total_batches} processed, edges added: {edges_added}, total: {edges_added_total}")

        # Stop as soon as the edge limit has been reached
        if edges_added_total >= max_edges:
            print(f"Reached {max_edges} edges, stopping import.")
            break

    print(f"Total edges added: {edges_added_total}")

def process_batch(batch):
    """
    Processes a batch of rows and imports them into Neo4j.

    Each row must provide ``city1``, ``city2`` and ``distance``. Both cities
    are merged as ``City`` nodes and a single ``ROAD`` relationship is merged
    between them, using the module-level ``driver``.

    Args:
        batch: DataFrame slice whose rows are sent to Neo4j as the ``$batch``
            query parameter.

    Returns:
        The number of relationships created, as reported by the query
        summary counters. Rows whose road already exists create nothing.
    """
    # The relationship is merged on the city pair alone, in either direction,
    # and the distance is set only when the road is first created. Merging on
    # the distance property (or on a fixed direction) would add a second ROAD
    # between the same two cities whenever the distance or the column order
    # differed, which contradicts the no-duplicates guarantee.
    query = """
    UNWIND $batch AS row
    MERGE (c1:City {name: row.city1})
    MERGE (c2:City {name: row.city2})
    MERGE (c1)-[r:ROAD]-(c2)
    ON CREATE SET r.distance = toInteger(row.distance)
    """

    rows = batch.to_dict(orient='records')
    if not rows:
        # Nothing to import; avoid a pointless round trip to the database.
        return 0

    with driver.session() as session:
        # Consuming the result yields the summary that carries the counters
        summary = session.run(query, batch=rows).consume()

    return summary.counters.relationships_created

def verify_import():
    """
    Prints how many City nodes and ROAD relationships the database holds.

    Both counts are fetched in one session through the module-level
    ``driver``; nothing is returned.
    """
    node_count_query = """
    MATCH (c:City)
    RETURN COUNT(c) AS TotalCities;
    """

    relationship_count_query = """
    MATCH ()-[r:ROAD]->()
    RETURN COUNT(r) AS TotalEdges;
    """

    def fetch_count(db_session, cypher, column):
        # Run the query and read the named column from its single record
        return db_session.run(cypher).single()[column]

    with driver.session() as db_session:
        # Nodes first, then relationships
        city_total = fetch_count(db_session, node_count_query, "TotalCities")
        road_total = fetch_count(db_session, relationship_count_query, "TotalEdges")

        print(f"Total Cities Imported (Nodes): {city_total}")
        print(f"Total Roads Imported (Edges): {road_total}")

# CSV written by the data-generation step
file_name = "road_network.csv"

# Import at most 3000 roads from that file
batch_import(file_name=file_name, max_edges=3000)

# Report what is now stored in the database
print("Data import successful. Running verification checks...")
verify_import()

Total rows: 10000, Total batches: 10
Batch 1/10 processed, edges added: 1000, total: 1000
Batch 2/10 processed, edges added: 1000, total: 2000
Batch 3/10 processed, edges added: 1000, total: 3000
Reached 3000 edges, stopping import.
Total edges added: 3000
Data import successful. Running verification checks...
Total Cities Imported (Nodes): 200
Total Roads Imported (Edges): 3000


**DATA IMPORT**

- **Data Import Process:** The 10,000-row dataset was shuffled and split into 10 batches of 1,000 rows each. The import stopped after the first 3 batches, once the required limit of 3,000 edges was reached, so the imported edges are a random sample drawn from across the whole dataset (for a more representative selection).

- **Successful Data Import:** The import was successful, adding a total of 200 cities (nodes) and 3000 roads (edges) to the Neo4j database.

- **Efficient Handling:** The system stopped after processing 3000 edges, confirming that the dataset was efficiently imported within the set constraints.

- **Verification Checks:** After importing, the verification checks confirmed that 200 cities and 3000 roads were correctly added without discrepancies.

- **Data Integrity:** The successful import and verification of cities and roads ensure that the dataset is complete and structured as intended for further analysis.