<a href="https://colab.research.google.com/github/ShrikantKGIT/general/blob/main/Distributed_Hard_Drive_Retrieval_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Simple distributed hard drive retrieval system in Python.**

This system simulates a cluster of independent storage nodes (hard drives) that work together. A central manager keeps track of where data chunks are stored, and a client reads and writes files. When a node fails, the system automatically handles the failure by retrieving the affected data from a replica.

The key concepts demonstrated are:

**Sharding:** Data is split into smaller, fixed-size chunks that are distributed across the storage nodes.

**Replication:** Each chunk is stored on multiple nodes for redundancy.

**Fault Tolerance:** The system can still retrieve a file even if one of the nodes holding its data goes offline.

In [1]:
import hashlib
import random

# --- Part 1: The Individual Storage Node (Simulated Hard Drive) ---
class StorageNode:
    """
    One simulated hard drive in the distributed network.

    Chunks are kept in an in-memory dict keyed by chunk hash. The node can be
    switched offline and back online; every read or write on an offline node
    raises ConnectionError.
    """
    def __init__(self, node_id):
        self.node_id = node_id
        self.is_online = True
        self.storage = {}  # chunk_hash -> chunk data
        print(f"Node '{self.node_id}' initialized.")

    def _require_online(self):
        """Raise ConnectionError when the node is currently offline."""
        if self.is_online:
            return
        raise ConnectionError(f"Node '{self.node_id}' is offline.")

    def write_chunk(self, chunk_hash, data):
        """Store `data` under `chunk_hash`; raises ConnectionError if offline."""
        self._require_online()
        self.storage[chunk_hash] = data

    def read_chunk(self, chunk_hash):
        """
        Return the data stored under `chunk_hash`.

        Raises ConnectionError if the node is offline and FileNotFoundError
        if the node does not hold that chunk.
        """
        self._require_online()
        if chunk_hash in self.storage:
            return self.storage[chunk_hash]
        raise FileNotFoundError(f"Chunk not found on Node '{self.node_id}'.")

    def unplug(self):
        """Mark the node as offline (simulated failure)."""
        self.is_online = False
        print(f"CRITICAL: Node '{self.node_id}' has gone offline!")

    def plug_in(self):
        """Mark the node as online again."""
        self.is_online = True
        print(f"INFO: Node '{self.node_id}' is back online.")

# --- Part 2: The Cluster Manager (Keeps track of everything) ---
class ClusterManager:
    """
    Central index for the cluster of storage nodes.

    Tracks the registered nodes, which chunk hashes make up each file, and
    which node ids hold a copy of each chunk. Nodes are any objects exposing
    `node_id` and `is_online` attributes.
    """
    def __init__(self, replication_factor=2):
        self.nodes = {}  # node_id -> node_instance
        self.chunk_map = {}  # chunk_hash -> [node_id_1, node_id_2, ...]
        self.file_map = {}  # filename -> [chunk_hash_1, chunk_hash_2, ...]
        self.replication_factor = replication_factor

    def register_node(self, node):
        """Adds a storage node to the cluster, keyed by its `node_id`."""
        self.nodes[node.node_id] = node
        print(f"Node '{node.node_id}' registered with the cluster.")

    def get_storage_nodes_for_chunk(self):
        """
        Selects `replication_factor` distinct online nodes at random.

        Returns:
            A list of node instances to hold one chunk and its replicas.

        Raises:
            ConnectionError: fewer online nodes than `replication_factor`.
        """
        online_nodes = [n for n in self.nodes.values() if n.is_online]
        if len(online_nodes) < self.replication_factor:
            raise ConnectionError("Not enough online nodes to meet replication factor.")

        # random.sample draws without replacement, so replicas never share a node.
        return random.sample(online_nodes, self.replication_factor)

    def add_file_to_index(self, filename, chunk_hashes, chunk_locations):
        """
        Records a file's chunk list and where each of its chunks is stored.

        Args:
            filename: Name under which the file is indexed (an existing
                entry with the same name is replaced).
            chunk_hashes: Ordered chunk hashes making up the file.
            chunk_locations: Mapping of chunk_hash -> list of node instances
                that received a copy of that chunk.
        """
        # Store a copy so later mutation of the caller's list cannot corrupt the index.
        self.file_map[filename] = list(chunk_hashes)
        for chunk_hash, nodes in chunk_locations.items():
            # Chunks are content-addressed: a hash that is already indexed
            # refers to identical data, so nodes recorded earlier still hold
            # a valid copy. Merge instead of overwriting so no replica is
            # forgotten when a file is rewritten or two files share a chunk.
            known_ids = self.chunk_map.setdefault(chunk_hash, [])
            for node in nodes:
                if node.node_id not in known_ids:
                    known_ids.append(node.node_id)

    def get_file_chunk_hashes(self, filename):
        """Returns the list of chunk hashes for a file, or None if it is not indexed."""
        return self.file_map.get(filename)

    def get_chunk_locations(self, chunk_hash):
        """
        Returns the node instances recorded as holding `chunk_hash`.

        Online nodes come first, offline nodes last; within each group the
        recorded order is kept. Returns an empty list for an unknown hash.
        """
        candidates = [self.nodes[nid] for nid in self.chunk_map.get(chunk_hash, [])]
        # False sorts before True and sorted() is stable, so this moves online
        # nodes to the front without disturbing their relative order.
        return sorted(candidates, key=lambda node: not node.is_online)

# --- Part 3: The Client (API for reading and writing files) ---
class Client:
    """
    Provides a simple API to read and write files on the distributed storage
    system through a cluster manager.
    """
    CHUNK_SIZE = 1024  # 1 KB chunks

    def __init__(self, cluster_manager):
        self.manager = cluster_manager

    def write(self, filename, data):
        """
        Writes a file to the distributed system.

        1. Encodes the text as UTF-8 and splits it into CHUNK_SIZE-byte chunks.
        2. For each distinct chunk, asks the manager for nodes to store it on.
        3. Writes the chunk to every selected node (primary + replicas).
        4. Registers the file and its chunk locations with the manager.

        Args:
            filename: Name to index the file under.
            data: File content as a str.

        Returns:
            True on success; False if a chunk could not be placed or written
            because of a ConnectionError (the file is then not indexed).
        """
        print(f"\nWriting file '{filename}' to the cluster...")
        content_bytes = data.encode('utf-8')
        chunk_hashes = []
        chunk_locations = {}

        for i in range(0, len(content_bytes), self.CHUNK_SIZE):
            chunk = content_bytes[i:i + self.CHUNK_SIZE]
            chunk_hash = hashlib.sha256(chunk).hexdigest()
            chunk_hashes.append(chunk_hash)

            # Identical chunks hash identically. Placing the same chunk again
            # would overwrite its entry in chunk_locations and orphan the
            # copies written the first time, so reuse the first placement.
            if chunk_hash in chunk_locations:
                continue

            try:
                nodes_for_chunk = self.manager.get_storage_nodes_for_chunk()
                for node in nodes_for_chunk:
                    node.write_chunk(chunk_hash, chunk)
            except ConnectionError as e:
                print(f"Error writing chunk: {e}")
                return False

            chunk_locations[chunk_hash] = nodes_for_chunk
            print(f"  - Wrote chunk '{chunk_hash[:8]}...' to nodes {[n.node_id for n in nodes_for_chunk]}")

        self.manager.add_file_to_index(filename, chunk_hashes, chunk_locations)
        print(f"File '{filename}' written successfully.")
        return True

    def read(self, filename):
        """
        Reads a file from the distributed system.

        1. Gets the list of chunk hashes for the file.
        2. For each chunk, gets its candidate locations from the manager.
        3. Tries each location in order until one read succeeds.
        4. Reassembles the chunks and decodes them as UTF-8.

        Returns:
            The file content as a str ('' for a file written with no data).

        Raises:
            FileNotFoundError: the file is not in the cluster index.
            IOError: a chunk has no recorded location, or every recorded
                location failed to return it.
        """
        print(f"\nReading file '{filename}' from the cluster...")
        chunk_hashes = self.manager.get_file_chunk_hashes(filename)
        # Only None means "not indexed"; an empty list is a valid empty file.
        if chunk_hashes is None:
            raise FileNotFoundError(f"File '{filename}' not found in the cluster index.")

        parts = []
        for chunk_hash in chunk_hashes:
            locations = self.manager.get_chunk_locations(chunk_hash)
            if not locations:
                # The manager returned no candidate nodes for this chunk.
                raise IOError(f"FATAL: All nodes for chunk '{chunk_hash[:8]}...' are offline. Data is lost.")

            for node in locations:
                try:
                    chunk_data = node.read_chunk(chunk_hash)
                except (ConnectionError, FileNotFoundError) as e:
                    print(f"  - Could not read chunk '{chunk_hash[:8]}...' from Node '{node.node_id}': {e}. Trying replica...")
                    continue
                print(f"  - Read chunk '{chunk_hash[:8]}...' successfully from Node '{node.node_id}'")
                parts.append(chunk_data)
                break  # Success, move to the next chunk
            else:
                # Loop finished without a break: every location failed.
                raise IOError(f"FATAL: Failed to read chunk '{chunk_hash[:8]}...' from any of its locations.")

        # Join once (linear) and decode only after reassembly, because a
        # multi-byte UTF-8 character may straddle a chunk boundary.
        return b''.join(parts).decode('utf-8')

# --- Part 4: Example Usage ---
if __name__ == "__main__":
    # --- Cluster setup: four nodes, two copies of every chunk ---
    manager = ClusterManager(replication_factor=2)
    nodes = []
    for index in range(4):
        nodes.append(StorageNode(f"Node-{index}"))
    for storage_node in nodes:
        manager.register_node(storage_node)

    # --- Client that talks to the cluster through the manager ---
    client = Client(manager)

    # --- Store a document, then read it back as a sanity check ---
    my_document = "This is a very important document about distributed systems. It needs to be stored reliably and should survive the failure of a single node."
    client.write("mydoc.txt", my_document)

    retrieved_doc = client.read("mydoc.txt")
    print(f"\nInitial read successful. Content: '{retrieved_doc}'")

    print("\n" + "="*50 + "\n")

    # --- Simulate a failure: unplug the first node listed for the file's first chunk ---
    doc_chunk_hashes = manager.get_file_chunk_hashes("mydoc.txt")
    victim_id = manager.chunk_map[doc_chunk_hashes[0]][0]
    manager.nodes[victim_id].unplug()

    # --- Read again; the client is expected to fall back to the replica ---
    try:
        recovered_doc = client.read("mydoc.txt")
        print("\n--- Final Result ---")
        print(f"Original data:  '{my_document}'")
        print(f"Recovered data: '{recovered_doc}'")

        if recovered_doc != my_document:
            print("\nFAILURE: Recovered data does not match the original.")
        else:
            print("\nSUCCESS: Data was automatically recovered from a replica node!")
    except Exception as e:
        print(f"\nAn error occurred during recovery: {e}")


Node 'Node-0' initialized.
Node 'Node-1' initialized.
Node 'Node-2' initialized.
Node 'Node-3' initialized.
Node 'Node-0' registered with the cluster.
Node 'Node-1' registered with the cluster.
Node 'Node-2' registered with the cluster.
Node 'Node-3' registered with the cluster.

Writing file 'mydoc.txt' to the cluster...
  - Wrote chunk 'a1cd542a...' to nodes ['Node-2', 'Node-3']
File 'mydoc.txt' written successfully.

Reading file 'mydoc.txt' from the cluster...
  - Read chunk 'a1cd542a...' successfully from Node 'Node-2'

Initial read successful. Content: 'This is a very important document about distributed systems. It needs to be stored reliably and should survive the failure of a single node.'


CRITICAL: Node 'Node-2' has gone offline!

Reading file 'mydoc.txt' from the cluster...
  - Read chunk 'a1cd542a...' successfully from Node 'Node-3'

--- Final Result ---
Original data:  'This is a very important document about distributed systems. It needs to be stored reliably and should