1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.


In [None]:
import configparser

def display_core_components(config_file_path):
    """Print the ``fs.defaultFS`` setting found in a Hadoop configuration file.

    Hadoop's ``core-site.xml`` is an XML document made of
    ``<property><name>...</name><value>...</value></property>`` entries, which
    ``configparser`` cannot read because it expects INI ``[section]`` headers.
    The file is therefore parsed as XML first. A file that cannot be opened
    or is not well-formed XML falls back to the INI lookup of option
    ``fs.defaultFS`` in section ``core-site``, so INI-style inputs and the
    exceptions raised for them keep working as before.

    Args:
        config_file_path: Path to the configuration file.

    Raises:
        configparser.NoOptionError: The XML file has no ``fs.defaultFS``
            property, or the INI section lacks that option.
        configparser.NoSectionError: INI fallback only; the file is missing,
            unreadable, or has no ``core-site`` section.
    """
    # Local import keeps this cell self-contained.
    import xml.etree.ElementTree as ET

    try:
        root = ET.parse(config_file_path).getroot()
    except (ET.ParseError, OSError):
        # Not XML (or not readable): use the INI-style lookup.
        config = configparser.ConfigParser()
        config.read(config_file_path)
        core_components = config.get('core-site', 'fs.defaultFS')
    else:
        core_components = None
        for prop in root.iter('property'):
            if (prop.findtext('name') or '').strip() == 'fs.defaultFS':
                core_components = (prop.findtext('value') or '').strip()
                break
        if core_components is None:
            raise configparser.NoOptionError('fs.defaultFS', 'core-site')

    # Display the core components
    print("Core Components of Hadoop:")
    print(core_components)

# Placeholder location of core-site.xml; replace with the real path before running.
config_file_path = "/path/to/core-site.xml"

# Print the setting read from that file.
display_core_components(config_file_path=config_file_path)


2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.


In [None]:
from pywebhdfs.webhdfs import PyWebHdfsClient

def calculate_directory_size(hdfs_host, hdfs_port, hdfs_user, hdfs_directory, *, recursive=False):
    """Return the summed ``length`` of the entries in an HDFS directory.

    The directory is listed through WebHDFS and the ``length`` field of every
    returned ``FileStatus`` entry is added up.

    Args:
        hdfs_host: Host name passed to ``PyWebHdfsClient``.
        hdfs_port: Port passed to ``PyWebHdfsClient``.
        hdfs_user: User name passed to ``PyWebHdfsClient``.
        hdfs_directory: HDFS path whose contents are measured.
        recursive: When ``False`` (the default) only the direct children of
            ``hdfs_directory`` are counted. When ``True`` every entry whose
            ``type`` is ``'DIRECTORY'`` is listed as well, so files nested at
            any depth contribute to the total.

    Returns:
        The total size as an ``int``.
    """
    # Local import keeps this cell self-contained; HDFS paths are POSIX-style.
    import posixpath

    client = PyWebHdfsClient(host=hdfs_host, port=hdfs_port, user_name=hdfs_user)

    total_size = 0
    # Explicit stack instead of recursion, so one client serves every listing
    # and deep trees cannot hit the interpreter's recursion limit.
    pending = [hdfs_directory]
    while pending:
        current = pending.pop()
        listing = client.list_dir(current)
        for file_info in listing['FileStatuses']['FileStatus']:
            total_size += int(file_info['length'])
            if recursive and file_info['type'] == 'DIRECTORY':
                pending.append(posixpath.join(current, file_info['pathSuffix']))

    return total_size

# Placeholder WebHDFS connection settings; replace with real values before running.
hdfs_host = "your_hdfs_host"
# NOTE(review): 50070 looks like the older NameNode HTTP default; confirm the port for your cluster.
hdfs_port = 50070
hdfs_user = "your_hdfs_username"

# Directory whose contents are measured.
hdfs_directory = "/path/to/hdfs_directory"

# Sum the entry sizes reported for that directory.
total_file_size = calculate_directory_size(
    hdfs_host=hdfs_host,
    hdfs_port=hdfs_port,
    hdfs_user=hdfs_user,
    hdfs_directory=hdfs_directory,
)

# Report the result in bytes.
print("Total File Size: {} bytes".format(total_file_size))


3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.

In [None]:
import configparser

# NOTE(review): this cell repeats the question-1 solution; it does not implement
# the top-N word-frequency MapReduce task described in the heading above.
def display_core_components(config_file_path):
    """Print the ``fs.defaultFS`` setting found in a Hadoop configuration file.

    Hadoop's ``core-site.xml`` is an XML document made of
    ``<property><name>...</name><value>...</value></property>`` entries, which
    ``configparser`` cannot read because it expects INI ``[section]`` headers.
    The file is therefore parsed as XML first. A file that cannot be opened
    or is not well-formed XML falls back to the INI lookup of option
    ``fs.defaultFS`` in section ``core-site``, so INI-style inputs and the
    exceptions raised for them keep working as before.

    Args:
        config_file_path: Path to the configuration file.

    Raises:
        configparser.NoOptionError: The XML file has no ``fs.defaultFS``
            property, or the INI section lacks that option.
        configparser.NoSectionError: INI fallback only; the file is missing,
            unreadable, or has no ``core-site`` section.
    """
    # Local import keeps this cell self-contained.
    import xml.etree.ElementTree as ET

    try:
        root = ET.parse(config_file_path).getroot()
    except (ET.ParseError, OSError):
        # Not XML (or not readable): use the INI-style lookup.
        config = configparser.ConfigParser()
        config.read(config_file_path)
        core_components = config.get('core-site', 'fs.defaultFS')
    else:
        core_components = None
        for prop in root.iter('property'):
            if (prop.findtext('name') or '').strip() == 'fs.defaultFS':
                core_components = (prop.findtext('value') or '').strip()
                break
        if core_components is None:
            raise configparser.NoOptionError('fs.defaultFS', 'core-site')

    # Display the core components
    print("Core Components of Hadoop:")
    print(core_components)

# Placeholder location of core-site.xml; replace with the real path before running.
config_file_path = "/path/to/core-site.xml"

# Print the setting read from that file.
display_core_components(config_file_path=config_file_path)


4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests
import json

def check_hadoop_cluster_health(namenode_host, namenode_port):
    """Query two JMX beans on the given host/port and print each reported state.

    Both URLs target the same ``/jmx`` endpoint and are handed to
    ``check_health_status``; whatever that helper returns is printed under
    the matching heading.

    Args:
        namenode_host: Host name used to build the JMX URLs.
        namenode_port: Port used to build the JMX URLs.
    """
    # NOTE(review): the DataNode bean is requested from the NameNode address;
    # confirm that endpoint actually exposes it.
    probes = (
        (
            "NameNode Health Status:",
            f"http://{namenode_host}:{namenode_port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus",
        ),
        (
            "\nDataNode Health Status:",
            f"http://{namenode_host}:{namenode_port}/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo",
        ),
    )

    # Probe in order: fetch the status first, then print heading and value.
    for heading, url in probes:
        status = check_health_status(url)
        print(heading)
        print(status)

def check_health_status(url, timeout=10):
    """Fetch a JMX URL and return the ``State`` of the first bean in the reply.

    Args:
        url: Full JMX query URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        The value stored under ``beans[0]['State']`` in the JSON response, or
        ``None`` when the request fails or the response does not have that
        shape. In both failure cases a message is printed.
    """
    try:
        # Send GET request to Hadoop REST API
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Parse the JSON body and pull the state out of the first bean
        return response.json()['beans'][0]['State']

    except requests.exceptions.RequestException as e:
        print("Error occurred while checking health status:", str(e))
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # Non-JSON body, empty ``beans`` list, or a bean without ``State``:
        # report it instead of crashing the caller.
        print("Unexpected response while checking health status:", repr(e))

    return None

# Placeholder NameNode address; replace with real values before running.
namenode_host = "your_namenode_host"
# NOTE(review): 50070 looks like the older NameNode HTTP default; confirm the port for your cluster.
namenode_port = 50070

# Run both health probes and print what they report.
check_hadoop_cluster_health(namenode_host=namenode_host, namenode_port=namenode_port)


5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
from pywebhdfs.webhdfs import PyWebHdfsClient

def list_hdfs_path(hdfs_host, hdfs_port, hdfs_user, hdfs_path):
    """Print one labelled line per entry found directly under an HDFS path.

    Entries whose ``type`` is ``'DIRECTORY'`` are labelled ``[Directory]``;
    every other entry is labelled ``[File]``.

    Args:
        hdfs_host: Host name passed to ``PyWebHdfsClient``.
        hdfs_port: Port passed to ``PyWebHdfsClient``.
        hdfs_user: User name passed to ``PyWebHdfsClient``.
        hdfs_path: HDFS path to list.
    """
    webhdfs = PyWebHdfsClient(host=hdfs_host, port=hdfs_port, user_name=hdfs_user)
    listing = webhdfs.list_dir(hdfs_path)

    for entry in listing['FileStatuses']['FileStatus']:
        # Pick the label template first, then fill in the entry name.
        template = "[Directory] {}" if entry['type'] == 'DIRECTORY' else "[File]      {}"
        print(template.format(entry['pathSuffix']))

# Placeholder WebHDFS connection settings; replace with real values before running.
hdfs_host = "your_hdfs_host"
# NOTE(review): 50070 looks like the older NameNode HTTP default; confirm the port for your cluster.
hdfs_port = 50070
hdfs_user = "your_hdfs_username"

# Directory whose entries are printed.
hdfs_path = "/path/to/hdfs_directory"

# Print every file and directory found directly under that path.
list_hdfs_path(
    hdfs_host=hdfs_host,
    hdfs_port=hdfs_port,
    hdfs_user=hdfs_user,
    hdfs_path=hdfs_path,
)


6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
import requests
import json

def analyze_data_node_storage(hdfs_host, hdfs_port, timeout=10):
    """Print the DataNodes with the highest and lowest storage capacity.

    The JMX endpoint is queried once. The first bean's ``StorageInfo`` value
    is decoded as JSON and its ``DataNodeStorageInfo`` list is read. For each
    node a utilization percentage is derived from ``capacity`` and
    ``remaining``.

    Args:
        hdfs_host: Host name used to build the JMX URL.
        hdfs_port: Port used to build the JMX URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Raises:
        requests.exceptions.RequestException: The request failed or the
            server answered with an HTTP error status.
    """
    # NOTE(review): the DataNode bean and the StorageInfo/DataNodeStorageInfo
    # schema are taken as-is from the original code; verify them against the
    # JMX output of the target cluster.
    datanode_info_url = f"http://{hdfs_host}:{hdfs_port}/jmx?qry=Hadoop:service=DataNode,name=DataNodeInfo"

    # Send GET request to Hadoop REST API; fail early on HTTP errors instead
    # of trying to parse an error page as JSON.
    response = requests.get(datanode_info_url, timeout=timeout)
    response.raise_for_status()

    beans = response.json()['beans']
    if not beans:
        # An empty bean list used to surface as an IndexError.
        print("No DataNode information returned by the JMX endpoint.")
        return

    # StorageInfo is itself a JSON-encoded string inside the bean.
    storage_info = json.loads(beans[0]['StorageInfo'])

    data_nodes_info = []
    for data_node in storage_info['DataNodeStorageInfo']:
        capacity = data_node['capacity']
        remaining = data_node['remaining']
        capacity_value = float(capacity)
        # A node reporting zero capacity would otherwise divide by zero.
        if capacity_value:
            utilization = 100 - (float(remaining) / capacity_value) * 100
        else:
            utilization = 0.0

        data_nodes_info.append({
            'node_name': data_node['datanode'],
            'capacity': capacity,
            'remaining': remaining,
            'utilization': utilization
        })

    if not data_nodes_info:
        # Nothing to rank; indexing the empty list would raise IndexError.
        print("No DataNode storage information available.")
        return

    # Compare capacities numerically: raw values may be strings, which would
    # otherwise sort lexicographically ("9" after "10").
    sorted_data_nodes = sorted(data_nodes_info, key=lambda x: float(x['capacity']))

    # Print DataNodes with highest and lowest storage capacities
    print("DataNode with highest storage capacity:")
    print(sorted_data_nodes[-1])

    print("\nDataNode with lowest storage capacity:")
    print(sorted_data_nodes[0])

# Placeholder cluster address; replace with real values before running.
hdfs_host = "your_hdfs_host"
# NOTE(review): 50070 looks like the older NameNode HTTP default; confirm the port for your cluster.
hdfs_port = 50070

# Print the DataNodes with the largest and smallest capacity.
analyze_data_node_storage(hdfs_host=hdfs_host, hdfs_port=hdfs_port)
