# 1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [2]:
import configparser

def display_hadoop_components(config_file):
    """Print the sections of an INI-style file that name Hadoop core components.

    Section names are compared case-insensitively against a fixed set of
    component names (namenode, datanode, secondarynamenode, resourcemanager,
    nodemanager).

    Args:
        config_file: Path to the configuration file to parse.

    Returns:
        The matching section names, in file order, spelled as in the file.
        The list is empty when nothing matches or the file cannot be read.
    """
    config = configparser.ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so an empty result means the path was
    # missing or unreadable. Report it instead of printing an empty listing
    # with no explanation.
    if not config.read(config_file):
        print(f"Could not read configuration file: {config_file}")

    # A set gives constant-time membership tests
    core_components = {
        'namenode',
        'datanode',
        'secondarynamenode',
        'resourcemanager',
        'nodemanager',
    }

    found = [
        section for section in config.sections()
        if section.lower() in core_components
    ]

    # Display the core components of Hadoop
    print("Core Components of Hadoop:")
    for section in found:
        print(section)

    return found

# Specify the path to your Hadoop configuration file
# NOTE(review): display_hadoop_components parses this file with configparser,
# which expects INI-style "[section]" headers; a ".sh" file presumably has
# none — confirm the intended file format.
hadoop_config_file = '/path/to/hadoop/conf/hadoop-env.sh'

# Call the function to display the core components
display_hadoop_components(hadoop_config_file)

Core Components of Hadoop:


# 2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.

In [None]:
from hdfs import InsecureClient

def calculate_directory_size(hdfs_url, directory):
    """Return the combined size of the files directly inside an HDFS directory.

    Only regular files that are immediate children of ``directory`` are
    counted; sub-directories are not descended into.

    Args:
        hdfs_url: Base URL of the WebHDFS endpoint passed to ``InsecureClient``.
        directory: HDFS path of the directory to measure.

    Returns:
        The total size in bytes (0 for a directory with no files), or ``None``
        when ``directory`` does not exist or is not a directory.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # strict=False makes status() return None for a missing path instead of
    # raising, so the "not a valid directory" message covers that case too.
    directory_status = client.status(directory, strict=False)
    if directory_status is None or directory_status['type'] != 'DIRECTORY':
        print(f"{directory} is not a valid directory.")
        return None

    # With status=True, list() yields (name, status) tuples rather than bare
    # status dicts, so each entry is unpacked before reading its fields.
    return sum(
        status['length']
        for _name, status in client.list(directory, status=True)
        if status['type'] == 'FILE'
    )

# Specify the HDFS URL and the directory path
hdfs_url = 'http://localhost:50070'
directory_path = '/user/myuser/mydirectory'

# Call the function to calculate the total file size
total_size = calculate_directory_size(hdfs_url, directory_path)

# calculate_directory_size returns None for an invalid directory. Compare with
# None explicitly so an empty directory (0 bytes) is still reported.
if total_size is not None:
    print(f"Total file size in {directory_path}: {total_size} bytes")


# 3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import heapq

class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words of its input.

    Step one counts occurrences of each lower-cased, whitespace-separated
    token. Step two gathers every (count, word) pair under a single key and
    keeps the N largest.
    """

    def configure_args(self):
        """Register the ``--n`` option (how many top words to emit, default 10)."""
        super().configure_args()
        self.add_passthru_arg('--n', type=int, default=10, help='Number of top words to display')

    def steps(self):
        """Describe the counting step followed by the top-N selection step."""
        count_step = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        top_n_step = MRStep(reducer=self.reducer_find_top_n)
        return [count_step, top_n_step]

    def mapper_get_words(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated token, lower-cased."""
        tokens = line.split()
        for token in tokens:
            lowered = token.lower()
            yield lowered, 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate the counts for one word before they reach the reducer."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer_count_words(self, word, counts):
        """Emit ``(total, word)`` under the shared key ``None`` for the next step."""
        total = sum(counts)
        # The count goes first so that pairs compare by count in the next step
        yield None, (total, word)

    def reducer_find_top_n(self, _, word_counts):
        """Emit the N pairs with the largest counts as ``(word, count)``."""
        limit = self.options.n
        for pair in heapq.nlargest(limit, word_counts):
            count, word = pair
            yield word, count

# Launch the job only when this code runs as the main module
if __name__ == '__main__':
    TopNWords.run()


# 4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests

def check_namenode_health(namenode_url):
    """Query the NameNode's JMX endpoint and print whether it is active.

    Args:
        namenode_url: Base URL of the NameNode web interface
            (for example ``http://localhost:9870``).

    Returns:
        None. The result, or a failure message, is printed.
    """
    url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

    # A timeout keeps an unreachable NameNode from hanging the check forever.
    # Connection errors are reported instead of escaping as a traceback.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch NameNode status: {exc}")
        return

    if response.status_code != 200:
        print("Failed to fetch NameNode status.")
        return

    # An empty "beans" list means the queried bean was not found, so there is
    # no state to report.
    beans = response.json().get('beans') or []
    if not beans:
        print("Failed to fetch NameNode status.")
        return

    state = beans[0].get('State')
    if state == 'active':
        print("NameNode is active and healthy.")
    else:
        print("NameNode is not active.")

def check_datanode_health(namenode_url):
    """Print how many DataNodes the NameNode reports as live and as dead.

    Args:
        namenode_url: Base URL of the NameNode web interface
            (for example ``http://localhost:9870``).

    Returns:
        None. The counts, or a failure message, are printed.
    """
    import json  # local import: needed only to decode the node maps below

    def _decode_nodes(raw):
        # The NameNodeInfo bean publishes LiveNodes/DeadNodes as JSON-encoded
        # strings. Decode them so len() counts nodes, not characters.
        if isinstance(raw, str):
            return json.loads(raw) if raw else {}
        return raw or {}

    # The cluster-wide LiveNodes/DeadNodes attributes are published by the
    # NameNode's NameNodeInfo bean; the DataNodeInfo bean is served by each
    # DataNode's own web interface, not by the NameNode.
    url = f"{namenode_url}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"

    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failed to fetch DataNode status: {exc}")
        return

    if response.status_code != 200:
        print("Failed to fetch DataNode status.")
        return

    beans = response.json().get('beans') or []
    if not beans:
        print("Failed to fetch DataNode status.")
        return

    live_nodes = _decode_nodes(beans[0].get('LiveNodes'))
    dead_nodes = _decode_nodes(beans[0].get('DeadNodes'))

    print(f"Number of live DataNodes: {len(live_nodes)}")
    print(f"Number of dead DataNodes: {len(dead_nodes)}")

    if not dead_nodes:
        print("All DataNodes are healthy.")
    else:
        print("Some DataNodes are dead.")

# Specify the URL of the NameNode's web interface
# (both health checks below build their JMX query URLs from this base address)
namenode_url = 'http://localhost:9870'

# Check the health status of the NameNode
check_namenode_health(namenode_url)

# Check the health status of the DataNodes
check_datanode_health(namenode_url)


# 5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
from hdfs import InsecureClient

def list_hdfs_path(hdfs_url, hdfs_path):
    """Print the type, path and (for files) size of each entry under an HDFS path.

    Args:
        hdfs_url: Base URL of the WebHDFS endpoint passed to ``InsecureClient``.
        hdfs_path: HDFS directory whose immediate children are listed.

    Returns:
        None. The listing, or a "does not exist" message, is printed.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # strict=False makes status() return None for a missing path instead of raising
    if client.status(hdfs_path, strict=False) is None:
        print(f"{hdfs_path} does not exist.")
        return

    # With status=True, list() yields (name, status) tuples
    entries = client.list(hdfs_path, status=True)

    # Display the files and directories
    print(f"Files and Directories in {hdfs_path}:")
    # Strip a trailing slash so joined paths never contain "//"
    base = hdfs_path.rstrip('/')
    for name, status in entries:
        item_path = f"{base}/{name}"
        item_type = status['type']
        if item_type == 'FILE':
            print(f"{item_type}: {item_path} ({status['length']} bytes)")
        else:
            # Directories have no meaningful size; omit it rather than
            # printing an empty "( bytes)" suffix.
            print(f"{item_type}: {item_path}")

# Specify the HDFS URL and the HDFS path
# NOTE(review): the NameNode health-check code in this file uses port 9870 for
# the NameNode web interface while this uses 50070 — confirm which port the
# target cluster exposes.
hdfs_url = 'http://localhost:50070'
hdfs_path = '/user/myuser/mydirectory'

# Call the function to list the files and directories
list_hdfs_path(hdfs_url, hdfs_path)


# 6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
from hdfs import InsecureClient

def analyze_storage_utilization(hdfs_url):
    """Print per-DataNode storage usage and the nodes with the most and least capacity.

    Nodes are listed in ascending order of remaining space. The highest and
    lowest capacity nodes are then chosen by their ``capacity`` value.

    Args:
        hdfs_url: Base URL of the WebHDFS endpoint passed to ``InsecureClient``.

    Returns:
        None. The report is printed.
    """
    # Create an HDFS client
    client = InsecureClient(hdfs_url)

    # Get the information of all DataNodes
    # NOTE(review): get_datanode_storage_report() does not appear in the
    # documented HdfsCLI client API — confirm this method exists on the client
    # in use. The code below assumes it returns an iterable of dicts with
    # 'name', 'used', 'remaining' and 'capacity' keys.
    datanodes_info = client.get_datanode_storage_report()

    # Listing order: least remaining space first
    sorted_datanodes = sorted(datanodes_info, key=lambda node: node['remaining'])

    # With no nodes there is nothing to rank; bail out before max()/min()
    if not sorted_datanodes:
        print("No DataNode storage information available.")
        return

    # Display the storage utilization for each DataNode
    print("Storage Utilization of DataNodes:")
    for datanode in sorted_datanodes:
        name = datanode['name']
        used = datanode['used']
        remaining = datanode['remaining']
        capacity = datanode['capacity']
        # A node reporting zero capacity would otherwise divide by zero
        utilization = (used / capacity) * 100 if capacity else 0.0

        print(f"Node: {name}")
        print(f"Storage Capacity: {capacity} bytes")
        print(f"Used: {used} bytes")
        print(f"Remaining: {remaining} bytes")
        print(f"Utilization: {utilization:.2f}%")
        print()

    # Rank by capacity itself: the listing above is ordered by remaining
    # space, so its first and last entries are not the capacity extremes.
    highest_capacity_node = max(sorted_datanodes, key=lambda node: node['capacity'])
    lowest_capacity_node = min(sorted_datanodes, key=lambda node: node['capacity'])

    print("Node with Highest Storage Capacity:")
    print(f"Node: {highest_capacity_node['name']}")
    print(f"Storage Capacity: {highest_capacity_node['capacity']} bytes")
    print()

    print("Node with Lowest Storage Capacity:")
    print(f"Node: {lowest_capacity_node['name']}")
    print(f"Storage Capacity: {lowest_capacity_node['capacity']} bytes")

# Specify the HDFS URL
# NOTE(review): the NameNode health-check code in this file uses port 9870 for
# the NameNode web interface while this uses 50070 — confirm which port the
# target cluster exposes.
hdfs_url = 'http://localhost:50070'

# Call the function to analyze storage utilization
analyze_storage_utilization(hdfs_url)


# 7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, jar_path, input_path, output_path):
    """Submit a job through the YARN ResourceManager REST API and monitor it.

    The function requests a new application ID, posts the application
    submission, polls the application every 5 seconds until it reaches a
    terminal state, and finally prints where the first attempt's output is
    expected.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service.
        jar_path: Path of the job JAR handed to ``hadoop jar``.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.

    Returns:
        None. Progress and failures are printed.
    """
    request_timeout = 30  # seconds; keeps an unreachable ResourceManager from hanging a call

    def _send(http_call, url, **kwargs):
        # Issue one HTTP call. Return None on connection errors or timeouts so
        # callers can treat them like any other failed request.
        try:
            return http_call(url, timeout=request_timeout, **kwargs)
        except requests.RequestException as exc:
            print(f"Request to {url} failed: {exc}")
            return None

    # Ask the ResourceManager for a new application ID
    submit_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = _send(requests.post, submit_url)

    if response is not None and response.status_code == 200:
        data = response.json()
        app_id = data['application-id']
        print(f"Job submitted successfully. Application ID: {app_id}")
    else:
        print("Failed to submit the Hadoop job.")
        return

    # Prepare the job configuration
    job_config = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                # Pass the real paths instead of a literal "<input_args>"
                # placeholder, which a shell would parse as redirections.
                # Assumes the JAR's main class takes the input and output
                # paths as positional arguments — TODO confirm.
                "command": f"hadoop jar {jar_path} {input_path} {output_path}"
            },
            # NOTE(review): the ResourceManager REST API describes each
            # local-resources value as an object (resource URI, type,
            # visibility, size, timestamp); plain path strings are kept from
            # the original and may be rejected — confirm.
            "local-resources": {
                "entry": [
                    {
                        "key": "input",
                        "value": input_path
                    },
                    {
                        "key": "output",
                        "value": output_path
                    }
                ]
            }
        }
    }

    # Submit the job configuration
    job_submit_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    response = _send(requests.post, job_submit_url, json=job_config)

    if response is not None and response.status_code == 202:
        print("Job configuration submitted successfully.")
    else:
        print("Failed to submit the job configuration.")
        return

    # Monitor job progress
    job_status_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"
    while True:
        response = _send(requests.get, job_status_url)

        if response is not None and response.status_code == 200:
            data = response.json()
            state = data['app']['state']
            final_status = data['app']['finalStatus']

            print(f"Job state: {state}")
            print(f"Final status: {final_status}")

            if state == 'FINISHED':
                # FINISHED only means the application ended; whether it
                # succeeded is carried by finalStatus.
                if final_status == 'SUCCEEDED':
                    print("Job completed successfully.")
                else:
                    print(f"Job finished with final status {final_status}.")
                break
            elif state in ('FAILED', 'KILLED'):
                print("Job failed or was killed.")
                break
        elif response is not None:
            # Report unexpected HTTP statuses so a persistent failure is
            # visible instead of looping silently.
            print(f"Failed to fetch job status (HTTP {response.status_code}); retrying.")

        time.sleep(5)

    # Retrieve the final output
    final_output_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}/appattempts"
    response = _send(requests.get, final_output_url)

    if response is not None and response.status_code == 200:
        data = response.json()
        attempts = (data.get('appAttempts') or {}).get('appAttempt') or []
        if not attempts:
            # No attempt was recorded, so there is no output location to report
            print("Failed to retrieve the final output.")
            return

        app_attempt_id = attempts[0]['appAttemptId']
        final_output = f"{output_path}/{app_attempt_id}/container_*/stdout"

        print("Final output:")
        print(final_output)
    else:
        print("Failed to retrieve the final output.")

# Specify the URL of the YARN ResourceManager
resource_manager_url = 'http://localhost:8088'

# Specify the path to the Hadoop job JAR file, input path, and output path
jar_path = '/path/to/hadoop-job.jar'
input_path = '/input/path'
output_path = '/output/path'

# Call the function to submit the Hadoop job, monitor its progress, and retrieve the final output
# (the call polls the job every 5 seconds and returns only after the job
# reports FINISHED, FAILED or KILLED, or after a submission step fails)
submit_hadoop_job(resource_manager_url, jar_path, input_path, output_path)


# 8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.

In [None]:
import requests
import time

def submit_hadoop_job(resource_manager_url, jar_path, input_path, output_path, num_containers, container_memory, container_vcores):
    """Submit a job with resource requirements and report container usage while it runs.

    The function requests a new application ID, posts the submission with the
    requested memory and vCores, then every 10 seconds prints the allocation
    of each container of the latest application attempt. Monitoring stops
    when the application reaches a terminal state, when no attempts or
    containers are reported, or when a request fails.

    Args:
        resource_manager_url: Base URL of the ResourceManager web service.
        jar_path: Path of the job JAR handed to ``hadoop jar``.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.
        num_containers: Sent as the ``instances`` field of the submission.
        container_memory: Requested memory, sent as ``resource.memory``.
        container_vcores: Requested virtual cores, sent as ``resource.vCores``.

    Returns:
        None. Progress and failures are printed.
    """
    request_timeout = 30  # seconds; keeps an unreachable ResourceManager from hanging a call

    def _send(http_call, url, **kwargs):
        # Issue one HTTP call. Return None on connection errors or timeouts so
        # callers can treat them like any other failed request.
        try:
            return http_call(url, timeout=request_timeout, **kwargs)
        except requests.RequestException as exc:
            print(f"Request to {url} failed: {exc}")
            return None

    # Ask the ResourceManager for a new application ID
    submit_url = f"{resource_manager_url}/ws/v1/cluster/apps/new-application"
    response = _send(requests.post, submit_url)

    if response is not None and response.status_code == 200:
        data = response.json()
        app_id = data['application-id']
        print(f"Job submitted successfully. Application ID: {app_id}")
    else:
        print("Failed to submit the Hadoop job.")
        return

    # Prepare the job configuration
    job_config = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                # Pass the real paths instead of a literal "<input_args>"
                # placeholder, which a shell would parse as redirections.
                # Assumes the JAR's main class takes the input and output
                # paths as positional arguments — TODO confirm.
                "command": f"hadoop jar {jar_path} {input_path} {output_path}"
            },
            # NOTE(review): the ResourceManager REST API describes each
            # local-resources value as an object (resource URI, type,
            # visibility, size, timestamp); plain path strings are kept from
            # the original and may be rejected — confirm.
            "local-resources": {
                "entry": [
                    {
                        "key": "input",
                        "value": input_path
                    },
                    {
                        "key": "output",
                        "value": output_path
                    }
                ]
            }
        },
        "resource": {
            "memory": container_memory,
            "vCores": container_vcores
        },
        # NOTE(review): "instances" is not among the documented submission
        # fields of the ResourceManager REST API — confirm it is honored.
        "instances": num_containers
    }

    # Submit the job configuration
    job_submit_url = f"{resource_manager_url}/ws/v1/cluster/apps"
    response = _send(requests.post, job_submit_url, json=job_config)

    if response is not None and response.status_code == 202:
        print("Job configuration submitted successfully.")
    else:
        print("Failed to submit the job configuration.")
        return

    # Monitor resource usage during job execution
    app_url = f"{resource_manager_url}/ws/v1/cluster/apps/{app_id}"
    attempts_url = f"{app_url}/appattempts"
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')

    while True:
        response = _send(requests.get, attempts_url)
        if response is None or response.status_code != 200:
            print("Failed to fetch job attempts.")
            break

        # Tolerate a missing or null "appAttempts" wrapper as well as an empty list
        app_attempts = (response.json().get('appAttempts') or {}).get('appAttempt') or []
        if not app_attempts:
            print("No application attempts found.")
            break

        latest_attempt_id = app_attempts[-1]['appAttemptId']
        container_report_url = f"{attempts_url}/{latest_attempt_id}/containers"
        response = _send(requests.get, container_report_url)
        if response is None or response.status_code != 200:
            print("Failed to fetch container report.")
            break

        containers = (response.json().get('containers') or {}).get('container') or []
        if not containers:
            print("No containers found.")
            break

        print("Resource Usage:")
        for container in containers:
            # NOTE(review): the ResourceManager container report presumably
            # names these fields "containerId"/"containerState"; the original
            # "id"/"state" keys are kept as fallbacks — confirm.
            container_id = container.get('containerId', container.get('id'))
            container_state = container.get('containerState', container.get('state'))
            # Distinct local names so the requested-resource parameters are
            # not overwritten by the reported allocations.
            allocated_mb = container['allocatedMB']
            allocated_vcores = container['allocatedVCores']

            print(f"Container ID: {container_id}")
            print(f"State: {container_state}")
            print(f"Allocated Memory: {allocated_mb} MB")
            print(f"Allocated vCores: {allocated_vcores}")
            print()

        # Stop once the application itself is done. Without this check the
        # loop ends only when the container report comes back empty or a
        # request fails.
        response = _send(requests.get, app_url)
        if response is not None and response.status_code == 200:
            state = response.json()['app']['state']
            if state in terminal_states:
                print(f"Job reached terminal state: {state}")
                break

        time.sleep(10)

# Specify the URL of the YARN ResourceManager
resource_manager_url = 'http://localhost:8088'

# Specify the path to the Hadoop job JAR file, input path, and output path
jar_path = '/path/to/hadoop-job.jar'
input_path = '/input/path'
output_path = '/output/path'

# Specify the resource requirements
# (submit_hadoop_job sends num_containers as the "instances" field and the
# memory/vCores pair as the "resource" field of the submission payload)
num_containers = 2
container_memory = 1024  # in MB
container_vcores = 1

# Call the function to submit the Hadoop job, set resource requirements, and track resource usage
submit_hadoop_job(resource_manager_url, jar_path, input_path, output_path, num_containers, container_memory, container_vcores)


# 9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import time

class PerformanceComparison(MRJob):
    """Word-count job whose input split size can be set with ``--split-size``.

    The split size (in MB) is forwarded to Hadoop through the job
    configuration so that runs with different values can be compared.
    """

    def configure_args(self):
        """Register the ``--split-size`` option (input split size in MB, default 64)."""
        super().configure_args()
        self.add_passthru_arg('--split-size', type=int, default=64, help='Input split size in MB')

    def jobconf(self):
        """Return the job configuration with the requested split size applied.

        Without this override ``--split-size`` is parsed but never reaches
        Hadoop, so every run would use the cluster's default split size.
        """
        conf = dict(super().jobconf())
        split_bytes = str(self.options.split_size * 1024 * 1024)
        # setdefault keeps any split size already present in the class-level
        # job configuration.
        conf.setdefault('mapreduce.input.fileinputformat.split.maxsize', split_bytes)
        conf.setdefault('mapreduce.input.fileinputformat.split.minsize', split_bytes)
        return conf

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for each whitespace-separated word in the line."""
        for word in line.split():
            yield word, 1

    def combiner(self, word, counts):
        """Sum the counts of one word before they reach the reducer."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Emit the total count for each word."""
        yield word, sum(counts)

    def steps(self):
        """Describe the single mapper/combiner/reducer step."""
        return [
            MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer)
        ]

if __name__ == '__main__':
    # Specify the path to the input file
    input_file = 'input.txt'

    # Specify the split size in MB (change this to different split sizes for comparison)
    split_size = 64

    # Run the MapReduce job
    mr_job = PerformanceComparison(args=[input_file, '--split-size', str(split_size)])
    with mr_job.make_runner() as runner:
        # Time only the job run. Including job construction and the printing
        # of results would add time unrelated to the split size.
        # perf_counter is monotonic, unlike wall-clock time.time().
        start_time = time.perf_counter()
        runner.run()
        end_time = time.perf_counter()

        # Print the word count results
        for word, count in mr_job.parse_output(runner.cat_output()):
            print(f"{word}: {count}")

    # Calculate and print the execution time
    execution_time = end_time - start_time
    print(f"\nExecution time: {execution_time:.2f} seconds")
