1. Write a Python program to read a Hadoop configuration file and display the core components of Hadoop.

In [None]:
import configparser

def display_hadoop_components(config_file_path):
    """Print the core Hadoop components configured in a configuration file.

    Hadoop ``*-site.xml`` files are XML documents made of
    ``<property><name>..</name><value>..</value></property>`` entries, so the
    file is parsed as XML first. If it is not well-formed XML (or cannot be
    opened), the function falls back to the INI layout with ``[core-site]``,
    ``[yarn-site]`` and ``[mapred-site]`` sections.

    Args:
        config_file_path: Path to the configuration file.

    Returns:
        None. One line is printed per recognised property that is present;
        a missing file prints nothing.
    """
    import xml.etree.ElementTree as ET

    # (INI section, property name, label printed in front of the value)
    components = (
        ('core-site', 'fs.defaultFS',
         "Hadoop Distributed File System (HDFS) configured at:"),
        ('yarn-site', 'yarn.resourcemanager.address',
         "Resource Manager configured at:"),
        ('mapred-site', 'mapreduce.framework.name',
         "MapReduce Framework configured:"),
    )

    try:
        root = ET.parse(config_file_path).getroot()
    except (ET.ParseError, OSError):
        # Not XML (or unreadable): try the INI layout below instead.
        root = None

    if root is not None:
        properties = {}
        for prop in root.iter('property'):
            name = prop.findtext('name')
            value = prop.findtext('value')
            if name is not None and value is not None:
                properties[name.strip()] = value.strip()

        for _section, key, label in components:
            if key in properties:
                print(label, properties[key])
        return

    config = configparser.ConfigParser()
    config.read(config_file_path)  # silently ignores files that do not exist

    for section, key, label in components:
        if section in config and key in config[section]:
            print(label, config[section][key])

# Example usage
# The path below is a placeholder; point it at a real configuration file
# before running this cell.
config_file = '/path/to/hadoop-config-file.xml'
display_hadoop_components(config_file)


2. Implement a Python function that calculates the total file size in a Hadoop Distributed File System (HDFS) directory.

In [None]:
import pyarrow.hdfs as hdfs

def calculate_total_file_size(hdfs_host, hdfs_port, directory):
    """Return the summed size of the entries directly inside an HDFS directory.

    Only the entries returned by a single ``ls`` call on ``directory`` are
    summed; sub-directories are not descended into.

    Args:
        hdfs_host: Host passed to ``hdfs.connect``.
        hdfs_port: Port passed to ``hdfs.connect``.
        directory: HDFS path whose entries are listed.

    Returns:
        The sum of the ``'size'`` field over all listed entries (0 if the
        listing is empty).
    """
    fs = hdfs.connect(host=hdfs_host, port=hdfs_port)
    try:
        # detail=True yields one dict per entry, including its 'size'
        entries = fs.ls(directory, detail=True)
        return sum(entry['size'] for entry in entries)
    finally:
        # Release the connection even when the listing raises.
        fs.close()

# Driver usage
# The host and directory are placeholders; replace them with real values.
hdfs_host = 'your_hdfs_host'
# The original used the undefined name `your_hdfs_port`, which raises
# NameError. 8020 is the usual NameNode RPC port; adjust for your cluster.
hdfs_port = 8020
directory_path = '/your/hdfs/directory'

total_size = calculate_total_file_size(hdfs_host, hdfs_port, directory_path)
print("Total file size:", total_size)


3. Create a Python program that extracts and displays the top N most frequent words from a large text file using the MapReduce approach.

In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

WORD_RE = re.compile(r"\b\w+\b")  # Regular expression to match words


class TopNWords(MRJob):
    """Two-step MapReduce job that emits the N most frequent words.

    Step 1 counts lower-cased words; step 2 funnels every (count, word) pair
    to a single reducer key and keeps the N largest. N comes from the
    ``--top-n`` command-line option and defaults to 10.
    """

    def configure_args(self):
        """Register the ``--top-n`` option on top of mrjob's standard ones."""
        super(TopNWords, self).configure_args()
        self.add_passthru_arg('--top-n', type=int, default=10,
                              help='Number of most frequent words to output')

    def mapper_get_words(self, _, line):
        """Yield ``(word, 1)`` for every regex word match in ``line``."""
        for word in WORD_RE.findall(line):
            yield word.lower(), 1

    def combiner_count_words(self, word, counts):
        """Pre-aggregate counts for ``word`` on the mapper side."""
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        """Emit ``(count, word)`` under one shared key (``None``)."""
        # A single key routes every pair to the same reducer in step 2.
        yield None, (sum(counts), word)

    def reducer_find_top_n(self, _, word_count_pairs):
        """Yield ``(word, count)`` for the N largest ``(count, word)`` pairs."""
        import heapq

        # nlargest keeps only N items in memory instead of sorting the whole
        # stream; the result order matches sorted(..., reverse=True)[:N].
        top_n = heapq.nlargest(self.options.top_n, word_count_pairs)
        for count, word in top_n:
            yield word, count

    def steps(self):
        """Return the two steps: word counting, then top-N selection."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_top_n)
        ]


if __name__ == '__main__':
    TopNWords.run()


# To run the job above, save it as top_most_frequent_elements.py and execute the following command in a terminal:
#   python top_most_frequent_elements.py -r hadoop --hadoop-streaming-jar /path/to/hadoop-streaming.jar input_file.txt


4. Write a Python script that checks the health status of the NameNode and DataNodes in a Hadoop cluster using Hadoop's REST API.

In [None]:
import requests

def check_namenode_health(namenode_host, namenode_port):
    """Print the live and dead DataNodes reported by the NameNode over JMX.

    Args:
        namenode_host: Host name of the NameNode web endpoint.
        namenode_port: HTTP port of the NameNode web endpoint.

    Returns:
        None. The node lists are printed, or "Error accessing NameNode status"
        when the request fails, returns a non-200 status, or the response
        lacks the expected attributes.
    """
    # LiveNodes/DeadNodes are attributes of the NameNodeInfo bean; the
    # NameNodeStatus bean queried before does not carry them.
    url = (f"http://{namenode_host}:{namenode_port}"
           "/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo")
    try:
        # A timeout stops an unreachable NameNode from hanging the script.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        print("Error accessing NameNode status")
        return

    if response.status_code != 200:
        print("Error accessing NameNode status")
        return

    try:
        beans = response.json().get("beans", [])
    except ValueError:
        beans = []

    # An unknown bean name yields an empty "beans" list rather than an error.
    if not beans or "LiveNodes" not in beans[0] or "DeadNodes" not in beans[0]:
        print("Error accessing NameNode status")
        return

    print("NameNode Health Status:")
    print("Live Nodes:", beans[0]["LiveNodes"])
    print("Dead Nodes:", beans[0]["DeadNodes"])

def check_datanode_health(namenode_host, namenode_port):
    """Print whether the NameNode reports any dead DataNodes.

    The previous check searched the ``dfshealth.html`` page for a fixed
    phrase, which says nothing about whether nodes are actually down. This
    version reads the DataNode counters from the NameNode's JMX endpoint.

    Args:
        namenode_host: Host name of the NameNode web endpoint.
        namenode_port: HTTP port of the NameNode web endpoint.

    Returns:
        None. A status line is printed, or "Error accessing DataNode status"
        when the counters cannot be obtained.
    """
    url = (f"http://{namenode_host}:{namenode_port}"
           "/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState")
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        print("Error accessing DataNode status")
        return

    if response.status_code != 200:
        print("Error accessing DataNode status")
        return

    try:
        beans = response.json().get("beans", [])
    except ValueError:
        beans = []

    dead_count = beans[0].get("NumDeadDataNodes") if beans else None
    if dead_count is None:
        print("Error accessing DataNode status")
        return

    if dead_count == 0:
        print("DataNode Health Status: All DataNodes are available")
    else:
        print("DataNode Health Status: Some DataNodes are not available")

# Example usage
# The host is a placeholder; replace it with your NameNode's host name.
nn_host = 'your_namenode_host'
# The original used the undefined name `your_namenode_port`, which raises
# NameError. 9870 is the default NameNode HTTP port on Hadoop 3.x (50070 on
# Hadoop 2.x); adjust for your cluster.
nn_port = 9870

check_namenode_health(nn_host, nn_port)
check_datanode_health(nn_host, nn_port)


5. Develop a Python program that lists all the files and directories in a specific HDFS path.

In [None]:
import pyarrow.hdfs as hdfs

def list_hdfs_files(hdfs_host, hdfs_port, hdfs_path):
    """Print every entry returned by listing an HDFS path.

    Args:
        hdfs_host: Host passed to ``hdfs.connect``.
        hdfs_port: Port passed to ``hdfs.connect``.
        hdfs_path: HDFS path to list.

    Returns:
        None. Each listed entry is printed on its own line.
    """
    fs = hdfs.connect(host=hdfs_host, port=hdfs_port)
    try:
        for entry in fs.ls(hdfs_path):
            print(entry)
    finally:
        # Release the connection even when the listing raises.
        fs.close()

# Example usage
# The host and directory are placeholders; replace them with real values.
hdfs_host = 'your_hdfs_host'
# The original used the undefined name `your_hdfs_port`, which raises
# NameError. 8020 is the usual NameNode RPC port; adjust for your cluster.
hdfs_port = 8020
hdfs_directory = '/your/hdfs/directory'

list_hdfs_files(hdfs_host, hdfs_port, hdfs_directory)


6. Implement a Python program that analyzes the storage utilization of DataNodes in a Hadoop cluster and identifies the nodes with the highest and lowest storage capacities.

In [None]:
import requests

def analyze_storage_utilization(namenode_host, namenode_port):
    """Print the DataNodes with the highest and lowest used storage.

    The previous version split the ``dfshealth.html`` markup on ``</tr>`` and
    ``</td>``. The fragment after the last ``</tr>`` has no cells, so indexing
    it always raised IndexError. This version reads per-node figures from the
    ``LiveNodes`` attribute of the NameNode's JMX ``NameNodeInfo`` bean.

    Args:
        namenode_host: Host name of the NameNode web endpoint.
        namenode_port: HTTP port of the NameNode web endpoint.

    Returns:
        None. Results (or an error message) are printed.
    """
    import json

    url = (f"http://{namenode_host}:{namenode_port}"
           "/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo")
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        print("Error accessing HDFS health page")
        return

    if response.status_code != 200:
        print("Error accessing HDFS health page")
        return

    try:
        beans = response.json().get("beans", [])
        # LiveNodes is itself a JSON document encoded as a string.
        # NOTE(review): the per-node keys "usedSpace" and "capacity" are
        # assumed from the NameNode JMX output -- confirm on your version.
        live_nodes = json.loads(beans[0].get("LiveNodes", "{}")) if beans else {}
    except ValueError:
        live_nodes = {}

    storage_data = []
    for node_name, info in live_nodes.items():
        used_storage = int(info.get("usedSpace", 0))
        capacity = int(info.get("capacity", 0))
        # Guard against a node that reports zero capacity.
        percent_used = 100.0 * used_storage / capacity if capacity else 0.0
        storage_data.append((node_name, used_storage, percent_used))

    if not storage_data:
        print("No DataNode storage information available")
        return

    # Sort by used storage (bytes) in descending order.
    sorted_data = sorted(storage_data, key=lambda entry: entry[1], reverse=True)
    highest, lowest = sorted_data[0], sorted_data[-1]

    print("Node with Highest Storage Capacity:")
    print("Node Name:", highest[0])
    print("Used Storage:", highest[1])
    print("Percentage Used:", highest[2])

    print("\nNode with Lowest Storage Capacity:")
    print("Node Name:", lowest[0])
    print("Used Storage:", lowest[1])
    print("Percentage Used:", lowest[2])


# Example usage
# The host is a placeholder; replace it with your NameNode's host name.
nn_host = 'your_namenode_host'
# The original used the undefined name `your_namenode_port`, which raises
# NameError. 9870 is the default NameNode HTTP port on Hadoop 3.x (50070 on
# Hadoop 2.x); adjust for your cluster.
nn_port = 9870

analyze_storage_utilization(nn_host, nn_port)


7. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, monitor its progress, and retrieve the final output.

In [None]:
import requests
import time

# YARN ResourceManager API endpoint
# Base URL that the functions in this cell extend with paths such as
# /apps/new-application. The <resourcemanager-host> and
# <resourcemanager-port> parts are literal placeholders and must be replaced
# before any request can succeed.
rm_api = 'http://<resourcemanager-host>:<resourcemanager-port>/ws/v1/cluster'

# Function to submit a Hadoop job
def submit_job(jar_file, main_class, input_path, output_path):
    """Request a new application ID from the ResourceManager and submit a job.

    Args:
        jar_file: Path of the job jar used in the launch command.
        main_class: Fully qualified main class used in the launch command.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.

    Returns:
        The application ID string when the submission is accepted (HTTP 202),
        otherwise ``None``.
    """
    try:
        response = requests.post(f'{rm_api}/apps/new-application', timeout=10)
    except requests.RequestException:
        print("Failed to create a new application")
        return None

    if response.status_code != 200:
        print("Failed to create a new application")
        return None

    app_id = response.json()['application-id']

    # Applications are submitted by POSTing to the "apps" collection; the
    # application ID travels in the request body, not in the URL.
    submit_url = f'{rm_api}/apps'
    headers = {'Content-Type': 'application/json'}
    payload = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": f"yarn jar {jar_file} {main_class} {input_path} {output_path}"
            }
        },
        "application-type": "MAPREDUCE"
    }

    try:
        response = requests.post(submit_url, json=payload, headers=headers, timeout=10)
    except requests.RequestException:
        print("Failed to submit the job")
        return None

    if response.status_code != 202:
        print("Failed to submit the job")
        return None

    print(f"Job submitted with application ID: {app_id}")
    return app_id

# Function to monitor job progress
def monitor_job(app_id):
    """Poll the ResourceManager every 5 seconds until the application ends.

    Args:
        app_id: YARN application ID to watch.

    Returns:
        The final state (``'FINISHED'``, ``'FAILED'`` or ``'KILLED'``), or
        ``None`` if any poll fails. State and progress are printed per poll.
    """
    url = f'{rm_api}/apps/{app_id}'
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')

    while True:
        # Every poll is checked, not just the first one, so a transient
        # failure no longer crashes on a missing 'app' key.
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            print("Failed to fetch job information")
            return None

        if response.status_code != 200:
            print("Failed to fetch job information")
            return None

        app = response.json()['app']
        state = app['state']
        progress = app['progress']

        print(f"Job state: {state}")
        print(f"Job progress: {progress}%")

        if state in terminal_states:
            break
        time.sleep(5)

    print("Job completed!")
    return state

# Function to retrieve job output
def retrieve_output(app_id, output_path):
    """Print the file-system byte counters of the application's MapReduce job.

    The previous version read ``containerId`` from the ``/apps/{id}/state``
    response, which only carries ``state``. It also built the proxy URL
    underneath ``/ws/v1/cluster`` and requested a counters path nested under
    job attempts. This version goes through the ResourceManager web proxy by
    application ID and reads ``/jobs/{job_id}/counters``.

    NOTE(review): the MapReduce ApplicationMaster REST API is normally only
    reachable while the application is running; for finished jobs the
    JobHistory server exposes the counters instead -- confirm which one your
    workflow needs.

    Args:
        app_id: YARN application ID of the job.
        output_path: Unused; kept so existing callers keep working.

    Returns:
        None. Counter values or an error message are printed.
    """
    def _get_json(url):
        """Return the decoded JSON body of a 200 response, else ``None``."""
        try:
            response = requests.get(url, timeout=10)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        try:
            return response.json()
        except ValueError:
            return None

    # The web proxy is served from the ResourceManager's web root, so strip
    # the REST prefix from rm_api.
    rm_root = rm_api.rsplit('/ws/v1/cluster', 1)[0]
    jobs_url = f'{rm_root}/proxy/{app_id}/ws/v1/mapreduce/jobs'

    data = _get_json(jobs_url)
    jobs = ((data or {}).get('jobs') or {}).get('job') or []
    if not jobs:
        print("Failed to retrieve job information")
        return
    job_id = jobs[0]['id']

    data = _get_json(f'{jobs_url}/{job_id}/counters')
    if data is None or 'jobCounters' not in data:
        print("Failed to retrieve job counters")
        return

    labels = {
        'FILE_BYTES_READ': "Total bytes read",
        'FILE_BYTES_WRITTEN': "Total bytes written",
    }
    for group in data['jobCounters'].get('counterGroup', []):
        if group['counterGroupName'] != 'org.apache.hadoop.mapreduce.FileSystemCounter':
            continue
        for item in group['counter']:
            if item['name'] in labels:
                print(f"{labels[item['name']]}: {item['totalCounterValue']}")

# Example usage
# Placeholder job settings: the jar, main class and input/output paths are
# interpolated by submit_job() into the launch command it sends.
jar_file = '/path/to/hadoop-job.jar'
main_class = 'com.example.hadoop.JobMain'
input_path = '/input/data'
output_path = '/output/result'

app_id = submit_job(jar_file, main_class, input_path, output_path)
# submit_job() returns None on failure, so only continue when an ID came back.
if app_id:
    monitor_job(app_id)
    retrieve_output(app_id, output_path)


8. Create a Python script that interacts with YARN's ResourceManager API to submit a Hadoop job, set resource requirements, and track resource usage during job execution.

In [None]:
import requests
import time

# YARN ResourceManager API endpoint
# Base URL that the functions in this cell extend with paths such as
# /apps/new-application. The <resourcemanager-host> and
# <resourcemanager-port> parts are literal placeholders and must be replaced
# before any request can succeed.
rm_api = 'http://<resourcemanager-host>:<resourcemanager-port>/ws/v1/cluster'

# Function to submit a Hadoop job with resource requirements
def submit_job_with_resources(jar_file, main_class, input_path, output_path, num_containers, container_memory, container_vcores):
    """Submit a job to YARN with an explicit memory/vCore request.

    Args:
        jar_file: Path of the job jar used in the launch command.
        main_class: Fully qualified main class used in the launch command.
        input_path: Input path passed to the job.
        output_path: Output path passed to the job.
        num_containers: Accepted for interface compatibility but not sent.
            NOTE(review): the ResourceManager submit body documents no
            container-count field (the old "instances" key was not part of
            it) -- confirm against your Hadoop version.
        container_memory: Memory request in MB for the ``resource`` object.
        container_vcores: Virtual-core request for the ``resource`` object.

    Returns:
        The application ID string when the submission is accepted (HTTP 202),
        otherwise ``None``.
    """
    try:
        response = requests.post(f'{rm_api}/apps/new-application', timeout=10)
    except requests.RequestException:
        print("Failed to create a new application")
        return None

    if response.status_code != 200:
        print("Failed to create a new application")
        return None

    app_id = response.json()['application-id']

    # Applications are submitted by POSTing to the "apps" collection; the
    # application ID travels in the request body, not in the URL.
    submit_url = f'{rm_api}/apps'
    headers = {'Content-Type': 'application/json'}
    payload = {
        "application-id": app_id,
        "application-name": "Hadoop Job",
        "am-container-spec": {
            "commands": {
                "command": f"yarn jar {jar_file} {main_class} {input_path} {output_path}"
            }
        },
        # "resource" is a top-level field of the submission body, not part
        # of "am-container-spec".
        "resource": {
            "memory": container_memory,
            "vCores": container_vcores
        },
        "application-type": "MAPREDUCE"
    }

    try:
        response = requests.post(submit_url, json=payload, headers=headers, timeout=10)
    except requests.RequestException:
        print("Failed to submit the job")
        return None

    if response.status_code != 202:
        print("Failed to submit the job")
        return None

    print(f"Job submitted with application ID: {app_id}")
    return app_id

# Function to monitor job resource usage
def monitor_resource_usage(app_id):
    """Poll an application every 5 seconds and print its resource usage.

    The previous version overwrote ``url`` inside the loop, so the re-poll
    after the sleep fetched the wrong endpoint and failed on the missing
    ``'app'`` key. It also queried ``/nodes/{container_id}/containers``,
    which the ResourceManager does not serve. This version reads the
    aggregate resource fields from the application report on each poll.

    Args:
        app_id: YARN application ID to watch.

    Returns:
        The final state (``'FINISHED'``, ``'FAILED'`` or ``'KILLED'``), or
        ``None`` if any poll fails.
    """
    app_url = f'{rm_api}/apps/{app_id}'
    terminal_states = ('FINISHED', 'FAILED', 'KILLED')

    while True:
        try:
            response = requests.get(app_url, timeout=10)
        except requests.RequestException:
            print("Failed to fetch job information")
            return None

        if response.status_code != 200:
            print("Failed to fetch job information")
            return None

        app = response.json()['app']
        state = app['state']

        print(f"Job state: {state}")
        print(f"Job progress: {app['progress']}%")

        if state in terminal_states:
            break

        # .get() keeps the loop alive if a ResourceManager version omits
        # one of these fields.
        print(f"Running Containers: {app.get('runningContainers')}")
        print(f"Allocated Memory: {app.get('allocatedMB')} MB")
        print(f"Allocated vCores: {app.get('allocatedVCores')}")
        print(f"Memory Seconds: {app.get('memorySeconds')}")
        print(f"vCore Seconds: {app.get('vcoreSeconds')}")
        print("-----------------------")

        time.sleep(5)

    print("Job completed!")
    return state

# Example usage
# Placeholder job settings passed to submit_job_with_resources().
jar_file = '/path/to/hadoop-job.jar'
main_class = 'com.example.hadoop.JobMain'
input_path = '/input/data'
output_path = '/output/result'
# Resource request values forwarded to the submission call.
num_containers = 5
container_memory = 1024
container_vcores = 2

app_id = submit_job_with_resources(jar_file, main_class, input_path, output_path, num_containers, container_memory, container_vcores)
# The submit function returns None on failure, so only monitor when an ID came back.
if app_id:
    monitor_resource_usage(app_id)


9. Write a Python program that compares the performance of a MapReduce job with different input split sizes, showcasing the impact on overall job execution time.

In [None]:
from mrjob.job import MRJob
import time

# MapReduce job to calculate word count
class WordCountJob(MRJob):
    """Word-count job whose input split size is set with ``--split-size``.

    The single step is built by mrjob from the ``mapper``, ``combiner`` and
    ``reducer`` methods. The previous explicit ``steps()`` called ``self.mr``,
    which does not exist in the mrjob releases that provide
    ``add_passthru_arg``, so it has been dropped.
    """

    def configure_args(self):
        """Register the ``--split-size`` option (megabytes, default 64)."""
        super(WordCountJob, self).configure_args()
        self.add_passthru_arg('--split-size', type=int, default=64,
                              help='Input split size in megabytes')

    def jobconf(self):
        """Return the job configuration with the requested split size applied.

        The option was previously parsed but never used. Setting the minimum
        and maximum split size to the same byte value asks Hadoop for splits
        of that size.
        NOTE(review): these properties only take effect on a Hadoop runner;
        local/inline runs presumably ignore them -- confirm.
        """
        conf = dict(super(WordCountJob, self).jobconf())
        split_bytes = str(self.options.split_size * 1024 * 1024)
        conf['mapreduce.input.fileinputformat.split.minsize'] = split_bytes
        conf['mapreduce.input.fileinputformat.split.maxsize'] = split_bytes
        return conf

    def mapper(self, _, line):
        """Yield ``(word, 1)`` for each whitespace-separated token."""
        words = line.strip().split()
        for word in words:
            yield word.lower(), 1

    def combiner(self, word, counts):
        """Pre-aggregate counts for ``word`` on the mapper side."""
        yield word, sum(counts)

    def reducer(self, word, counts):
        """Yield the total count for ``word``."""
        yield word, sum(counts)


# Function to run the WordCountJob with different input split sizes
def compare_input_split_sizes(input_file, split_sizes):
    """Run ``WordCountJob`` once per split size and report the run times.

    Args:
        input_file: Path of the input file handed to every job run.
        split_sizes: Iterable of split sizes in megabytes.

    Returns:
        A dict mapping each split size to its elapsed time in seconds. The
        same figures are printed as the runs complete.
    """
    print("Comparing input split sizes:")
    print("-----------------------------")

    timings = {}
    for split_size in split_sizes:
        job = WordCountJob(args=[input_file, f'--split-size={split_size}'])

        # perf_counter is monotonic, so system clock adjustments cannot
        # distort the measured duration.
        start_time = time.perf_counter()
        with job.make_runner() as runner:
            runner.run()
            # Stop the clock before the runner's cleanup on context exit.
            elapsed_time = time.perf_counter() - start_time

        timings[split_size] = elapsed_time

        print(f"Split Size: {split_size} MB")
        print(f"Job Execution Time: {elapsed_time:.2f} seconds")
        print("-----------------------------")

    return timings


# Example usage
# The input path is a placeholder; point it at a real text file before running.
input_file = '/path/to/input.txt'
split_sizes = [64, 128, 256]  # Specify different input split sizes in megabytes

# Runs the word-count job once per entry in split_sizes and prints each duration.
compare_input_split_sizes(input_file, split_sizes)
