In [None]:
from configparser import ConfigParser

def read_hadoop_config(file_path):
    """Print the option names found in the ``core`` section of a config file.

    The file is parsed as INI-style text with ``ConfigParser``.

    Args:
        file_path: Path of the configuration file to read.

    Returns:
        None. All results are written to standard output.
    """
    config = ConfigParser()

    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed, so an empty result means the path was
    # missing or unreadable. Report that instead of the misleading
    # "No core section" message.
    if not config.read(file_path):
        print(f"Could not read configuration file: {file_path}")
        return

    if not config.has_section('core'):
        print("No core section found in the configuration file.")
        return

    print("Core Components of Hadoop:")
    for component in config.options('core'):
        print(component)

# Placeholder path: replace it with the location of the configuration file to inspect.
config_file_path = 'path/to/your/hadoop-config-file.conf'

# Print the option names listed under the file's 'core' section.
read_hadoop_config(config_file_path)


In [None]:
import pyarrow.hdfs as hdfs

def calculate_directory_size(hdfs_host, hdfs_port, hdfs_directory):
    """Print the combined size, in bytes, of all files below an HDFS directory.

    Args:
        hdfs_host: Host name passed to ``hdfs.connect``.
        hdfs_port: Port passed to ``hdfs.connect``.
        hdfs_directory: Path of the directory to measure.

    Returns:
        None. The total (or a "not a directory" notice) is printed.
    """
    # Create a connection to the HDFS
    hdfs_client = hdfs.connect(host=hdfs_host, port=hdfs_port)

    def child_path(parent, entry):
        """Build the path of a listed entry without doubling separators."""
        # NOTE(review): confirm whether the client's ls() yields bare names or
        # full paths; both shapes are handled here. An entry that is already
        # absolute is used as-is instead of being appended to the parent.
        if entry.startswith('/'):
            return entry
        # rstrip avoids '/dir//name' when the parent ends with a slash.
        return parent.rstrip('/') + '/' + entry

    def calculate_size(path, status=None):
        """Return the summed size of ``path``; ``status`` skips a repeat lookup."""
        if status is None:
            status = hdfs_client.get_path_info(path)

        kind = status['kind']
        if kind == 'file':
            return status['size']
        if kind != 'directory':
            # Anything that is neither a file nor a directory contributes nothing.
            return 0

        return sum(
            calculate_size(child_path(path, entry))
            for entry in hdfs_client.ls(path)
        )

    # Look the root up once and hand the result to the recursion so the same
    # path is not queried twice.
    dir_status = hdfs_client.get_path_info(hdfs_directory)
    if dir_status['kind'] != 'directory':
        print(f"{hdfs_directory} is not a directory.")
        return

    total_size = calculate_size(hdfs_directory, dir_status)

    # Print the total size
    print(f"Total size of {hdfs_directory}: {total_size} bytes")

# Connection settings and target directory; all three are placeholders to adapt to your cluster.
hdfs_host = 'localhost'
hdfs_port = 9000
hdfs_directory = '/path/to/hdfs/directory'

# Walk the directory recursively and print its total size in bytes.
calculate_directory_size(hdfs_host, hdfs_port, hdfs_directory)


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
from heapq import nlargest

# Matches runs of word characters and apostrophes; the mapper uses it to split each line into words.
WORD_RE = re.compile(r"[\w']+")

class MRWordFrequencyCount(MRJob):
    """Two-step job that counts words and emits the ``top_n`` most frequent ones."""

    def configure_args(self):
        """Add the ``-n/--top_n`` passthrough option to the inherited arguments."""
        super().configure_args()
        self.add_passthru_arg(
            '-n', '--top_n',
            type=int,
            default=10,
            help='Specify the number of most frequent words to display',
        )

    def steps(self):
        """Return the counting step followed by the ranking step."""
        counting = MRStep(
            mapper=self.mapper_get_words,
            combiner=self.combiner_count_words,
            reducer=self.reducer_count_words,
        )
        ranking = MRStep(reducer=self.reducer_find_top_words)
        return [counting, ranking]

    def mapper_get_words(self, _, line):
        """Emit ``(lower-cased word, 1)`` for every WORD_RE match in the line."""
        for match in WORD_RE.finditer(line):
            yield match.group(0).lower(), 1

    def combiner_count_words(self, word, counts):
        """Pre-sum the counts seen so far for a word."""
        partial_total = sum(counts)
        yield word, partial_total

    def reducer_count_words(self, word, counts):
        """Emit ``(total, word)`` under the single key ``None`` so the next step sees all pairs."""
        total = sum(counts)
        yield None, (total, word)

    def reducer_find_top_words(self, _, word_count_pairs):
        """Emit ``(word, count)`` for the ``top_n`` largest ``(count, word)`` pairs."""
        limit = self.options.top_n
        for count, word in nlargest(limit, word_count_pairs):
            yield word, count

# Launch the job when this code runs as the main module.
# NOTE(review): inside a notebook kernel __name__ is also '__main__', so this
# runs on cell execution — confirm that is intended.
if __name__ == '__main__':
    MRWordFrequencyCount.run()


In [None]:
import requests

def check_namenode_health(nn_host, nn_port, timeout=10):
    """Print the live and dead node information reported by the NameNode's JMX endpoint.

    Args:
        nn_host: NameNode host name.
        nn_port: NameNode HTTP port.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. All results and errors are printed.
    """
    url = f"http://{nn_host}:{nn_port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo"

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error accessing NameNode: {response.status_code}")
            return
        json_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error accessing NameNode: {e}")
        return

    beans = json_data.get('beans')
    if not beans or 'LiveNodes' not in beans[0] or 'DeadNodes' not in beans[0]:
        print("No data available for NameNode health check.")
        return

    info = beans[0]
    print("NameNode Health Status:")
    print(f"Live Nodes: {info['LiveNodes']}")
    print(f"Dead Nodes: {info['DeadNodes']}")

def check_datanode_health(nn_host, nn_port, timeout=10):
    """Print how many DataNodes the NameNode currently counts as live and dead.

    The NameNode's own ``FSNamesystemState`` bean is queried: the
    ``DataNodeInfo`` bean is published by DataNode processes rather than the
    NameNode and carries no live/dead node attributes, so asking the NameNode
    for it never produced any data.

    Args:
        nn_host: NameNode host name.
        nn_port: NameNode HTTP port.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. All results and errors are printed.
    """
    url = f"http://{nn_host}:{nn_port}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystemState"

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error accessing DataNode: {response.status_code}")
            return
        json_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error accessing DataNode: {e}")
        return

    beans = json_data.get('beans')
    if (
        not beans
        or 'NumLiveDataNodes' not in beans[0]
        or 'NumDeadDataNodes' not in beans[0]
    ):
        print("No data available for DataNode health check.")
        return

    state = beans[0]
    print("DataNode Health Status:")
    print(f"Live Nodes: {state['NumLiveDataNodes']}")
    print(f"Dead Nodes: {state['NumDeadDataNodes']}")

# Run both health checks when this code executes as the main module.
if __name__ == '__main__':
    # Host and HTTP port of the NameNode whose /jmx endpoint is queried.
    # NOTE(review): 50070 is presumably the Hadoop 2.x NameNode web port — confirm for your cluster.
    nn_host = 'localhost'
    nn_port = 50070
    
    # Print the live/dead node information from the NameNodeInfo bean
    check_namenode_health(nn_host, nn_port)
    
    # Print the DataNode health information (queried through the same host and port)
    check_datanode_health(nn_host, nn_port)


In [None]:
import pyarrow.hdfs as hdfs

def list_hdfs_path(hdfs_host, hdfs_port, hdfs_path):
    """Print a heading followed by every entry that HDFS lists under ``hdfs_path``.

    Args:
        hdfs_host: Host name passed to ``hdfs.connect``.
        hdfs_port: Port passed to ``hdfs.connect``.
        hdfs_path: Path whose contents are listed.
    """
    connection = hdfs.connect(host=hdfs_host, port=hdfs_port)

    # Fetch the listing before printing anything so a failed lookup leaves no
    # dangling heading behind.
    entries = connection.ls(hdfs_path)

    report = [f"Files and directories in {hdfs_path}:"]
    report.extend(str(entry) for entry in entries)
    print("\n".join(report))

# Connection settings and the path to list; all three are placeholders to adapt to your cluster.
hdfs_host = 'localhost'
hdfs_port = 9000
hdfs_path = '/path/to/hdfs/directory'

# Print every entry found directly under hdfs_path.
list_hdfs_path(hdfs_host, hdfs_port, hdfs_path)



In [None]:
import requests

def analyze_storage_utilization(hdfs_host, hdfs_port, timeout=10):
    """Print the DataNodes with the highest and lowest storage utilization.

    Utilization is ``DfsUsed / Capacity * 100`` for each entry of the first
    bean's ``Storage`` list.

    NOTE(review): confirm that the queried bean really exposes a ``Storage``
    list with ``DataNodeName``/``Capacity``/``DfsUsed`` entries on your Hadoop
    version; the query and schema are kept exactly as they were.

    Args:
        hdfs_host: Host whose ``/jmx`` endpoint is queried.
        hdfs_port: HTTP port of that endpoint.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. All results and errors are printed.
    """
    url = f"http://{hdfs_host}:{hdfs_port}/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState-UndefinedStorageId"
    no_data_message = "No data available for DataNode storage utilization analysis."

    try:
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error accessing DataNode: {response.status_code}")
            return
        json_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error accessing DataNode: {e}")
        return

    beans = json_data.get('beans')
    data_nodes = beans[0].get('Storage') if beans else None
    if not data_nodes:
        print(no_data_message)
        return

    # Calculate storage utilization for each DataNode
    storage_utilization = []
    for data_node in data_nodes:
        capacity = data_node['Capacity']
        if not capacity:
            # Utilization is undefined for a node reporting zero capacity;
            # skipping it avoids a ZeroDivisionError.
            continue
        utilization_percentage = (data_node['DfsUsed'] / capacity) * 100
        storage_utilization.append((data_node['DataNodeName'], utilization_percentage))

    if not storage_utilization:
        # Every node was skipped, so there is nothing to rank.
        print(no_data_message)
        return

    # Sort DataNodes by storage utilization (ascending)
    sorted_utilization = sorted(storage_utilization, key=lambda item: item[1])
    lowest, highest = sorted_utilization[0], sorted_utilization[-1]

    # Print DataNodes with highest and lowest storage utilization
    print("DataNodes Storage Utilization:")
    print(f"Highest Utilization: {highest[0]} - {highest[1]:.2f}%")
    print(f"Lowest Utilization: {lowest[0]} - {lowest[1]:.2f}%")

# Run the storage analysis when this code executes as the main module.
if __name__ == '__main__':
    # Host and HTTP port whose /jmx endpoint is queried.
    # NOTE(review): the query targets a DataNode bean while 50070 is presumably
    # a NameNode web port — confirm which daemon should be addressed.
    hdfs_host = 'localhost'
    hdfs_port = 50070
    
    # Print the highest and lowest DataNode storage utilization
    analyze_storage_utilization(hdfs_host, hdfs_port)


In [None]:
import requests
import time

def submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file, timeout=10):
    """Request a new YARN application id and submit a ``hadoop jar`` job under it.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        job_file: Path of the job JAR placed in the ``hadoop jar`` command.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The application id string when the submission is accepted (HTTP 202),
        otherwise None. Progress and errors are printed.
    """
    apps_url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps"

    try:
        # Step 1: ask the ResourceManager for a fresh application id.
        response = requests.post(f"{apps_url}/new-application", timeout=timeout)
        if response.status_code != 200:
            print(f"Error creating new application: {response.status_code}")
            return None
        application_id = response.json()['application-id']

        # Step 2: submit the application description.
        headers = {'Content-Type': 'application/json'}
        data = {
            "application-id": application_id,
            "application-name": "MyHadoopJob",
            "am-container-spec": {
                "commands": {
                    "command": f"hadoop jar {job_file}"
                }
            },
            # "resource" is a top-level element of the submission body in the
            # ResourceManager REST API, not a child of "am-container-spec".
            "resource": {
                "memory": 1024,
                "vCores": 1
            },
            "unmanaged-AM": False,
            "max-app-attempts": 2
        }

        response = requests.post(submit_url := apps_url, headers=headers, json=data, timeout=timeout) \
            if False else requests.post(apps_url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 202:
            print(f"Error submitting Hadoop job: {response.status_code}")
            return None
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error interacting with ResourceManager: {e}")
        return None

    print(f"Hadoop job submitted. Application ID: {application_id}")
    return application_id

def monitor_job_progress(resourcemanager_host, resourcemanager_port, application_id,
                         poll_interval=5, timeout=10, max_consecutive_errors=5):
    """Poll the ResourceManager and print the application state until it finishes.

    Polling stops when the reported ``finalStatus`` is no longer ``"UNDEFINED"``,
    when ``max_consecutive_errors`` non-200 responses arrive in a row, or when
    a request raises.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        application_id: Id of the application to watch.
        poll_interval: Seconds to sleep between polls.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_consecutive_errors: Number of back-to-back non-200 responses after
            which polling is abandoned.

    Returns:
        None. State changes and errors are printed.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{application_id}"
    consecutive_errors = 0

    try:
        while True:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                consecutive_errors = 0
                app = response.json()['app']
                state = app['state']
                final_status = app['finalStatus']

                print(f"Job State: {state}")

                if final_status != "UNDEFINED":
                    print(f"Job Final Status: {final_status}")
                    break
            else:
                print(f"Error getting job progress: {response.status_code}")
                consecutive_errors += 1
                # A persistent failure (for example an unknown application id)
                # would otherwise keep this loop spinning forever.
                if consecutive_errors >= max_consecutive_errors:
                    print(f"Giving up after {consecutive_errors} consecutive errors.")
                    break

            time.sleep(poll_interval)  # Wait before checking progress again
    except requests.exceptions.RequestException as e:
        print(f"Error interacting with ResourceManager: {e}")

def retrieve_job_output(resourcemanager_host, resourcemanager_port, application_id, timeout=10):
    """Fetch and print the logs of the latest attempt of an application's first job.

    Three requests are made through the ResourceManager proxy: the job list,
    the attempts of the first listed job, and the logs of the attempt with the
    highest id.

    NOTE(review): confirm that the application master serves a
    ``.../jobattempts/<id>/logs`` resource on your Hadoop version; the URL
    layout is kept exactly as it was.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        application_id: Id of the application whose output is wanted.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. The output and any errors are printed.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/proxy/{application_id}/ws/v1/mapreduce/jobs"

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error retrieving job details: {response.status_code}")
            return

        # "jobs" may be absent, null, or hold an empty list; all mean no job.
        jobs = (response.json().get('jobs') or {}).get('job')
        if not jobs:
            print("No job found.")
            return
        job_id = jobs[0]['id']
        job_output_url = f"{url}/{job_id}/jobattempts"

        response = requests.get(job_output_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error retrieving job attempts: {response.status_code}")
            return

        attempts = (response.json().get('jobAttempts') or {}).get('jobAttempt')
        if not attempts:
            print("No job attempts found.")
            return
        # Pick the attempt with the highest id so the choice does not depend
        # on the order in which attempts are listed.
        latest_attempt = max(attempts, key=lambda attempt: attempt['id'])['id']
        output_url = f"{job_output_url}/{latest_attempt}/logs"

        response = requests.get(output_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Error retrieving job output: {response.status_code}")
            return

        job_output = response.text
        print(f"Job Output:\n{job_output}")
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error interacting with ResourceManager: {e}")

# Submit a job, wait for it to finish, then print its output when this code runs as the main module.
if __name__ == '__main__':
    # ResourceManager endpoint used for all REST calls (placeholder values)
    resourcemanager_host = 'localhost'
    resourcemanager_port = 8088
    
    # Placeholder path of the job JAR inserted into the 'hadoop jar' command
    job_file = '/path/to/hadoop/job.jar'
    
    # Submit the Hadoop job; the result is None when submission fails
    application_id = submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file)
    
    # Only continue when an application id was actually obtained
    if application_id:
        # Poll until the ResourceManager reports a final status
        monitor_job_progress(resourcemanager_host, resourcemanager_port, application_id)
        
        # Fetch and print the job's logs
        retrieve_job_output(resourcemanager_host, resourcemanager_port, application_id)


In [None]:
import requests
import time

def submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file, job_args, timeout=10):
    """Request a new YARN application id and submit a ``hadoop jar`` job under it.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        job_file: Path of the job JAR placed in the ``hadoop jar`` command.
        job_args: Iterable of strings appended, space-separated, to the command.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The application id string when the submission is accepted (HTTP 202),
        otherwise None. Progress and errors are printed.
    """
    apps_url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps"

    try:
        # Step 1: ask the ResourceManager for a fresh application id.
        response = requests.post(f"{apps_url}/new-application", timeout=timeout)
        if response.status_code != 200:
            print(f"Error creating new application: {response.status_code}")
            return None
        application_id = response.json()['application-id']

        # Step 2: submit the application description.
        headers = {'Content-Type': 'application/json'}
        data = {
            "application-id": application_id,
            "application-name": "MyHadoopJob",
            "am-container-spec": {
                "commands": {
                    "command": f"hadoop jar {job_file} {' '.join(job_args)}"
                }
            },
            # "resource" is a top-level element of the submission body in the
            # ResourceManager REST API, not a child of "am-container-spec".
            "resource": {
                "memory": 1024,
                "vCores": 1
            },
            "unmanaged-AM": False,
            "max-app-attempts": 2
        }

        response = requests.post(apps_url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 202:
            print(f"Error submitting Hadoop job: {response.status_code}")
            return None
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error interacting with ResourceManager: {e}")
        return None

    print(f"Hadoop job submitted. Application ID: {application_id}")
    return application_id

def track_resource_usage(resourcemanager_host, resourcemanager_port, application_id,
                         poll_interval=5, timeout=10, max_consecutive_errors=5):
    """Poll the ResourceManager and print an application's state and allocated resources.

    Polling stops when the reported ``finalStatus`` is no longer ``"UNDEFINED"``,
    when ``max_consecutive_errors`` non-200 responses arrive in a row, or when
    a request raises.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        application_id: Id of the application to watch.
        poll_interval: Seconds to sleep between polls.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_consecutive_errors: Number of back-to-back non-200 responses after
            which polling is abandoned.

    Returns:
        None. State, resources and errors are printed.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{application_id}"
    consecutive_errors = 0

    try:
        while True:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                consecutive_errors = 0
                app = response.json()['app']
                state = app['state']
                final_status = app['finalStatus']
                # The app object reports allocations as 'allocatedMB' and
                # 'allocatedVCores'; it has no 'allocatedResources' field, so
                # reading that key raised KeyError.
                allocated_mb = app.get('allocatedMB')
                allocated_vcores = app.get('allocatedVCores')

                print(f"Job State: {state}")
                print(f"Job Final Status: {final_status}")
                print(f"Allocated Resources: {allocated_mb} MB, {allocated_vcores} vCores")

                if final_status != "UNDEFINED":
                    break
            else:
                print(f"Error getting job progress: {response.status_code}")
                consecutive_errors += 1
                # A persistent failure (for example an unknown application id)
                # would otherwise keep this loop spinning forever.
                if consecutive_errors >= max_consecutive_errors:
                    print(f"Giving up after {consecutive_errors} consecutive errors.")
                    break

            time.sleep(poll_interval)  # Wait before checking progress again
    except requests.exceptions.RequestException as e:
        print(f"Error interacting with ResourceManager: {e}")

# Submit a job with arguments and track its resource usage when this code runs as the main module.
if __name__ == '__main__':
    # ResourceManager endpoint used for all REST calls (placeholder values)
    resourcemanager_host = 'localhost'
    resourcemanager_port = 8088
    
    # Placeholder JAR path and the arguments appended to the 'hadoop jar' command
    job_file = '/path/to/hadoop/job.jar'
    job_args = ['arg1', 'arg2']
    
    # Submit the Hadoop job; the result is None when submission fails
    application_id = submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file, job_args)
    
    # Only continue when an application id was actually obtained
    if application_id:
        # Poll and print state and allocated resources until a final status is reported
        track_resource_usage(resourcemanager_host, resourcemanager_port, application_id)


In [None]:
import requests
import time

def submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file, job_args, timeout=10):
    """Request a new YARN application id and submit a ``hadoop jar`` job under it.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        job_file: Path of the job JAR placed in the ``hadoop jar`` command.
        job_args: Iterable of strings appended, space-separated, to the command.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The application id string when the submission is accepted (HTTP 202),
        otherwise None. Progress and errors are printed.
    """
    apps_url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps"

    try:
        # Step 1: ask the ResourceManager for a fresh application id.
        response = requests.post(f"{apps_url}/new-application", timeout=timeout)
        if response.status_code != 200:
            print(f"Error creating new application: {response.status_code}")
            return None
        application_id = response.json()['application-id']

        # Step 2: submit the application description.
        headers = {'Content-Type': 'application/json'}
        data = {
            "application-id": application_id,
            "application-name": "MyHadoopJob",
            "am-container-spec": {
                "commands": {
                    "command": f"hadoop jar {job_file} {' '.join(job_args)}"
                }
            },
            # "resource" is a top-level element of the submission body in the
            # ResourceManager REST API, not a child of "am-container-spec".
            "resource": {
                "memory": 1024,
                "vCores": 1
            },
            "unmanaged-AM": False,
            "max-app-attempts": 2
        }

        response = requests.post(apps_url, headers=headers, json=data, timeout=timeout)
        if response.status_code != 202:
            print(f"Error submitting Hadoop job: {response.status_code}")
            return None
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a response body that is not valid JSON.
        print(f"Error interacting with ResourceManager: {e}")
        return None

    print(f"Hadoop job submitted. Application ID: {application_id}")
    return application_id

def track_resource_usage(resourcemanager_host, resourcemanager_port, application_id,
                         poll_interval=5, timeout=10, max_consecutive_errors=5):
    """Poll the ResourceManager and print an application's state and allocated resources.

    Polling stops when the reported ``finalStatus`` is no longer ``"UNDEFINED"``,
    when ``max_consecutive_errors`` non-200 responses arrive in a row, or when
    a request raises.

    Args:
        resourcemanager_host: ResourceManager host name.
        resourcemanager_port: ResourceManager HTTP port.
        application_id: Id of the application to watch.
        poll_interval: Seconds to sleep between polls.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_consecutive_errors: Number of back-to-back non-200 responses after
            which polling is abandoned.

    Returns:
        None. State, resources and errors are printed.
    """
    url = f"http://{resourcemanager_host}:{resourcemanager_port}/ws/v1/cluster/apps/{application_id}"
    consecutive_errors = 0

    try:
        while True:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                consecutive_errors = 0
                app = response.json()['app']
                state = app['state']
                final_status = app['finalStatus']
                # The app object reports allocations as 'allocatedMB' and
                # 'allocatedVCores'; it has no 'allocatedResources' field, so
                # reading that key raised KeyError.
                allocated_mb = app.get('allocatedMB')
                allocated_vcores = app.get('allocatedVCores')

                print(f"Job State: {state}")
                print(f"Job Final Status: {final_status}")
                print(f"Allocated Resources: {allocated_mb} MB, {allocated_vcores} vCores")

                if final_status != "UNDEFINED":
                    break
            else:
                print(f"Error getting job progress: {response.status_code}")
                consecutive_errors += 1
                # A persistent failure (for example an unknown application id)
                # would otherwise keep this loop spinning forever.
                if consecutive_errors >= max_consecutive_errors:
                    print(f"Giving up after {consecutive_errors} consecutive errors.")
                    break

            time.sleep(poll_interval)  # Wait before checking progress again
    except requests.exceptions.RequestException as e:
        print(f"Error interacting with ResourceManager: {e}")

# Submit a job with arguments and track its resource usage when this code runs as the main module.
if __name__ == '__main__':
    # ResourceManager endpoint used for all REST calls (placeholder values)
    resourcemanager_host = 'localhost'
    resourcemanager_port = 8088
    
    # Placeholder JAR path and the arguments appended to the 'hadoop jar' command
    job_file = '/path/to/hadoop/job.jar'
    job_args = ['arg1', 'arg2']
    
    # Submit the Hadoop job; the result is None when submission fails
    application_id = submit_hadoop_job(resourcemanager_host, resourcemanager_port, job_file, job_args)
    
    # Only continue when an application id was actually obtained
    if application_id:
        # Poll and print state and allocated resources until a final status is reported
        track_resource_usage(resourcemanager_host, resourcemanager_port, application_id)


In [None]:
from mrjob.job import MRJob
import time

class MRWordCount(MRJob):
    """Word-count job whose maximum input split size is set with ``-s/--split_size``."""

    def configure_args(self):
        """Add the ``-s/--split_size`` passthrough option to the inherited arguments."""
        super(MRWordCount, self).configure_args()
        self.add_passthru_arg('-s', '--split_size', type=int, default=100, help='Specify the input split size')

    def mapper(self, _, line):
        """Emit ``(word, 1)`` for every whitespace-separated token of the line."""
        for word in line.split():
            yield word, 1

    def reducer(self, word, counts):
        """Emit the total number of occurrences of ``word``."""
        yield word, sum(counts)

    def _split_size_jobconf(self):
        """Return the jobconf entry that carries the requested split size."""
        return {'mapreduce.input.fileinputformat.split.maxsize': str(self.options.split_size)}

    def jobconf(self):
        """Return the inherited jobconf with the split-size entry merged in.

        NOTE(review): this relies on mrjob consulting ``MRJob.jobconf()`` when
        it builds the runner — confirm against the installed mrjob version.
        """
        merged = dict(super(MRWordCount, self).jobconf() or {})
        merged.update(self._split_size_jobconf())
        return merged

    def job_runner_kwargs(self):
        """Legacy hook: return the runner kwargs with the split size merged into ``jobconf``.

        Kept so existing callers of this method keep working. The split size is
        now merged into whatever ``jobconf`` the parent supplied rather than
        replacing it, so other jobconf entries are no longer dropped.
        """
        kwargs = super(MRWordCount, self).job_runner_kwargs()
        merged = dict(kwargs.get('jobconf') or {})
        merged.update(self._split_size_jobconf())
        kwargs['jobconf'] = merged
        return kwargs

# Time one run of the job per split size when this code executes as the main module.
if __name__ == '__main__':
    # Placeholder path of the text file fed to the job
    input_file = '/path/to/your/input_file.txt'
    
    # Split sizes to compare; each value is passed to the job through '-s'
    split_sizes = [100, 500, 1000]
    
    for split_size in split_sizes:
        start_time = time.time()
        
        # Build a fresh job instance with this split size and run it
        MRWordCount(args=[input_file, '-s', str(split_size)]).run_job()
        
        # Wall-clock duration of the whole run_job() call
        end_time = time.time()
        execution_time = end_time - start_time
        
        print(f"Execution time for split size {split_size}: {execution_time} seconds")
