In [None]:
import configparser

def read_hadoop_config(file_path):
    """Print the core Hadoop settings found in a configuration file.

    Hadoop's ``core-site.xml`` is an XML document: a ``<configuration>`` root
    holding ``<property>`` elements with ``<name>`` / ``<value>`` children.
    ``configparser`` cannot read that layout, so the file is parsed as XML
    first. If it is missing or is not well-formed XML, the function falls
    back to the INI layout with a ``[core-site]`` section.

    Args:
        file_path: Path to the configuration file.

    Returns:
        None. The ``fs.defaultFS`` and ``hadoop.tmp.dir`` values are printed
        when present; a notice is printed when an INI-style (or missing)
        file has no ``[core-site]`` section.
    """
    # Imported locally so the function only relies on this cell's own imports.
    import xml.etree.ElementTree as ET

    try:
        root = ET.parse(file_path).getroot()
    except (OSError, ET.ParseError):
        # Unreadable or not XML: handled by the INI fallback below.
        root = None

    if root is not None:
        settings = {}
        for prop in root.iter('property'):
            name = prop.findtext('name')
            if name is not None:
                # A repeated property name overwrites the earlier value.
                settings[name.strip()] = (prop.findtext('value') or '').strip()
    else:
        config = configparser.ConfigParser()
        # ConfigParser.read() silently skips files it cannot open.
        config.read(file_path)
        if 'core-site' not in config:
            print('No core-site section found in the configuration file.')
            return
        settings = config['core-site']

    # Display the core components of Hadoop
    if 'fs.defaultFS' in settings:
        print('File System: ', settings['fs.defaultFS'])
    if 'hadoop.tmp.dir' in settings:
        print('Temporary Directory: ', settings['hadoop.tmp.dir'])
    # Add more components as per your requirement

# Path to the Hadoop configuration file to inspect.
# The value below is a placeholder; replace it with the real location before running.
config_file_path = 'path/to/hadoop/conf/core-site.xml'

# Print the core settings (default file system, temporary directory) found in that file.
read_hadoop_config(config_file_path)


In [None]:
from pywebhdfs.webhdfs import PyWebHdfsClient

def calculate_total_file_size(directory_path, hdfs_host, hdfs_port, hdfs_user):
    """Return the combined size of the files listed in an HDFS directory.

    Only entries whose ``type`` is ``'FILE'`` in the directory listing are
    counted; other entries (such as sub-directories) are ignored and not
    descended into.

    Args:
        directory_path: HDFS directory to inspect.
        hdfs_host: Host passed to the WebHDFS client.
        hdfs_port: Port passed to the WebHDFS client.
        hdfs_user: User name passed to the WebHDFS client.

    Returns:
        The sum of the ``length`` fields of the listed files (0 if none).
    """
    client = PyWebHdfsClient(host=hdfs_host, port=hdfs_port, user_name=hdfs_user)

    # The listing nests the per-entry records under FileStatuses -> FileStatus.
    listing = client.list_dir(directory_path)
    entries = listing['FileStatuses']['FileStatus']

    return sum(entry['length'] for entry in entries if entry['type'] == 'FILE')

# Connection settings for the WebHDFS client. The values below are placeholders
# and must be adapted to the target cluster.
directory_path = '/path/to/hdfs/directory'
hdfs_host = 'localhost'
hdfs_port = 50070  # NOTE(review): Hadoop 3.x NameNodes default to 9870 for HTTP; confirm for your cluster
hdfs_user = 'hadoop'

# Add up the sizes of the files listed under directory_path.
total_size = calculate_total_file_size(directory_path, hdfs_host, hdfs_port, hdfs_user)

print("Total File Size:", total_size, "bytes")


In [None]:
from mrjob.job import MRJob
from mrjob.step import MRStep
import re

# Matches runs of word characters and apostrophes, so a contraction such as "don't" is one token.
WORD_REGEX = re.compile(r"[\w']+")

class TopNWords(MRJob):
    """MapReduce job that reports the N most frequent words in its input.

    Step 1 counts the occurrences of each lower-cased word. Step 2 receives
    every ``(count, word)`` pair under the single key ``None`` and keeps the
    N largest. N is set with ``-n`` / ``--topN`` (default 10).
    """

    def configure_args(self):
        """Register the ``-n`` / ``--topN`` pass-through option."""
        super(TopNWords, self).configure_args()
        self.add_passthru_arg('-n', '--topN', type=int, default=10, help='Number of top words to display')

    def steps(self):
        """Return the two steps: count words, then select the top N."""
        return [
            MRStep(mapper=self.mapper_get_words,
                   combiner=self.combiner_count_words,
                   reducer=self.reducer_count_words),
            MRStep(reducer=self.reducer_find_topN)
        ]

    def mapper_get_words(self, _, line):
        """Yield ``(word, 1)`` for every WORD_REGEX match in ``line``, lower-cased."""
        for word in WORD_REGEX.findall(line):
            yield word.lower(), 1

    def combiner_count_words(self, word, counts):
        """Yield a partial sum of the counts seen for ``word``."""
        yield word, sum(counts)

    def reducer_count_words(self, word, counts):
        """Yield ``(None, (total, word))``.

        Using the shared key ``None`` hands every pair to one call of the
        next step's reducer, which needs to see all of them to rank them.
        """
        yield None, (sum(counts), word)

    def reducer_find_topN(self, _, word_counts):
        """Yield the ``topN`` most frequent ``(word, count)`` pairs, highest first.

        Pairs are ordered by count and then by word, both descending. Fewer
        than ``topN`` pairs are yielded when the input has fewer distinct
        words; nothing is yielded when ``topN`` is zero or negative.
        """
        # Imported locally so the class only relies on this cell's own imports.
        import heapq

        # nlargest(n, it) returns the same items, in the same order, as
        # sorted(it, reverse=True)[:n], but only keeps n items in memory
        # instead of sorting the whole vocabulary.
        for count, word in heapq.nlargest(self.options.topN, word_counts):
            yield word, count

# Run the job only when this code is executed as a script, not when it is imported.
# NOTE(review): mrjob usually expects the job class to live in its own .py file;
# confirm whether this cell is meant to be saved as a script rather than run in the kernel.
if __name__ == '__main__':
    TopNWords.run()


In [None]:
import requests

# JMX endpoints polled by the health checks: the NameNode's NameNodeInfo bean
# and a DataNode's FSDatasetState bean. Replace the <...> placeholders with real hosts and ports.
namenode_url = 'http://<namenode_host>:<namenode_port>/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo'
datanode_url = 'http://<datanode_host>:<datanode_port>/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState'

def check_namenode_health():
    """Query the NameNode's JMX endpoint and print whether it has live nodes.

    Reads the module-level ``namenode_url``. The NameNodeInfo bean reports
    ``LiveNodes`` as a JSON-encoded string, so a cluster without live nodes
    yields the non-empty text ``"{}"``. The value is decoded before it is
    tested; otherwise that case would be reported as healthy.

    Returns:
        None. One status line is printed: healthy (with the raw ``LiveNodes``
        value), unhealthy, or a failure notice when the endpoint cannot be
        reached, answers with a non-200 status, or lacks the attribute.
    """
    # Imported locally so the function only relies on this cell's own imports.
    import json

    failure_message = "Failed to retrieve Namenode health information."

    try:
        # A health check must not hang on a stalled node, hence the timeout.
        response = requests.get(namenode_url, timeout=10)
    except requests.RequestException:
        print(failure_message)
        return

    if response.status_code != 200:
        print(failure_message)
        return

    beans = response.json().get('beans') or []
    if not beans or 'LiveNodes' not in beans[0]:
        print(failure_message)
        return

    raw_live_nodes = beans[0]['LiveNodes']
    live_nodes = raw_live_nodes
    if isinstance(raw_live_nodes, str):
        try:
            live_nodes = json.loads(raw_live_nodes)
        except ValueError:
            # Not JSON after all: fall back to plain truthiness of the raw text.
            live_nodes = raw_live_nodes

    if live_nodes:
        print("Namenode is healthy. Live Nodes: ", raw_live_nodes)
    else:
        print("Namenode is unhealthy. No Live Nodes found.")

def check_datanode_health():
    """Query the JMX endpoint in ``datanode_url`` and print the live DataNode status.

    Reads the module-level ``datanode_url`` and looks for a
    ``NumLiveDataNodes`` attribute on the first returned bean.

    NOTE(review): ``NumLiveDataNodes`` is documented as a NameNode
    (FSNamesystemState) metric; confirm that the bean addressed by
    ``datanode_url`` really exposes it. When it does not, a failure notice
    is printed instead of raising ``KeyError``.

    Returns:
        None. One status line is printed: healthy (with the count),
        unhealthy, or a failure notice when the endpoint cannot be reached,
        answers with a non-200 status, or lacks the attribute.
    """
    failure_message = "Failed to retrieve DataNode health information."

    try:
        # A health check must not hang on a stalled node, hence the timeout.
        response = requests.get(datanode_url, timeout=10)
    except requests.RequestException:
        print(failure_message)
        return

    if response.status_code != 200:
        print(failure_message)
        return

    beans = response.json().get('beans') or []
    if not beans or 'NumLiveDataNodes' not in beans[0]:
        print(failure_message)
        return

    num_live_data_nodes = beans[0]['NumLiveDataNodes']
    if num_live_data_nodes > 0:
        print("DataNodes are healthy. Number of Live DataNodes: ", num_live_data_nodes)
    else:
        print("DataNodes are unhealthy. No Live DataNodes found.")

# Run both checks; each one prints its own status line.
check_namenode_health()
check_datanode_health()


In [None]:
from pywebhdfs.webhdfs import PyWebHdfsClient

def list_hdfs_path(hdfs_path, hdfs_host, hdfs_port, hdfs_user):
    """Print the name of every entry listed under an HDFS path.

    Entries whose ``type`` is ``'DIRECTORY'`` are printed with a trailing
    ``/``; all other entries are printed as they are.

    Args:
        hdfs_path: HDFS path to list.
        hdfs_host: Host passed to the WebHDFS client.
        hdfs_port: Port passed to the WebHDFS client.
        hdfs_user: User name passed to the WebHDFS client.

    Returns:
        None. One line is printed per entry.
    """
    client = PyWebHdfsClient(host=hdfs_host, port=hdfs_port, user_name=hdfs_user)

    # The listing nests the per-entry records under FileStatuses -> FileStatus.
    entries = client.list_dir(hdfs_path)['FileStatuses']['FileStatus']

    for entry in entries:
        label = entry['pathSuffix']
        print(label + '/' if entry['type'] == 'DIRECTORY' else label)

# Connection settings for the WebHDFS client. The values below are placeholders
# and must be adapted to the target cluster.
hdfs_path = '/path/to/hdfs/directory'
hdfs_host = 'localhost'
hdfs_port = 50070  # NOTE(review): Hadoop 3.x NameNodes default to 9870 for HTTP; confirm for your cluster
hdfs_user = 'hadoop'

# Print one line per entry found under hdfs_path (directories get a trailing '/').
list_hdfs_path(hdfs_path, hdfs_host, hdfs_port, hdfs_user)


In [None]:
import requests

# JMX endpoint of a DataNode's FSDatasetState bean; replace the <...> placeholders
# with a real host and port. An earlier cell assigns the same value to this name.
datanode_url = 'http://<datanode_host>:<datanode_port>/jmx?qry=Hadoop:service=DataNode,name=FSDatasetState'

def analyze_storage_utilization():
    """Print the entries with the highest and the lowest storage capacity.

    Reads the module-level ``datanode_url`` and ranks the returned entries
    by their ``capacity`` field, matching the "highest/lowest storage
    capacity" headings that are printed. (Ranking by ``usedSpace`` could
    report an entry under a heading it does not deserve.)

    NOTE(review): the payload is assumed to be
    ``beans[0]['StorageInfo']['0']['DataNodeVolumeInfo']``, a sequence of
    dicts with ``hostName``, ``capacity`` and ``usedSpace`` keys. Confirm
    this against the JMX output of the target Hadoop version.

    Returns:
        None. The report, or a notice when nothing could be retrieved, is
        printed.
    """
    def _report(heading, node):
        # One four-line section of the report.
        print(heading)
        print("Host: ", node['hostName'])
        print("Storage Capacity: ", node['capacity'])
        print("Used Space: ", node['usedSpace'])

    response = requests.get(datanode_url)
    if response.status_code != 200:
        print("Failed to retrieve DataNode information.")
        return

    json_data = response.json()
    datanodes = json_data['beans'][0]['StorageInfo']['0']['DataNodeVolumeInfo']
    if not datanodes:
        print("No DataNodes found.")
        return

    # Largest capacity first, so the two ends of the list are the extremes.
    by_capacity = sorted(datanodes, key=lambda node: node['capacity'], reverse=True)

    _report("DataNode with highest storage capacity:", by_capacity[0])
    print()
    _report("DataNode with lowest storage capacity:", by_capacity[-1])

# Fetch the JMX data from datanode_url and print the storage report.
analyze_storage_utilization()


In [None]:
import requests
import time

# Base URL of the YARN ResourceManager REST API (the "cluster" resource).
# Replace the <...> placeholders with a real host and port.
resource_manager_url = 'http://<resource_manager_host>:<resource_manager_port>/ws/v1/cluster'

def submit_hadoop_job(jar_path, main_class, input_path, output_path):
    """Submit a ``hadoop jar`` command as a YARN application.

    The ResourceManager REST API uses a two-step submission:

    1. ``POST {base}/apps/new-application`` reserves an application id.
    2. ``POST {base}/apps`` submits a payload carrying that id. On success
       the answer is ``202 Accepted`` with no body, so the id returned here
       is the one reserved in step 1.

    Reads the module-level ``resource_manager_url``.

    Args:
        jar_path: Path of the job jar.
        main_class: Fully qualified main class inside the jar.
        input_path: Input path handed to the job.
        output_path: Output path handed to the job.

    Returns:
        The application id on success, otherwise ``None`` (a failure notice
        is printed).
    """
    # Step 1: reserve an application id.
    new_application = requests.post(resource_manager_url + '/apps/new-application')
    if new_application.status_code != 200:
        print("Failed to submit the Hadoop job.")
        return None
    application_id = new_application.json()['application-id']

    # Step 2: submit the application under the reserved id.
    payload = {
        'application-id': application_id,
        'application-name': 'Hadoop Job',
        'am-container-spec': {
            'commands': {
                'command': 'hadoop jar {} {} {} {}'.format(jar_path, main_class, input_path, output_path)
            },
            'local-resources': {},
            'environment': {}
        }
    }

    response = requests.post(resource_manager_url + '/apps', json=payload)
    if response.status_code == 202:
        print("Hadoop job submitted successfully. Application ID: ", application_id)
        return application_id

    print("Failed to submit the Hadoop job.")
    return None

def monitor_job_progress(application_id):
    """Poll the ResourceManager every 5 seconds and print the job state.

    Reads the module-level ``resource_manager_url``. Polling stops when the
    state is ``FINISHED``, when it is ``FAILED`` or ``KILLED`` (an extra
    notice is printed), or when a status request does not answer with 200.

    Args:
        application_id: Id of the application to watch.

    Returns:
        None in every case.
    """
    status_url = resource_manager_url + '/apps/' + application_id
    unsuccessful_states = ('FAILED', 'KILLED')

    while True:
        reply = requests.get(status_url)
        if reply.status_code != 200:
            print("Failed to retrieve job status.")
            return

        state = reply.json()['app']['state']
        print("Job status: ", state)

        if state == 'FINISHED':
            return
        if state in unsuccessful_states:
            print("Job execution failed or was killed.")
            return

        # Still in progress: wait before asking again.
        time.sleep(5)

def retrieve_job_output(application_id):
    """Print the final status of a YARN application.

    Reads the module-level ``resource_manager_url`` and queries the
    application resource ``{base}/apps/{id}``, whose ``app`` object carries
    ``finalStatus``. The ``{base}/apps/{id}/state`` sub-resource only
    returns ``{"state": ...}`` and therefore cannot be used here.

    Args:
        application_id: Id of the application to inspect.

    Returns:
        None. The final status, or a failure notice, is printed.
    """
    response = requests.get(resource_manager_url + '/apps/' + application_id)
    if response.status_code == 200:
        final_output = response.json()['app']['finalStatus']
        print("Job output: ", final_output)
    else:
        print("Failed to retrieve job output.")

# Job description. All four values are placeholders to be replaced with real paths
# and the real main class before running.
jar_path = '/path/to/your/hadoop/job.jar'
main_class = 'com.example.YourMainClass'
input_path = '/path/to/your/input'
output_path = '/path/to/your/output'

# Submit the job. submit_hadoop_job returns None when the submission fails.
application_id = submit_hadoop_job(jar_path, main_class, input_path, output_path)

# Only follow up on a job that was actually submitted.
if application_id:
    # Poll and print the job state until polling stops.
    monitor_job_progress(application_id)

    # Print the application's final status.
    retrieve_job_output(application_id)


In [None]:
import requests
import time

# Base URL of the YARN ResourceManager REST API (the "cluster" resource).
# Replace the <...> placeholders with a real host and port.
# An earlier cell assigns the same value to this name.
resource_manager_url = 'http://<resource_manager_host>:<resource_manager_port>/ws/v1/cluster'

def submit_hadoop_job(jar_path, main_class, input_path, output_path, memory_mb, vcores):
    """Submit a ``hadoop jar`` command as a YARN application with a resource request.

    The ResourceManager REST API uses a two-step submission:

    1. ``POST {base}/apps/new-application`` reserves an application id.
    2. ``POST {base}/apps`` submits a payload carrying that id. On success
       the answer is ``202 Accepted`` with no body, so the id returned here
       is the one reserved in step 1.

    Reads the module-level ``resource_manager_url``.

    Args:
        jar_path: Path of the job jar.
        main_class: Fully qualified main class inside the jar.
        input_path: Input path handed to the job.
        output_path: Output path handed to the job.
        memory_mb: Memory requested for the application, in MB.
        vcores: Number of virtual cores requested for the application.

    Returns:
        The application id on success, otherwise ``None`` (a failure notice
        is printed).
    """
    # Step 1: reserve an application id.
    new_application = requests.post(resource_manager_url + '/apps/new-application')
    if new_application.status_code != 200:
        print("Failed to submit the Hadoop job.")
        return None
    application_id = new_application.json()['application-id']

    # Step 2: submit the application under the reserved id.
    payload = {
        'application-id': application_id,
        'application-name': 'Hadoop Job',
        'am-container-spec': {
            'commands': {
                'command': 'hadoop jar {} {} {} {}'.format(jar_path, main_class, input_path, output_path)
            },
            'local-resources': {},
            'environment': {}
        },
        'resource': {
            'memory': memory_mb,
            'vCores': vcores
        }
    }

    response = requests.post(resource_manager_url + '/apps', json=payload)
    if response.status_code == 202:
        print("Hadoop job submitted successfully. Application ID: ", application_id)
        return application_id

    print("Failed to submit the Hadoop job.")
    return None

def monitor_resource_usage(application_id):
    """Poll the ResourceManager every 5 seconds and print resource usage and state.

    Reads the module-level ``resource_manager_url``. Usage is taken from the
    application resource ``{base}/apps/{id}``, whose ``app`` object reports
    ``allocatedMB`` and ``allocatedVCores`` alongside ``state``. One request
    per round therefore covers both the usage figures and the completion
    check. (The ResourceManager REST API has no per-application
    ``/statistics`` sub-resource.)

    Polling stops when the state is ``FINISHED``, when it is ``FAILED`` or
    ``KILLED`` (an extra notice is printed), or when a request does not
    answer with 200.

    Args:
        application_id: Id of the application to watch.

    Returns:
        None in every case.
    """
    app_url = resource_manager_url + '/apps/' + application_id

    while True:
        response = requests.get(app_url)
        if response.status_code != 200:
            print("Failed to retrieve resource usage statistics.")
            return

        app = response.json()['app']

        # Resources currently allocated to the application's containers.
        print("Memory Usage: ", app.get('allocatedMB'), " MB")
        print("vCores Usage: ", app.get('allocatedVCores'))

        # Check if the job is completed
        status = app['state']
        print("Job status: ", status)
        if status == 'FINISHED':
            break
        elif status in ['FAILED', 'KILLED']:
            print("Job execution failed or was killed.")
            return

        time.sleep(5)  # Wait for 5 seconds before checking the status again

# Job description and resource request. The paths and the main class are placeholders
# to be replaced before running; memory is given in MB.
jar_path = '/path/to/your/hadoop/job.jar'
main_class = 'com.example.YourMainClass'
input_path = '/path/to/your/input'
output_path = '/path/to/your/output'
memory_mb = 1024
vcores = 2

# Submit the job. submit_hadoop_job returns None when the submission fails.
application_id = submit_hadoop_job(jar_path, main_class, input_path, output_path, memory_mb, vcores)

# Only monitor a job that was actually submitted.
if application_id:
    # Print resource usage and job state until polling stops.
    monitor_resource_usage(application_id)


In [None]:
from mrjob.job import MRJob
import time

class PerformanceComparisonJob(MRJob):
    """MapReduce job that records how long each mapper task ran.

    Every mapper task publishes its wall-clock duration, in whole
    milliseconds, to a counter labelled with the ``--split-size`` value, so
    runs made with different split sizes can be compared.

    NOTE(review): ``--split-size`` is only used as a label here; nothing in
    this class applies it to the job's input splits. Confirm that the real
    split size is configured separately and matches the label.
    """

    def configure_args(self):
        """Register the ``-s`` / ``--split-size`` pass-through option."""
        super(PerformanceComparisonJob, self).configure_args()
        self.add_passthru_arg('-s', '--split-size', type=int, default=64, help='Input split size in MB')

    def mapper_init(self):
        """Remember when this mapper task started."""
        # A monotonic clock is unaffected by system clock adjustments.
        self.start_time = time.monotonic()

    def mapper(self, _, line):
        """Placeholder workload: emit one count per input line.

        Replace with the real map logic to be benchmarked. A body is
        required; a method containing only comments is a syntax error.
        """
        yield 'lines', 1

    def mapper_final(self):
        """Publish this mapper task's elapsed time as a counter."""
        split_size_mb = self.options.split_size
        elapsed_seconds = time.monotonic() - self.start_time
        # increment_counter() only accepts integer amounts, so the duration
        # is reported in whole milliseconds rather than fractional seconds.
        elapsed_ms = int(elapsed_seconds * 1000)
        self.increment_counter('Execution Time (ms)', f'Split Size: {split_size_mb}MB', elapsed_ms)

    def reducer(self, key, values):
        """Placeholder workload: sum the counts emitted for ``key``.

        Replace with the real reduce logic to be benchmarked.
        """
        yield key, sum(values)

# Run the job only when this code is executed as a script, not when it is imported.
# NOTE(review): mrjob usually expects the job class to live in its own .py file;
# confirm whether this cell is meant to be saved as a script rather than run in the kernel.
if __name__ == '__main__':
    PerformanceComparisonJob.run()
