<a href="https://colab.research.google.com/github/SwathilekhaV/forage-jpmc-swe-task-1/blob/main/NASA_SWATHI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip nasa.zip

Archive:  nasa.zip
  inflating: nearest-earth-objects(1910-2024).csv  


In [6]:
import multiprocessing
import pandas as pd

# Define the map function to count hazardous objects
def map_func_hazardous(record):
    return 1 if record['is_hazardous'] else 0

# Define the map function to get NEOs with the least miss distance
def map_func_miss_distance(record):
    return (record['neo_id'], record['miss_distance'])  # Tuple of neo_id and miss_distance

# Define the map function to get names with the largest estimated diameter
def map_func_diameter(record):
    return (record['name'], record['estimated_diameter_max'])  # Tuple of name and diameter

# Reduce function to count hazardous objects
def reduce_func_hazardous(pairs):
    return sum(pairs)

# Reduce function to get the top 3 least miss distances
def reduce_func_miss_distance(pairs):
    return sorted(pairs, key=lambda x: x[1])[:3]  # Sort by miss_distance and take the top 3

# Reduce function to get the top 3 largest diameters
def reduce_func_diameter(pairs):
    return sorted(pairs, key=lambda x: x[1], reverse=True)[:3]  # Sort by diameter (descending) and take top 3

# Main function to run the MapReduce job
if __name__ == "__main__":
    # Load the dataset (adjust the file path as needed)
    df = pd.read_csv('/content/nearest-earth-objects(1910-2024).csv')  # Adjust to your CSV file path

    # Set up a multiprocessing pool for the Map phase

    # 1. Count hazardous NEOs
    with multiprocessing.Pool() as pool:
        # Apply map_func_hazardous to each record in parallel
        hazardous_map = pool.map(map_func_hazardous, df.to_dict(orient='records'))
    total_hazardous = reduce_func_hazardous(hazardous_map)
    print(f"Total number of hazardous NEOs: {total_hazardous}")

    # 2. Get 3 NEOs with the least miss distance
    with multiprocessing.Pool() as pool:
        # Apply map_func_miss_distance to each record in parallel
        miss_distance_map = pool.map(map_func_miss_distance, df.to_dict(orient='records'))
    top_3_miss_distance = reduce_func_miss_distance(miss_distance_map)
    print(f"Top 3 NEOs with least miss distance: {top_3_miss_distance}")

    # 3. Get 3 names with the largest estimated diameter
    with multiprocessing.Pool() as pool:
        # Apply map_func_diameter to each record in parallel
        diameter_map = pool.map(map_func_diameter, df.to_dict(orient='records'))
    top_3_diameter = reduce_func_diameter(diameter_map)
    print(f"Top 3 NEOs with largest estimated diameter: {top_3_diameter}")


Total number of hazardous NEOs: 43162
Top 3 NEOs with least miss distance: [(54087809, 6745.532515957), (54445916, 8098.256295645), (54051131, 9316.925424026)]
Top 3 NEOs with largest estimated diameter: [('1036 Ganymed (A924 UB)', 83.9537266171), ('1036 Ganymed (A924 UB)', 83.9537266171), ('433 Eros (A898 PA)', 49.2084832235)]
