# Code to extract JSON data (Question 4)

In [2]:
import requests
import csv

def download_data(url):
    """
    Download data from the given URL and return the JSON response.
    """
    response = requests.get(url)
    return response.json()

def process_data(data):
    """
    Process the JSON data and extract the desired fields.
    Returns a list of dictionaries representing the processed data.
    """
    processed_data = []
    for item in data:
        # Extract the desired fields from the JSON data
        processed_item = {
            'id': item.get('id', ''),
            'name': item.get('name', ''),
            'year': item.get('year', ''),
            'mass': item.get('mass', ''),
            'reclat': item.get('reclat', ''),
            'reclong': item.get('reclong', ''),
            'geolocation_type': item['geolocation']['type'] if 'geolocation' in item and 'type' in item['geolocation'] else '',
            'geolocation_coordinates': item['geolocation']['coordinates'] if 'geolocation' in item and 'coordinates' in item['geolocation'] else ''
        }
        processed_data.append(processed_item)
    return processed_data

def save_as_csv(data, filename):
    """
    Save the processed data as a CSV file with the given filename.
    """
    keys = data[0].keys()
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved as {filename}")

# Main program
url = "https://data.nasa.gov/resource/y77d-th95.json"
data = download_data(url)
processed_data = process_data(data)
save_as_csv(processed_data, "nasa_data.csv")



Data saved as nasa_data.csv


# Drawing insights

In [9]:
import pandas as pd

In [11]:
#viewing data
df=pd.read_csv('nasa_data.csv')
df.head(10) 


Unnamed: 0,id,name,year,mass,reclat,reclong,geolocation_type,geolocation_coordinates
0,1,Aachen,1880-01-01T00:00:00.000,21.0,50.775,6.08333,Point,"[6.08333, 50.775]"
1,2,Aarhus,1951-01-01T00:00:00.000,720.0,56.18333,10.23333,Point,"[10.23333, 56.18333]"
2,6,Abee,1952-01-01T00:00:00.000,107000.0,54.21667,-113.0,Point,"[-113, 54.21667]"
3,10,Acapulco,1976-01-01T00:00:00.000,1914.0,16.88333,-99.9,Point,"[-99.9, 16.88333]"
4,370,Achiras,1902-01-01T00:00:00.000,780.0,-33.16667,-64.95,Point,"[-64.95, -33.16667]"
5,379,Adhi Kot,1919-01-01T00:00:00.000,4239.0,32.1,71.8,Point,"[71.8, 32.1]"
6,390,Adzhi-Bogdo (stone),1949-01-01T00:00:00.000,910.0,44.83333,95.16667,Point,"[95.16667, 44.83333]"
7,392,Agen,1814-01-01T00:00:00.000,30000.0,44.21667,0.61667,Point,"[0.61667, 44.21667]"
8,398,Aguada,1930-01-01T00:00:00.000,1620.0,-31.6,-65.23333,Point,"[-65.23333, -31.6]"
9,417,Aguila Blanca,1920-01-01T00:00:00.000,1440.0,-30.86667,-64.55,Point,"[-64.55, -30.86667]"


In [21]:
#looking for null values
print(df['year'].isna().sum())

1


In [22]:
# dropping null values
df.dropna(subset=['year'], inplace=True)

In [None]:
df['year']

In [41]:
#since year is not in correct format, changing it
df['year'] = df['year'].astype(str).apply(lambda x: int(x[:4]))

In [42]:
df['year']

0      1880
1      1951
2      1952
3      1976
4      1902
       ... 
995    1934
996    2011
997    1869
998    1922
999    1905
Name: year, Length: 999, dtype: int64

**Earth meteorites that fell before the year 2000**

In [36]:
# Earth meteorites that fell before the year 2000
meteorites_before_2000 = df[(df['year'] < 2000)][['id', 'name', 'year']].reset_index(drop=True)
print(meteorites_before_2000)

        id       name  year
0        1     Aachen  1880
1        2     Aarhus  1951
2        6       Abee  1952
3       10   Acapulco  1976
4      370    Achiras  1902
..     ...        ...   ...
924  24004   Timochin  1807
925  24009   Tirupati  1934
926  24011      Tjabe  1869
927  24012   Tjerebon  1922
928  24019  Tomakovka  1905

[929 rows x 3 columns]


**earth meteorites co-ordinates who fell before the year 1970**

In [38]:
# earth meteorites co-ordinates who fell before the year 1970
meteorites_before_1970 = df[(df['year'] < 1970)][['id', 'name', 'year','geolocation_coordinates']].reset_index(drop=True)
print(meteorites_before_1970)

        id       name  year geolocation_coordinates
0        1     Aachen  1880       [6.08333, 50.775]
1        2     Aarhus  1951    [10.23333, 56.18333]
2        6       Abee  1952        [-113, 54.21667]
3      370    Achiras  1902     [-64.95, -33.16667]
4      379   Adhi Kot  1919            [71.8, 32.1]
..     ...        ...   ...                     ...
775  24004   Timochin  1807            [35.2, 54.5]
776  24009   Tirupati  1934    [79.41667, 13.63333]
777  24011      Tjabe  1869   [111.53333, -7.08333]
778  24012   Tjerebon  1922   [106.58333, -6.66667]
779  24019  Tomakovka  1905       [34.76667, 47.85]

[780 rows x 4 columns]


**Earth meteorites whose mass was more than 10000 kg**

In [39]:
# Earth meteorites whose mass was more than 10000 kg
meteorites_mass_over_10000kg = df[(df['mass'] < 10000)][['id', 'name', 'year','mass']].reset_index(drop=True)
print(meteorites_mass_over_10000kg)

        id       name  year    mass
0        1     Aachen  1880    21.0
1        2     Aarhus  1951   720.0
2       10   Acapulco  1976  1914.0
3      370    Achiras  1902   780.0
4      379   Adhi Kot  1919  4239.0
..     ...        ...   ...     ...
716  23984  Tianzhang  1986  2232.0
717  23999  Tillaberi  1970  3000.0
718  24009   Tirupati  1934   230.0
719  54823    Tissint  2011  7000.0
720  24019  Tomakovka  1905   600.0

[721 rows x 4 columns]
