<a href="https://colab.research.google.com/github/UNEEBASHAIKH/NASA-MeteorSense-AI-2025/blob/Data-Cleaning/Meteor_Madness.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import requests
import pandas as pd

# --- 1. Close approaches from the JPL SSD close-approach (CAD) API ----------
cad_url = "https://ssd-api.jpl.nasa.gov/cad.api"
params = {
    "dist-max": "0.05",
    "date-min": "2025-01-01",
    "date-max": "2026-12-31",
    "sort": "date",
    "body": "Earth",
    "limit": 10000,
    "fullname": "true"
}
# A timeout keeps the cell from hanging forever, and raise_for_status()
# surfaces HTTP errors here instead of as a confusing KeyError on "fields".
cad_response = requests.get(cad_url, params=params, timeout=60)
cad_response.raise_for_status()
cad_data = cad_response.json()
cols = cad_data["fields"]
df = pd.DataFrame(cad_data["data"], columns=cols)

# --- 2. Per-object details from the JPL Small-Body Database (SBDB) API ------
sbdb_url = "https://ssd-api.jpl.nasa.gov/sbdb.api"


def _parse_sbdb_detail(des, detail):
    """Extract diameter, orbit class name and PHA flag from one SBDB response.

    NOTE(review): the layout below follows the SBDB API documentation --
    `orbit_class` and `pha` are nested under "object", and "phys_par" is a
    list of {"name": ..., "value": ...} records that is only returned when
    the request sets `phys-par`. Confirm against a live response.

    Parameters
    ----------
    des : str
        Designation the lookup was made for (copied into the result).
    detail : dict
        Decoded JSON body of the SBDB response; may be empty.

    Returns
    -------
    dict
        Keys "des", "diameter", "orbit_class", "pha"; a value is None when
        the response does not contain it. Values are kept exactly as the API
        returned them (no numeric conversion happens here).
    """
    obj = detail.get("object") or {}
    orbit_class = obj.get("orbit_class") or {}
    diameter = None
    for par in detail.get("phys_par") or []:
        if par.get("name") == "diameter":
            diameter = par.get("value")
            break
    return {
        "des": des,
        "diameter": diameter,
        "orbit_class": orbit_class.get("name"),
        "pha": obj.get("pha"),
    }


details = []
failed = []
# One lookup per *unique* designation: an object with several close
# approaches appears on several rows, and duplicate keys on the right-hand
# side of the merge below would multiply those rows.
with requests.Session() as session:
    for des in df["des"].drop_duplicates():
        try:
            # Passing params lets requests URL-encode designations that
            # contain spaces (e.g. "2024 AB").
            response = session.get(
                sbdb_url,
                params={"sstr": des, "phys-par": "true"},
                timeout=30,
            )
            response.raise_for_status()
            detail = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Keep going: a single failed lookup should not discard the
            # lookups that already succeeded. The object keeps empty details.
            failed.append((des, str(exc)))
            detail = {}
        details.append(_parse_sbdb_detail(des, detail))

if failed:
    print(f"SBDB lookup failed for {len(failed)} object(s); their detail columns are left empty")

# Explicit columns keep the merge key present even when no lookups were made.
extra_df = pd.DataFrame(details, columns=["des", "diameter", "orbit_class", "pha"])
df = df.merge(extra_df, on="des", how="left")
print(len(df))




1424


In [None]:
# Rename the raw API field names to human-readable column labels.
rename_map = dict(
    [
        ("des", "Asteroid ID"),
        ("orbit_id", "Orbit Number"),
        ("jd", "Julian Date"),
        ("cd", "Close Approach Date"),
        ("dist", "Distance from Earth (AU)"),
        ("dist_min", "Minimum Distance (AU)"),
        ("dist_max", "Maximum Distance (AU)"),
        ("v_rel", "Relative Velocity (km/s)"),
        ("v_inf", "Velocity w/o Gravity (km/s)"),
        ("t_sigma_f", "Timing Uncertainty (days)"),
        ("h", "Absolute Magnitude (H)"),
        ("fullname", "Asteroid Name"),
        ("diameter", "Estimated Diameter (km)"),
        ("orbit_class", "Orbit Type"),
        ("pha", "Potentially Hazardous (Y/N)"),
    ]
)

# Keys that are not present as columns are ignored by rename().
df = df.rename(columns=rename_map)


In [4]:
# Cleaning
# Assignment instead of inplace=True so the result does not depend on how
# many times the cell has been run against a shared object.
df = df.drop_duplicates()

# Drop only rows that lack a field every later step relies on. A blanket
# dropna() would also discard every row with a missing optional value (for
# example the diameter), leaving nothing for the median imputation below.
required_cols = [
    col
    for col in ("Asteroid ID", "Close Approach Date", "Distance from Earth (AU)")
    if col in df.columns
]
if required_cols:
    df = df.dropna(subset=required_cols)
else:
    # Columns not renamed (yet): fall back to removing fully empty rows only.
    df = df.dropna(how="all")

In [5]:
 #Handle missing values

df = df.copy()

# Measurement columns may still hold text at this point (later cells coerce
# them with pd.to_numeric as well), in which case the dtype-based selection
# below would find no numeric columns and the imputation, outlier and scaling
# steps would silently do nothing. Convert them first.
# 'Julian Date' is deliberately left to the type-casting cell.
measurement_cols = [
    "Distance from Earth (AU)",
    "Minimum Distance (AU)",
    "Maximum Distance (AU)",
    "Relative Velocity (km/s)",
    "Velocity w/o Gravity (km/s)",
    "Absolute Magnitude (H)",
    "Estimated Diameter (km)",
]
for col in measurement_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

numeric_frame = df.select_dtypes(include=['float64', 'int64'])
# Skip columns without a single observed value: their median is NaN, so there
# is nothing to impute (and nothing meaningful to cap or scale later).
num_cols = pd.Index(
    [col for col in numeric_frame.columns if numeric_frame[col].notna().any()]
)
cat_cols = df.select_dtypes(include=['object']).columns

# Median imputation for the numeric columns.
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [19]:
# Type casting: each target column is paired with the conversion applied to it.
# Values that cannot be parsed become missing (errors='coerce').
casters = {
    'Orbit Number': lambda values: pd.to_numeric(values, errors='coerce').astype('Int64'),
    'Julian Date': lambda values: pd.to_numeric(values, errors='coerce'),
    'Close Approach Date': lambda values: pd.to_datetime(values, errors='coerce'),
}
for column, cast in casters.items():
    df[column] = cast(df[column])


In [20]:
# Handle outliers (IQR method) ********** this part is optional
import numpy as np
def cap_outliers(series):
    """Clip a Series to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap.

    Returns
    -------
    pandas.Series
        A new Series in which values outside the fences are replaced by the
        nearest fence; the input is not modified.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    return series.clip(lower=q1 - 1.5 * spread, upper=q3 + 1.5 * spread)

# Cap every numeric column, then write the capped values back one by one.
capped_columns = {name: cap_outliers(df[name]) for name in num_cols}
for name, capped in capped_columns.items():
    df[name] = capped

In [21]:
# Cleaning the strings
# Identifier columns are only stripped: title-casing would corrupt
# case-sensitive designations (e.g. "2024 AB" -> "2024 Ab").
identifier_cols = {"des", "fullname", "Asteroid ID", "Asteroid Name"}
for col in cat_cols:
    # cat_cols was computed before the type-casting cell, so skip columns that
    # no longer exist or that have since been converted to a non-text dtype;
    # otherwise the parsed dates / integers would be turned back into strings.
    if col not in df.columns or df[col].dtype != object:
        continue
    text = df[col].astype(str).str.strip()
    df[col] = text if col in identifier_cols else text.str.title()

In [22]:
# Convert text labels to numeric codes so that they are easy for ML models to use
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# The rename cell labels the hazard flag 'Potentially Hazardous (Y/N)'; the
# shorter name is still accepted for frames that use it.
hazard_col = next(
    (
        name
        for name in ('Potentially Hazardous (Y/N)', 'Potentially Hazardous')
        if name in df.columns
    ),
    None,
)
if hazard_col is not None:
    # Normalise to title-cased text first so real booleans, mixed-case
    # strings and already-encoded 0/1 values (cell re-run) all map correctly.
    hazard_text = df[hazard_col].astype(str).str.strip().str.title()
    df[hazard_col] = hazard_text.map({
        'True': 1, 'Yes': 1, 'Y': 1, '1': 1,
        'False': 0, 'No': 0, 'N': 0, '0': 0
    }).fillna(0).astype(int)  # anything unrecognised (e.g. missing) counts as not hazardous

if 'Orbit Type' in df.columns:
    # Integer-encode the orbit class; `le` is kept so the codes can be mapped
    # back to class names with le.inverse_transform / le.classes_.
    le = LabelEncoder()
    df['Orbit Type'] = le.fit_transform(df['Orbit Type'])

In [23]:
# Scaling for ML: min-max scale the numeric features so that no feature
# dominates merely because of its units.
# num_cols was computed in an earlier cell, so keep only columns that still exist.
scale_cols = [col for col in num_cols if col in df.columns]
# MinMaxScaler raises ValueError when fitted on zero rows, so skip an empty frame.
if scale_cols and len(df) > 0:
    scaler = MinMaxScaler()
    df[scale_cols] = scaler.fit_transform(df[scale_cols])

In [None]:
# Risk_Score = diameter * relative velocity / distance.
risk_inputs = ['Estimated Diameter (km)', 'Relative Velocity (km/s)', 'Distance from Earth (AU)']

# Only touch the columns once we know all of them exist; coercing before the
# check would raise KeyError and make the guard pointless.
if all(col in df.columns for col in risk_inputs):
    # Ensure the inputs are numeric; unparseable values become NaN and
    # propagate into a NaN score for that row.
    for col in risk_inputs:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # NOTE(review): if the scaling cell has rescaled these columns, the score
    # is computed on the scaled values rather than physical units -- confirm
    # this is intended.
    df['Risk_Score'] = (
        df['Estimated Diameter (km)'] *
        df['Relative Velocity (km/s)'] /
        (df['Distance from Earth (AU)'] + 1e-6)  # epsilon avoids division by zero
    )


In [12]:
# EDA starts from this part onward

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns