<a href="https://colab.research.google.com/github/Nihadkaipalli/Predictive-Modeling-for-Asteroid-and-Interstellar-Object-Collision-Risk-Using-Machine-Learning/blob/main/Predictive_Modeling_for_Asteroid_Collision_Risk_Using_Machine_Learning_and_Orbital_Dynamics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Install Necessary Libraries**

In [2]:
!pip install dash pandas numpy matplotlib seaborn scikit-learn tensorflow

Collecting dash
  Downloading dash-2.18.2-py3-none-any.whl.metadata (10 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-2.18.2-py3-none-any.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Downloadi

In [57]:
# Import necessary libraries
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt


# **2. Load and Preprocess Dataset**

In [58]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##2.1 Load Dataset

In [64]:
# Load the dataset from Google Drive
file_path = '/content/drive/My Drive/Asteroid Collision Dataset.csv'
df = pd.read_csv(file_path, low_memory=False)
print("Dataset loaded successfully!")

# Inspect initial columns
print("Initial Columns:")
print(df.columns)

Dataset loaded successfully!
Initial Columns:
Index(['spkid', 'full_name', 'epoch', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'n',
       'tp', 'per_y', 'moid', 'moid_ld', 'moid_jup', 't_jup', 'sigma_e',
       'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w', 'sigma_ma',
       'sigma_tp', 'sigma_per', 'class', 'condition_code', 'H', 'G',
       'diameter', 'extent', 'albedo', 'rot_per', 'GM', 'BV', 'UB', 'IR',
       'spec_B', 'H_sigma', 'diameter_sigma', 'neo', 'pha', 'orbit_id',
       'epoch_mjd', 'epoch_cal', 'equinox', 'ad', 'tp_cal', 'per', 'sigma_ad',
       'sigma_n', 'data_arc', 'first_obs', 'last_obs', 'n_obs_used', 'rms',
       'two_body', 'A1_sigma', 'A2_sigma', 'A3_sigma', 'DT_sigma'],
      dtype='object')


## 2.2 Data Cleaning

In [66]:
# Drop irrelevant columns
df = df.drop(columns=['full_name', 'equinox', 'orbit_id'], errors='ignore')

# Clean and handle non-numeric columns
columns_to_clean = ['H', 'diameter', 'albedo', 'rot_per', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'n']
for col in columns_to_clean:
    df[col] = df[col].astype(str).str.extract(r'([-+]?\d*\.?\d+)')  # Extract numeric parts
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to float

##2.3 Handling missing values

In [67]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
df[columns_to_clean] = imputer.fit_transform(df[columns_to_clean])

##2.3 Normalizing features

In [68]:
# Normalize relevant columns
scaler = StandardScaler()
df[columns_to_clean] = scaler.fit_transform(df[columns_to_clean])

In [73]:
# Parse 'epoch_cal' with the correct format
df['epoch_cal'] = pd.to_datetime(df['epoch_cal'], format='%Y-%m-%d.%f', errors='coerce')

# Check for NaT values
print(f"Missing or invalid dates in 'epoch_cal': {df['epoch_cal'].isnull().sum()}")

# Handle NaT values, if necessary
df['epoch_cal'] = df['epoch_cal'].fillna(pd.Timestamp('1900-01-01'))


Missing or invalid dates in 'epoch_cal': 656017


##2.4 Create Target and Encode Features

In [74]:
# Encode binary columns
df['neo'] = df['neo'].map({'Y': 1, 'N': 0})
df['two_body'] = df['two_body'].map({'T': 1, 'F': 0})

# Convert date columns to datetime format
date_columns = ['epoch_cal', 'tp_cal', 'first_obs', 'last_obs']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Create numeric features from dates
df['obs_duration'] = (df['last_obs'] - df['first_obs']).dt.days
df['days_since_epoch'] = (pd.to_datetime('now') - df['epoch_cal']).dt.days

In [75]:
print(df['epoch_cal'].head(10))

0   1900-01-01
1   1900-01-01
2   1900-01-01
3   1900-01-01
4   1900-01-01
5   1900-01-01
6   1900-01-01
7   1900-01-01
8   1900-01-01
9   1900-01-01
Name: epoch_cal, dtype: datetime64[ns]


#**3. Feature Engineering**

##3.1 Derived Features

In [38]:
# Add derived features
df['aphelion'] = df['a'] * (1 + df['e'])
df['perihelion_ratio'] = df['q'] / df['a']

##3.2 Gravitational Influence

In [53]:
# Constants for gravitational influence calculation
G = 6.674e-11  # Gravitational constant (m^3 kg^-1 s^-2)
M_sun = 1.989e30  # Mass of the Sun (kg)
M_jupiter = 1.898e27  # Mass of Jupiter (kg)
M_earth = 5.972e24  # Mass of Earth (kg)
AU_to_m = 1.496e11  # Conversion from AU to meters
asteroid_density = 2500  # Average asteroid density (kg/m^3)

def calculate_asteroid_mass(diameter_km):
    radius_m = (diameter_km * 1000) / 2  # Convert km to meters
    volume = (4 / 3) * np.pi * radius_m**3
    asteroid_density = 2500  # Average density in kg/m^3
    return volume * asteroid_density

def gravitational_force(mass_asteroid, mass_body, distance_au):
    G = 6.674e-11  # Gravitational constant
    AU_to_m = 1.496e11  # Astronomical Unit to meters
    distance_m = distance_au * AU_to_m
    return (G * mass_asteroid * mass_body) / (distance_m**2)

def total_gravitational_influence(row):
    M_sun = 1.989e30  # Mass of the Sun
    M_jupiter = 1.898e27  # Mass of Jupiter
    M_earth = 5.972e24  # Mass of Earth
    try:
        mass_asteroid = calculate_asteroid_mass(row['diameter'])
        sun_force = gravitational_force(mass_asteroid, M_sun, row['a'])
        jupiter_force = gravitational_force(mass_asteroid, M_jupiter, abs(row['a'] - 5.2))
        earth_force = gravitational_force(mass_asteroid, M_earth, abs(row['a'] - 1))
        return sun_force + jupiter_force + earth_force
    except Exception:
        return np.nan

# Apply the function
if 'gravitational_influence' not in df.columns:
    df['gravitational_influence'] = df.apply(total_gravitational_influence, axis=1)

# Add orbital period feature
def orbital_period(semi_major_axis_au):
    G = 6.674e-11  # Gravitational constant
    M_sun = 1.989e30  # Mass of the Sun
    AU_to_m = 1.496e11  # Astronomical Unit to meters
    semi_major_axis_m = semi_major_axis_au * AU_to_m

    # Safeguard against invalid values
    try:
        if semi_major_axis_m <= 0:
            return np.nan  # Return NaN for invalid input
        return 2 * np.pi * np.sqrt((semi_major_axis_m**3) / (G * M_sun)) / (60 * 60 * 24)  # Convert seconds to days
    except Exception as e:
        print(f"Error calculating orbital period: {e}")
        return np.nan

# Apply the function
df['orbital_period_days'] = df['a'].apply(orbital_period)

In [54]:
print("Missing values in orbital_period_days:")
print(df['orbital_period_days'].isnull().sum())

# Drop rows with NaN values in orbital_period_days, if necessary
df = df.dropna(subset=['orbital_period_days'])


Missing values in orbital_period_days:
2


#**4. Exploratory Data Analysis (EDA)**

##4.1 Dataset Overview

In [40]:
# Dataset overview
print("Dataset Overview:")
print(df.head())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Dataset statistics
print("\nDataset Statistics:")
print(df.describe())

# Check data types
print("\nData Types:")
print(df.dtypes)

Dataset Overview:
     spkid          full_name      epoch         e         a         q  \
0  3246901          (1935 UZ)  2428097.5  0.251325  2.149638  1.609380   
1  3246902          (1937 CK)  2460600.5  0.138476  2.320503  1.999169   
2  3246903          (1939 RR)  2429540.5  0.253684  2.904675  2.167804   
3  3246904          (1942 RH)  2430612.5  0.145324  2.257216  1.929189   
4  3399586         (1960 SB1)  2460600.5  0.497848  2.980188  1.496508   

           i          om           w          ma  ...  class_ATE  class_HYA  \
0   4.728640  134.342070  281.563658  342.283793  ...      False      False   
1   6.549856  273.834058  130.418938   26.673482  ...      False      False   
2  13.139290  322.997330   64.837530  324.193410  ...      False      False   
3   4.223982  205.857738  134.532417    3.933410  ...      False      False   
4   9.579659  196.301529  224.419191  219.607359  ...      False      False   

   class_IEO  class_IMB  class_MBA  class_MCA  aphelion  perih

In [48]:
# Identify problematic columns with non-numeric values
for col in df.columns:
    if df[col].dtype == 'object':
        print(f"Non-numeric values found in column: {col}")
        print(df[col].unique()[:10])  # Display the first 10 unique values

# Strip spaces and clean non-numeric characters
columns_to_clean = ['H', 'diameter', 'albedo', 'rot_per', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'n']
for col in columns_to_clean:
    df[col] = df[col].astype(str).str.extract('([-+]?\d*\.?\d+)')  # Extract numeric parts
    df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert to float

# Drop irrelevant columns
df = df.drop(columns=['full_name', 'equinox', 'orbit_id'], errors='ignore')

# Encode 'spec_B' using one-hot encoding
df = pd.get_dummies(df, columns=['spec_B'], prefix='spec_B', drop_first=True)

# Encode binary columns
df['neo'] = df['neo'].map({'Y': 1, 'N': 0})
df['two_body'] = df['two_body'].map({'T': 1, 'F': 0})

# Convert date columns to datetime format
date_columns = ['epoch_cal', 'tp_cal', 'first_obs', 'last_obs']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Create numeric features from dates
df['obs_duration'] = (df['last_obs'] - df['first_obs']).dt.days
df['days_since_epoch'] = (pd.to_datetime('now') - df['epoch_cal']).dt.days

# Verify all numeric columns
df_cleaned = df.select_dtypes(include=[np.number])

In [56]:
# Visualization Enhancements

# Correlation Heatmap - Focused on Key Features
key_features = [
    'pha', 'a', 'e', 'i', 'q', 'om', 'w', 'ma',
    'diameter', 'albedo', 'gravitational_influence',
    'orbital_period_days', 'obs_duration', 'days_since_epoch'
]
plt.figure(figsize=(12, 10))
sns.heatmap(df[key_features].corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap (Key Features)")
plt.show()

# Gravitational Influence Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['gravitational_influence'], kde=True, bins=30, color='blue')
plt.title("Distribution of Gravitational Influence")
plt.xlabel("Gravitational Influence (N)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# Relationship between Semi-Major Axis and Gravitational Influence
plt.figure(figsize=(10, 6))
sns.scatterplot(x='a', y='gravitational_influence', hue='pha', palette='viridis', data=df)
plt.title("Gravitational Influence vs Semi-Major Axis")
plt.xlabel("Semi-Major Axis (AU)")
plt.ylabel("Gravitational Influence (N)")
plt.grid(True)
plt.legend(title="Potentially Hazardous Asteroid (PHA)", loc='upper right')
plt.show()

# Orbital Period vs Gravitational Influence
plt.figure(figsize=(10, 6))
sns.scatterplot(x='orbital_period_days', y='gravitational_influence', hue='pha', palette='coolwarm', data=df)
plt.title("Gravitational Influence vs Orbital Period")
plt.xlabel("Orbital Period (days)")
plt.ylabel("Gravitational Influence (N)")
plt.grid(True)
plt.legend(title="Potentially Hazardous Asteroid (PHA)", loc='upper right')
plt.show()

# Observation Duration vs Gravitational Influence
plt.figure(figsize=(10, 6))
sns.scatterplot(x='obs_duration', y='gravitational_influence', hue='pha', palette='plasma', data=df)
plt.title("Observation Duration vs Gravitational Influence")
plt.xlabel("Observation Duration (days)")
plt.ylabel("Gravitational Influence (N)")
plt.grid(True)
plt.legend(title="Potentially Hazardous Asteroid (PHA)", loc='upper right')
plt.show()


ValueError: could not convert string to float: 'N'

<Figure size 1200x1000 with 0 Axes>

In [49]:
# Visualization Enhancements
# Correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Gravitational influence distribution
sns.histplot(df['gravitational_influence'], kde=True, bins=30)
plt.title("Distribution of Gravitational Influence")
plt.xlabel("Gravitational Influence (N)")
plt.ylabel("Frequency")
plt.show()

# Relationship between semi-major axis and gravitational influence
sns.scatterplot(x='a', y='gravitational_influence', hue='pha', data=df)
plt.title("Gravitational Influence vs Semi-Major Axis")
plt.xlabel("Semi-Major Axis (AU)")
plt.ylabel("Gravitational Influence (N)")
plt.show()

ValueError: could not convert string to float: 'MCA'

<Figure size 1200x1000 with 0 Axes>

##4.2 Distribution of Numerical Features

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data), columns=numerical_data.columns)

# Re-run Isolation Forest on imputed data
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(numerical_data_imputed)
outlier_predictions = iso_forest.predict(numerical_data_imputed)

# Add predictions to the original DataFrame
df['iso_outlier'] = (outlier_predictions == -1)
print(f"Number of outliers detected: {df['iso_outlier'].sum()}")






ValueError: Shape of passed values is (656017, 47), indices imply (656017, 54)