In [3]:
# Mount Google Drive (if using Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import pandas as pd
import numpy as np

# Load the dataset
file_path = '/content/drive/My Drive/Asteroid Collision Dataset V2.csv'
df = pd.read_csv(file_path, low_memory=False)

# Inspect the dataset
print("Dataset Shape:", df.shape)
print("First 5 Rows:\n", df.head())


Dataset Shape: (677494, 79)
First 5 Rows:
    diameter  extent  albedo  rot_per  GM  BV  UB  IR spec_B  spec_T  ...  \
0       NaN     NaN     NaN      NaN NaN NaN NaN NaN    NaN     NaN  ...   
1       NaN     NaN     NaN      NaN NaN NaN NaN NaN    NaN     NaN  ...   
2       NaN     NaN     NaN      NaN NaN NaN NaN NaN    NaN     NaN  ...   
3       NaN     NaN     NaN      NaN NaN NaN NaN NaN    NaN     NaN  ...   
4       NaN     NaN     NaN      NaN NaN NaN NaN NaN    NaN     NaN  ...   

        rms  two_body  A1  A1_sigma  A2  A2_sigma  A3  A3_sigma  DT  DT_sigma  
0  0.000533       NaN NaN       NaN NaN       NaN NaN       NaN NaN       NaN  
1  0.071263       NaN NaN       NaN NaN       NaN NaN       NaN NaN       NaN  
2  0.000002       NaN NaN       NaN NaN       NaN NaN       NaN NaN       NaN  
3       NaN         T NaN       NaN NaN       NaN NaN       NaN NaN       NaN  
4  0.339380       NaN NaN       NaN NaN       NaN NaN       NaN NaN       NaN  

[5 rows x 79 column

# 3. Data Cleaning and Preprocessing

## 3.1. Missing Data Analysis

In [4]:
# Analyze missing data
missing_summary = df.isnull().mean().sort_values(ascending=False) * 100
missing_summary = missing_summary[missing_summary > 0]
print("\nMissing Data Summary (Percentage of Missing Values):")
print(missing_summary)


Missing Data Summary (Percentage of Missing Values):
DT_sigma          100.000000
extent            100.000000
GM                100.000000
BV                100.000000
spec_T            100.000000
IR                100.000000
K1                100.000000
K2                100.000000
PC                100.000000
DT                100.000000
UB                100.000000
M1                100.000000
G                 100.000000
M2                100.000000
name               99.999852
prefix             99.999410
A3                 99.998967
A3_sigma           99.998967
A1                 99.997491
A1_sigma           99.997491
spec_B             99.992620
A2                 99.976531
A2_sigma           99.976531
n_del_obs_used     99.910700
n_dop_obs_used     99.910700
rot_per            99.801179
two_body           99.062427
albedo             99.040582
diameter_sigma     98.987002
diameter           98.986559
H_sigma            15.064635
sigma_per           0.937868
sigma_ad          

## 3.2. Drop Columns with Excessive Missing Values

In [5]:
# Drop columns with >90% missing values or irrelevant
cols_to_drop_missing = missing_summary[missing_summary > 90].index.tolist()
irrelevant_columns = ['producer', 'equinox', 'orbit_id', 'pdes', 'full_name', 'name', 'prefix']
cols_to_drop = list(set(cols_to_drop_missing + irrelevant_columns))

df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
print("\nDropped Columns:")
print(df.head())


Dropped Columns:
 - PC
 - A2
 - M2
 - name
 - G
 - n_dop_obs_used
 - orbit_id
 - producer
 - K1
 - K2
 - diameter
 - GM
 - two_body
 - A1_sigma
 - diameter_sigma
 - pdes
 - A2_sigma
 - A1
 - spec_T
 - full_name
 - equinox
 - A3_sigma
 - albedo
 - spec_B
 - A3
 - M1
 - n_del_obs_used
 - UB
 - prefix
 - rot_per
 - DT_sigma
 - extent
 - DT
 - IR
 - BV


In [6]:
print(df.head())

          a         e          i          om           w         q        ad  \
0  3.344072  0.333618  17.631497  191.717418  341.109523  2.228428  4.459715   
1  2.149638  0.251325   4.728640  134.342070  281.563658  1.609380  2.689895   
2  2.320503  0.138476   6.549856  273.834058  130.418938  1.999169  2.641838   
3  2.904675  0.253684  13.139290  322.997330   64.837530  2.167804  3.641545   
4  2.257216  0.145324   4.223982  205.857738  134.532417  1.929189  2.585243   

      per_y  data_arc  condition_code  ...  sigma_w  sigma_ma  sigma_ad  \
0  6.115353      34.0             8.0  ...   1.1161   0.47857  0.032768   
1  3.151781       4.0             9.0  ...  33.7820  11.31700  0.435020   
2  3.534936      31.0             7.0  ...   2.0890  23.32100  0.004895   
3  4.950561      27.0             NaN  ...      NaN       NaN       NaN   
4  3.391313      10.0             9.0  ...  10.3150   8.20790  0.094289   

    sigma_n sigma_tp  sigma_per  class   first_obs    last_obs      

In [7]:
# Missing Data Analysis
missing_summary = df.isnull().mean().sort_values(ascending=False) * 100
missing_summary = missing_summary[missing_summary > 0]
print("\nMissing Data Summary (Percentage of Missing Values):")
print(missing_summary)


Missing Data Summary (Percentage of Missing Values):
H_sigma           15.064635
sigma_per          0.937868
sigma_ad           0.937868
pha                0.937720
moid_ld            0.937573
moid_jup           0.937573
sigma_tp           0.937573
moid               0.937573
sigma_w            0.937573
sigma_om           0.937573
sigma_q            0.937573
sigma_i            0.937573
sigma_a            0.937573
sigma_e            0.937573
sigma_ma           0.937573
sigma_n            0.937573
data_arc           0.392771
H                  0.146717
condition_code     0.001033
per                0.000295
per_y              0.000295
neo                0.000295
t_jup              0.000295
ad                 0.000295
rms                0.000148
dtype: float64


## 3.3. Convert Columns to Numeric

In [8]:
from sklearn.impute import SimpleImputer

# Convert columns to numeric where possible
columns_to_clean = ['H', 'e', 'a', 'q', 'i', 'om', 'w', 'ma', 'n']
for col in columns_to_clean:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

## 3.4. Impute Missing Values

In [10]:
# Impute missing values in numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# 3. Feature Engineering

## 3.1. Binary Mapping

In [None]:
# Map binary columns
binary_mapping = {'Y': 1, 'N': 0}
for col in ['neo', 'pha']:
    if col in df.columns:
        df[col] = df[col].map(binary_mapping)

## 3.2. Handle Dates

In [None]:
# Handle date columns
date_cols = ['epoch_cal', 'tp_cal', 'first_obs', 'last_obs']
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce', format='%Y-%m-%d')
        df[f'{col}_year'] = df[col].dt.year
        df[f'{col}_month'] = df[col].dt.month
        df[f'{col}_day'] = df[col].dt.day
        df.drop(columns=[col], inplace=True)

In [None]:
remaining_categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("\nRemaining Categorical Columns after Date conversion:", remaining_categorical_cols)

## 3.3. One-Hot Encoding

In [None]:
#Identify Remaining Categorical Columns
remaining_categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("\nRemaining Categorical Columns after Binary Mapping:", remaining_categorical_cols)

In [None]:
# One-hot encode the 'class' column
if 'class' in df.columns:
    df = pd.get_dummies(df, columns=['class'], prefix='class', drop_first=True)

In [None]:
# Check unique values in the 'class' column
if 'class' in df.columns:
    print("Unique values in 'class' column:", df['class'].unique())

## 3.4. Derive New Features

In [21]:
GRAVITATIONAL_CONSTANT = 39.478

# Add derived features
df['relative_orbital_distance'] = np.abs(df['q'] - df['a'] * (1 - df['e']))
df['orbital_velocity'] = np.sqrt(GRAVITATIONAL_CONSTANT / df['a'])
df['adjusted_eccentricity'] = df['e'] * df['a'] / df['q']

# Perturbed features
for col in ['a', 'e', 'q', 'i']:
    sigma_col = f'sigma_{col}'
    if sigma_col in df.columns:
        df[f'{col}_perturbed'] = df[col] + np.random.normal(0, df[sigma_col])

  result = getattr(ufunc, method)(*inputs, **kwargs)


#4. Define Features and Target

## 4.1. Create Collision Risk

In [None]:
# Create collision_risk based on MOID
df['collision_risk'] = np.where(df['moid'] < 0.05, 1, 0)

In [None]:
# Check if collision_risk was created successfully
print(df[['moid', 'collision_risk']])

## 4.2. Define Features (X) and Target (y)

In [None]:
# Define features and target
features_to_drop = ['moid', 'moid_ld']  # Exclude MOID-related features
X = df.drop(columns=features_to_drop + ['collision_risk'])
y = df['collision_risk']

# Ensure only numeric features
X = X.select_dtypes(include=[np.number])

# 5. Data Splitting and Scaling

## 5.1. Split Dataset

In [22]:
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

NameError: name 'X' is not defined

## 5.2 Reimpute Missing values

In [None]:

# Re-impute missing values in training and testing sets
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

## 5.3. Resample Using SMOTE

In [22]:
!pip install imbalanced-learn



In [24]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Resampled Training Set Shape:", X_train_resampled.shape, y_train_resampled.shape)

ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Verify Resampling

In [None]:
# Verify class distribution
print("Class Distribution After Resampling:\n", pd.Series(y_train_resampled).value_counts())

## Check for Non-Numeric Data:

In [None]:
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
print("Non-Numeric Columns in X_train:", non_numeric_cols)

## Check for Missing Values in the Dataset

In [None]:
print("Missing Values in the Entire Dataset:\n", df.isnull().sum())

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import matplotlib.pyplot as plt

In [22]:
# Check class distribution
class_distribution = y.value_counts(normalize=True) * 100
print("Class Distribution (%):\n", class_distribution)

Class Distribution (%):
 collision_risk
0    97.280271
1     2.719729
Name: proportion, dtype: float64


## Drop moid and moid_ld From Features

In [23]:
# Remove MOID-related features
X_no_moid = X.drop(columns=['moid', 'moid_ld'])

# Verify the shape of the new feature matrix
print("Feature Matrix Shape Without MOID:", X_no_moid.shape)

Feature Matrix Shape Without MOID: (677494, 33)


In [24]:
# Derived Features Implementation

# 1. Relative Orbital Distance
X_no_moid['relative_orbital_distance'] = np.abs(X['q'] - X['a'] * (1 - X['e']))

# 2. Orbital Velocity
GRAVITATIONAL_CONSTANT = 39.478  # Gravitational constant in AU^3 / day^2
X_no_moid['orbital_velocity'] = np.sqrt(GRAVITATIONAL_CONSTANT / X['a'])

# 3. Adjusted Eccentricity
X_no_moid['adjusted_eccentricity'] = X['e'] * X['a'] / X['q']

# 4. Perturbed Parameters
for col in ['a', 'e', 'q', 'i']:
    sigma_col = f'sigma_{col}'
    if sigma_col in X.columns:
        X_no_moid[f'{col}_perturbed'] = X[col] + np.random.normal(0, X[sigma_col])

print("Derived features added to the dataset.")

Derived features added to the dataset.


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [25]:
# Add derived features to the resampled training and testing sets
derived_features = ['relative_orbital_distance', 'orbital_velocity', 'adjusted_eccentricity'] + \
                   [f'{col}_perturbed' for col in ['a', 'e', 'q', 'i'] if f'{col}_perturbed' in X_no_moid.columns]

X_train_no_moid[derived_features] = X_no_moid.loc[X_train_no_moid.index, derived_features]
X_test_no_moid[derived_features] = X_no_moid.loc[X_test_no_moid.index, derived_features]

NameError: name 'X_train_no_moid' is not defined

## Resample the Dataset Using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Initialize SMOTE
smote = SMOTE(random_state=42)

In [None]:
# Resample with SMOTE after adding derived features
X_resampled_no_moid, y_resampled_no_moid = smote.fit_resample(X_train_no_moid, y_train)

print("Resampled dataset includes derived features.")

## Train the Random Forest Model

## 6.1. Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1
)
rf_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

## Predictions on the Original Test Set

In [None]:
# Predict and evaluate
y_pred = rf_model.predict(X_test)

## Classification Report

In [None]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

## Confusion Matrix

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance Analysis

In [None]:
# Feature importance analysis
import pandas as pd
import matplotlib.pyplot as plt

# Feature importance analysis
feature_importance_derived = pd.DataFrame({
    'Feature': X_train_no_moid.columns,
    'Importance': rf_model_derived.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display top 10 features
print("Top 10 Most Important Features With Derived Features:")
print(feature_importance_derived.head(10))

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_derived['Feature'][:10], feature_importance_derived['Importance'][:10])
plt.gca().invert_yaxis()
plt.title("Top 10 Feature Importances With Derived Features")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()
