In [None]:
#Downloading dataset
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!kaggle datasets download -d brsdincer/asteroid-classification-for-hazardous-prediction

asteroid-classification-for-hazardous-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#Unzipping the dataset file
import zipfile
zip_ref = zipfile.ZipFile('/content/asteroid-classification-for-hazardous-prediction.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

# Data Cleaning

In [None]:
import pandas as pd
import numpy as np

In [31]:
df=pd.read_csv('/content/Asteroid_Updated.csv')

  df=pd.read_csv('/content/Asteroid_Updated.csv')


In [None]:
df

Unnamed: 0,name,a,e,i,om,w,q,ad,per_y,data_arc,...,UB,IR,spec_B,spec_T,G,moid,class,n,per,ma
0,Ceres,2.769165,0.076009,10.594067,80.305532,73.597694,2.558684,2.979647,4.608202,8822.0,...,0.426,,C,G,0.12,1.594780,MBA,0.213885,1683.145708,77.372096
1,Pallas,2.772466,0.230337,34.836234,173.080063,310.048857,2.133865,3.411067,4.616444,72318.0,...,0.284,,B,B,0.11,1.233240,MBA,0.213503,1686.155999,59.699133
2,Juno,2.669150,0.256942,12.988919,169.852760,248.138626,1.983332,3.354967,4.360814,72684.0,...,0.433,,Sk,S,0.32,1.034540,MBA,0.226019,1592.787285,34.925016
3,Vesta,2.361418,0.088721,7.141771,103.810804,150.728541,2.151909,2.570926,3.628837,24288.0,...,0.492,,V,V,0.32,1.139480,MBA,0.271609,1325.432765,95.861936
4,Astraea,2.574249,0.191095,5.366988,141.576605,358.687607,2.082324,3.066174,4.130323,63507.0,...,0.411,,S,S,,1.095890,MBA,0.238632,1508.600458,282.366289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839709,,2.812945,0.664688,4.695700,183.310012,234.618352,0.943214,4.682676,4.717914,17298.0,...,,,,,,0.032397,APO,0.208911,1723.217927,156.905910
839710,,2.645238,0.259376,12.574937,1.620020,339.568072,1.959126,3.331350,4.302346,16.0,...,,,,,,0.956145,MBA,0.229090,1571.431965,13.366251
839711,,2.373137,0.202053,0.732484,176.499082,198.026527,1.893638,2.852636,3.655884,5.0,...,,,,,,0.893896,MBA,0.269600,1335.311579,355.351127
839712,,2.260404,0.258348,9.661947,204.512448,148.496988,1.676433,2.844376,3.398501,10.0,...,,,,,,0.680220,MBA,0.290018,1241.302609,15.320134


In [None]:
df.columns

Index(['name', 'a', 'e', 'i', 'om', 'w', 'q', 'ad', 'per_y', 'data_arc',
       'condition_code', 'n_obs_used', 'H', 'neo', 'pha', 'diameter', 'extent',
       'albedo', 'rot_per', 'GM', 'BV', 'UB', 'IR', 'spec_B', 'spec_T', 'G',
       'moid', 'class', 'n', 'per', 'ma'],
      dtype='object')

# Columns according to the dataset:
a- semi-major axis(au)

e- eccentricity

i- Inclination with respect to x-y ecliptic plane(deg)

om- Longitude of the ascending node

w- argument of perihelion

q- argument of perihelion

ad- aphelion distance(au)

per-y- Orbital period(YEARS)

data_arc- data arc range (it is not important to calculate)

condition_code- condition code

n_obs_used- (it is not important to calculate)

H- Absolute Magnitude parameter

neo- Near Earth Object / N or Y

pha- Physically Hazardous Asteroid - N or Y

diameter

extent

albedo- albedo value

rot_per- rotation period


BV- Color

UB- Color

spec_B- Class Type Speci.

spec_T- Class Type General


moid- Earth Minimum orbit Intersection Distance(au)

class- Class

n- Mean motion(deg/d)

per- orbital Period(d)

ma- Mean anomaly(deg)



In [None]:
#analyzing datatypes of columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

numeric_df = df.select_dtypes(include=numerics)
len(numeric_df.columns)

22

In [None]:
#missing values
missing_percentages = df.isna().sum().sort_values(ascending=False)
missing_percentages

IR                839713
GM                839700
extent            839696
G                 839595
UB                838735
spec_T            838734
BV                838693
spec_B            838048
rot_per           820918
name              817747
albedo            703305
diameter          702078
pha                16442
moid               16442
data_arc           15474
H                   2689
condition_code       867
ma                     8
ad                     6
neo                    6
per                    6
a                      2
n                      2
per_y                  1
n_obs_used             0
w                      0
om                     0
i                      0
e                      0
class                  0
q                      0
dtype: int64

In [32]:
#Dropping the columns with too many missing values
columns_to_drop = ['IR', 'GM', 'extent', 'G', 'UB', 'spec_T', 'BV', 'spec_B', 'rot_per', 'name', 'albedo', 'diameter','condition_code']
df=df.drop(columns=columns_to_drop)

In [33]:
df=df.dropna(subset=['pha','H','data_arc','ma','ad','neo','per','n','a','per_y'])

In [34]:
#missing values
missing_percentages = df.isna().sum().sort_values(ascending=False)
missing_percentages

a             0
e             0
per           0
n             0
class         0
moid          0
pha           0
neo           0
H             0
n_obs_used    0
data_arc      0
per_y         0
ad            0
q             0
w             0
om            0
i             0
ma            0
dtype: int64

In [38]:
df['neo'] = df['neo'].replace({'Y': 1, 'N': 0})
df['pha'] = df['pha'].replace({'Y': 1, 'N': 0})

In [36]:
df['class'].unique()

array(['MBA', 'OMB', 'MCA', 'AMO', 'IMB', 'TJN', 'CEN', 'APO', 'ATE',
       'AST', 'TNO', 'IEO'], dtype=object)

In [37]:
class_mapping = {
    'MBA': 0,
    'OMB': 1,
    'MCA': 2,
    'AMO': 3,
    'IMB': 4,
    'TJN': 5,
    'CEN': 6,
    'APO': 7,
    'ATE': 8,
    'AST': 9,
    'TNO': 10,
    'IEO': 11
}
df['class'] = df['class'].replace(class_mapping)

In [40]:
df.info

<bound method DataFrame.info of                a         e          i          om           w         q  \
0       2.769165  0.076009  10.594067   80.305532   73.597694  2.558684   
1       2.772466  0.230337  34.836234  173.080063  310.048857  2.133865   
2       2.669150  0.256942  12.988919  169.852760  248.138626  1.983332   
3       2.361418  0.088721   7.141771  103.810804  150.728541  2.151909   
4       2.574249  0.191095   5.366988  141.576605  358.687607  2.082324   
...          ...       ...        ...         ...         ...       ...   
839709  2.812945  0.664688   4.695700  183.310012  234.618352  0.943214   
839710  2.645238  0.259376  12.574937    1.620020  339.568072  1.959126   
839711  2.373137  0.202053   0.732484  176.499082  198.026527  1.893638   
839712  2.260404  0.258348   9.661947  204.512448  148.496988  1.676433   
839713  2.546442  0.287672   5.356238   70.709555  273.483265  1.813901   

              ad     per_y  data_arc  n_obs_used       H  neo  pha 