### Importing necessary libraries

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn import metrics
from imblearn.over_sampling import SMOTE

from lightgbm import LGBMClassifier

### Loading the data

In [2]:
df = pd.read_csv(r'data/neo_v2.csv')

### Relabeling the hazardous class to int (to replace boolean values with integers)

In [3]:
df['hazardous'] = df['hazardous'].astype('int')

### Dropping unnecessary columns

In [4]:
df.drop(['id', 'name', 'orbiting_body', 'sentry_object'], axis = 1, inplace = True)

In [5]:
df

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,1.198271,2.679415,13569.249224,5.483974e+07,16.73,0
1,0.265800,0.594347,73588.726663,6.143813e+07,20.00,1
2,0.722030,1.614507,114258.692129,4.979872e+07,17.83,0
3,0.096506,0.215794,24764.303138,2.543497e+07,22.20,0
4,0.255009,0.570217,42737.733765,4.627557e+07,20.09,1
...,...,...,...,...,...,...
90831,0.026580,0.059435,52078.886692,1.230039e+07,25.00,0
90832,0.016771,0.037501,46114.605073,5.432121e+07,26.00,0
90833,0.031956,0.071456,7566.807732,2.840077e+07,24.60,0
90834,0.007321,0.016370,69199.154484,6.869206e+07,27.80,0


### Preprocessing the data

In [6]:
df_cols = list(df.columns[df.columns != 'hazardous'])

In [7]:
quan_pipeline = Pipeline([
    ('quan_scaler', RobustScaler())
])

In [8]:
processed_data = df.copy()
processed_data[df_cols] = quan_pipeline.fit_transform(df[df_cols])
processed_data['hazardous'] = df['hazardous']

In [9]:
processed_data

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,9.262476,9.262476,-0.892617,0.431976,-1.598624,0
1,1.751419,1.751419,0.856988,0.599711,-0.848624,1
2,5.426350,5.426350,2.042543,0.303831,-1.346330,0
3,0.387756,0.387756,-0.566275,-0.315510,-0.344037,0
4,1.664495,1.664495,-0.042338,0.214270,-0.827982,1
...,...,...,...,...,...,...
90831,-0.175500,-0.175500,0.229963,-0.649399,0.298165,0
90832,-0.254512,-0.254512,0.056100,0.418795,0.527523,0
90833,-0.132194,-0.132194,-1.067592,-0.240118,0.206422,0
90834,-0.330633,-0.330633,0.729029,0.784111,0.940367,0


### Generating synthetic data

In [10]:
train_data = processed_data.copy()

In [11]:
X = train_data.drop(['hazardous'], axis = 1)
y = train_data['hazardous']

In [12]:
sampling_strategy = {1: 8840*2}

In [13]:
over_sampler = SMOTE(random_state=0, sampling_strategy=sampling_strategy)

In [14]:
X_os, y_os = over_sampler.fit_resample(X, y)

In [15]:
resampled_data = X_os.copy()
resampled_data['hazardous'] = y_os.values

In [16]:
resampled_data.hazardous.value_counts()

0    81996
1    17680
Name: hazardous, dtype: int64

#### Keeping only the hazardous class of the oversampled data

In [17]:
resampled_minor_data = resampled_data.loc[ resampled_data['hazardous']==1 ].copy()

In [18]:
resampled_minor_data.drop(['hazardous'], axis = 1, inplace = True)

In [19]:
train_data_minor = train_data.loc[ train_data['hazardous']==1 ].copy()

In [20]:
resampled_minor_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17680 entries, 1 to 99675
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   est_diameter_min    17680 non-null  float64
 1   est_diameter_max    17680 non-null  float64
 2   relative_velocity   17680 non-null  float64
 3   miss_distance       17680 non-null  float64
 4   absolute_magnitude  17680 non-null  float64
dtypes: float64(5)
memory usage: 828.8 KB


In [21]:
train_data_minor.drop(['hazardous'], axis = 1, inplace = True)

### Keeping only the synthetic data

In [22]:
minority_synth_data = pd.concat([resampled_minor_data,train_data_minor]).drop_duplicates(keep=False)

In [23]:
minority_synth_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8839 entries, 90836 to 99675
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   est_diameter_min    8839 non-null   float64
 1   est_diameter_max    8839 non-null   float64
 2   relative_velocity   8839 non-null   float64
 3   miss_distance       8839 non-null   float64
 4   absolute_magnitude  8839 non-null   float64
dtypes: float64(5)
memory usage: 414.3 KB


In [24]:
min_synth_cols = list(minority_synth_data.columns)

#### Performing inverse transform of the pre-processing

In [25]:
minority_synth_data[min_synth_cols] = quan_pipeline.inverse_transform(minority_synth_data)

In [26]:
minority_synth_data

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude
90836,0.152952,0.342011,57969.999476,5.741332e+07,21.200000
90837,0.523328,1.170198,43538.363931,5.197663e+06,18.528906
90838,0.457272,1.022490,60885.607516,3.190762e+07,18.822249
90839,0.106652,0.238482,36990.220336,2.249528e+07,21.983250
90840,0.152952,0.342011,57994.093289,5.638625e+07,21.200000
...,...,...,...,...,...
99671,0.256285,0.573071,53002.367806,7.578642e+06,20.079363
99672,0.555335,1.241767,93209.710181,6.881741e+06,18.400000
99673,0.280189,0.626522,36580.476271,2.123283e+07,19.885535
99674,0.145109,0.324474,65729.739149,5.935945e+07,21.314322


In [27]:
minority_synth_data.dropna(axis =0, inplace = True)

In [28]:
minority_synth_data['hazardous'] = 1

#### Selecting only the hazardous class of the real data

In [29]:
minority_real_data = df.loc[ df['hazardous']==1 ].copy()

In [30]:
minority_real_data.shape

(8840, 6)

In [31]:
minority_real_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8840 entries, 1 to 90818
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   est_diameter_min    8840 non-null   float64
 1   est_diameter_max    8840 non-null   float64
 2   relative_velocity   8840 non-null   float64
 3   miss_distance       8840 non-null   float64
 4   absolute_magnitude  8840 non-null   float64
 5   hazardous           8840 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 483.4 KB


In [32]:
minority_synth_data.shape

(8839, 6)

### Creating new labels for adversarial validation

In [33]:
minority_real_data.drop(['hazardous'], axis = 1, inplace = True)
minority_synth_data.drop(['hazardous'], axis = 1, inplace = True)

In [34]:
minority_real_data['is_synth'] = 0
minority_synth_data['is_synth'] = 1

### Merging the real and synthetic data

In [35]:
combined_df = pd.concat([minority_real_data, minority_synth_data], ignore_index=True)

### X and y for training the model

In [36]:
X = combined_df.drop(['is_synth'], axis = 1)
y = combined_df['is_synth']

In [37]:
y.value_counts()

0    8840
1    8839
Name: is_synth, dtype: int64

### Splitting train and test splits

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

### Scaling the data

In [39]:
processed_data = quan_pipeline.fit_transform(X_train)

In [40]:
processed_test_data = quan_pipeline.transform(X_test)

### Training the model

In [41]:
model = LGBMClassifier(random_state = 0)

In [42]:
model.fit(processed_data, y_train)

LGBMClassifier(random_state=0)

### Evaluating the model

In [43]:
model.score(processed_test_data, y_test)

0.6866515837104072

In [44]:
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(processed_test_data)))

              precision    recall  f1-score   support

           0       0.68      0.71      0.69      1768
           1       0.70      0.66      0.68      1768

    accuracy                           0.69      3536
   macro avg       0.69      0.69      0.69      3536
weighted avg       0.69      0.69      0.69      3536

