In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:

# Load the raw dataset from '24.csv'; the path is resolved relative to the working directory.
original_dataframe = pd.read_csv('24.csv')

In [3]:

def remove_outliers_iqr(df_orig):
  """Drop every row that holds an IQR outlier in at least one numeric column.

  A value counts as an outlier when it is below Q1 - 1.5 * IQR or above
  Q3 + 1.5 * IQR, with the quartiles computed per numeric column.

  Args:
    df_orig: Input DataFrame. It is copied first and never modified.

  Returns:
    A new DataFrame with the surviving rows; numeric columns come first,
    followed by the non-numeric columns.
  """
  frame = df_orig.copy()

  numeric_part = frame.select_dtypes(include=['number']).select_dtypes(exclude=['bool'])
  other_part = frame.select_dtypes(exclude=['number'])

  first_quartile = numeric_part.quantile(0.25)
  third_quartile = numeric_part.quantile(0.75)
  fence_width = 1.5 * (third_quartile - first_quartile)

  # Flag each cell that falls outside its column's fences.
  too_low = numeric_part.lt(first_quartile - fence_width)
  too_high = numeric_part.gt(third_quartile + fence_width)
  row_has_outlier = (too_low | too_high).any(axis=1)

  # Keep the clean rows and re-attach the matching non-numeric columns.
  kept_numeric = numeric_part.loc[~row_has_outlier]
  kept_other = other_part.loc[kept_numeric.index]
  return pd.concat([kept_numeric, kept_other], axis=1)

In [4]:
def remove_outliers_zscore(df_orig, z_score_threshold = 3):
  """Drop rows whose absolute z-score reaches the threshold in any numeric column.

  Z-scores are computed per numeric column. A z-score that cannot be computed
  (NaN, e.g. for a constant column or a missing value) is treated as "not an
  outlier", so such columns no longer cause every row to be removed.

  Args:
    df_orig: Input DataFrame. It is copied first and never modified.
    z_score_threshold: Rows with |z| >= this value in any numeric column are dropped.

  Returns:
    A new DataFrame with the surviving rows; numeric columns come first,
    followed by the non-numeric columns.
  """
  df = df_orig.copy()

  df_numeric = df.select_dtypes(include=['number']).select_dtypes(exclude=['bool'])
  df_non_numeric = df.select_dtypes(exclude=['number'])

  # nan_policy='omit' keeps one missing value from turning the whole column's z-scores into NaN.
  z_scores = df_numeric.apply(zscore, nan_policy='omit')

  # Flag outliers explicitly: a NaN z-score compares False and therefore keeps the row.
  is_outlier = (z_scores.abs() >= z_score_threshold).any(axis=1)
  df_z = df_numeric[~is_outlier]
  df_cleaned = pd.concat([df_z, df_non_numeric.loc[df_z.index]], axis=1)

  return df_cleaned

In [5]:

def remove_highly_correlated(df_cleaned_orig, target_col="Hazardous", high_corr_threshold = 0.99):
  """Drop one column out of every pair of highly correlated numeric columns.

  For each pair whose absolute correlation is at least `high_corr_threshold`,
  the column that appears earlier in the frame is dropped. The target column
  is excluded from the comparison, so it is never a drop candidate.

  Args:
    df_cleaned_orig: Input DataFrame. It is copied first and never modified.
    target_col: Name of the label column to protect.
    high_corr_threshold: Absolute correlation at or above which a pair counts as redundant.

  Returns:
    A new DataFrame without the redundant columns.
  """
  frame = df_cleaned_orig.copy()
  candidates = frame.select_dtypes(include=['number']).select_dtypes(exclude=['bool'])

  # A numeric target is taken out of the comparison and written back at the end.
  target_is_numeric = target_col in candidates.columns
  if target_is_numeric:
    saved_target = frame[target_col]
    candidates = candidates.drop(columns=[target_col])

  corr = candidates.corr()
  strong = np.abs(corr.to_numpy()) >= high_corr_threshold
  # Strictly-lower triangle: entry (i, j) with j < i marks column j for removal.
  below_diagonal = np.tril(strong, k=-1)
  redundant = corr.columns[below_diagonal.any(axis=0)]

  reduced = frame.drop(columns=redundant)
  if target_is_numeric:
    reduced[target_col] = saved_target

  return reduced

In [6]:

def normalize_minmax(df_train, df_test, df_valid):
  """Min-max scale the numeric columns of three splits using training statistics.

  A MinMaxScaler with default settings is fitted on the numeric columns of
  `df_train` and then applied to the same columns of all three frames. Every
  numeric column is scaled, so a numeric label column present in the frames is
  scaled as well.

  The inputs are copied, so the caller's frames (typically slices produced by
  a train/test split) are not modified.

  Args:
    df_train: Training split; the scaler is fitted on it.
    df_test: Test split.
    df_valid: Validation split.

  Returns:
    Tuple `(train, test, valid)` of scaled copies, in that order.
  """
  scaler = MinMaxScaler()

  # The columns the scaler is fitted on must also be the ones transformed in
  # the other splits, so take the column list from the training frame once.
  numeric_cols = df_train.select_dtypes(include=['number']).select_dtypes(exclude=['bool']).columns

  scaled_train = df_train.copy()
  scaled_test = df_test.copy()
  scaled_valid = df_valid.copy()

  scaled_train[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])
  scaled_test[numeric_cols] = scaler.transform(df_test[numeric_cols])
  scaled_valid[numeric_cols] = scaler.transform(df_valid[numeric_cols])

  return scaled_train, scaled_test, scaled_valid

In [7]:

def standardize_data(df_train, df_test, df_valid):
  """Standardize the numeric columns of three splits using training statistics.

  A StandardScaler with default settings is fitted on the numeric columns of
  `df_train` and then applied to the same columns of all three frames. Every
  numeric column is scaled, so a numeric label column present in the frames is
  scaled as well.

  The inputs are copied, so the caller's frames (typically slices produced by
  a train/test split) are not modified.

  Args:
    df_train: Training split; the scaler is fitted on it.
    df_test: Test split.
    df_valid: Validation split.

  Returns:
    Tuple `(train, test, valid)` of scaled copies, in that order.
  """
  scaler = StandardScaler()

  # The columns the scaler is fitted on must also be the ones transformed in
  # the other splits, so take the column list from the training frame once.
  numeric_cols = df_train.select_dtypes(include=['number']).select_dtypes(exclude=['bool']).columns

  scaled_train = df_train.copy()
  scaled_test = df_test.copy()
  scaled_valid = df_valid.copy()

  scaled_train[numeric_cols] = scaler.fit_transform(df_train[numeric_cols])
  scaled_test[numeric_cols] = scaler.transform(df_test[numeric_cols])
  scaled_valid[numeric_cols] = scaler.transform(df_valid[numeric_cols])

  return scaled_train, scaled_test, scaled_valid

In [8]:

def label_encode_categorical(df_normalized):
  """Return a copy of the frame with each object-dtype column label-encoded.

  Args:
    df_normalized: Input DataFrame. It is copied first and never modified.

  Returns:
    A new DataFrame in which every object-dtype column has been replaced by
    the output of `LabelEncoder.fit_transform` for that column.
  """
  encoded = df_normalized.copy()
  encoder = LabelEncoder()

  object_columns = encoded.select_dtypes(include=['object']).columns
  for column_name in object_columns:
    # fit_transform refits the shared encoder, so each column is encoded on its own.
    column_codes = encoder.fit_transform(encoded[column_name])
    encoded[column_name] = column_codes

  return encoded

In [9]:

def select_high_corr_features(df_train, df_test, df_valid, target_col="Hazardous", top_n=3):
  """Keep the `top_n` numeric features most correlated with the target.

  Correlations are computed on the training split only, ranked by absolute
  value, and the same feature subset is then selected from all three splits.

  Args:
    df_train: Training split; used to rank the features.
    df_test: Test split.
    df_valid: Validation split.
    target_col: Name of the numeric label column.
    top_n: Number of features to keep.

  Returns:
    Tuple `(train, test, valid)` restricted to the selected feature columns.

  Raises:
    ValueError: If `target_col` is not among the numeric columns of `df_train`.
  """
  train_numeric = df_train.select_dtypes(include=['number']).select_dtypes(exclude=['bool'])

  # Guard: the ranking needs a numeric target column.
  target_is_usable = target_col in train_numeric.columns
  if not target_is_usable:
      raise ValueError(f"Target column '{target_col}' must be numeric and present in the dataset.")

  target_corr = train_numeric.corr()[target_col]
  ranked = target_corr.abs().sort_values(ascending=False)
  ranked_features = ranked.drop(index=target_col)  # the target always correlates perfectly with itself
  selected_features = list(ranked_features.index[:top_n])

  return tuple(split[selected_features] for split in (df_train, df_test, df_valid))

In [10]:

def perform_pca(df_train, df_test, df_valid, n_components=3):
  """Project the numeric columns of three splits onto principal components.

  The PCA is fitted on the numeric columns of `df_train` only and then applied
  to all three splits.

  Args:
    df_train: Training split; the PCA is fitted on it.
    df_test: Test split.
    df_valid: Validation split.
    n_components: Passed straight to `PCA`. Besides an integer, the values PCA
      itself accepts (a float, None, 'mle') are supported because the output
      columns are named from the fitted component count.

  Returns:
    Tuple `(train, test, valid)` of new DataFrames with columns 'PC1', 'PC2', ...
    The returned frames use a fresh default index, not the input index.
  """
  pca = PCA(n_components=n_components)

  df_train_numeric = df_train.select_dtypes(include=['number'])
  df_test_numeric = df_test.select_dtypes(include=['number'])
  df_valid_numeric = df_valid.select_dtypes(include=['number'])

  pca.fit(df_train_numeric)
  df_train_pca = pca.transform(df_train_numeric)
  df_test_pca = pca.transform(df_test_numeric)
  df_valid_pca = pca.transform(df_valid_numeric)

  # Use the fitted component count: range(n_components) fails when the caller
  # passes a float, None or 'mle'.
  pca_columns = [f'PC{i+1}' for i in range(pca.n_components_)]
  df_train_pca = pd.DataFrame(df_train_pca, columns=pca_columns)
  df_test_pca = pd.DataFrame(df_test_pca, columns=pca_columns)
  df_valid_pca = pd.DataFrame(df_valid_pca, columns=pca_columns)

  return df_train_pca, df_test_pca, df_valid_pca

In [11]:
def numeric_conversion(df_orig, one_hot_encode_month=False):
  """Derive numeric date features and an integer label from the raw frame.

  Steps, applied to a copy of the input:
    * drop 'Equinox' and 'Orbiting Body' if they are present;
    * parse 'Close Approach Date' to datetime and add its year and month;
    * interpret 'Epoch Date Close Approach' as milliseconds since the Unix
      epoch and add the year and month of that instant;
    * cast 'Hazardous' to int;
    * optionally one-hot encode the two month columns.

  Args:
    df_orig: Input DataFrame. It is copied first and never modified.
    one_hot_encode_month: When True, replace the two month columns with
      integer indicator columns.

  Returns:
    The transformed copy. 'Close Approach Date' is left as a datetime column.
  """
  df = df_orig.copy()

  # Dropping 'Equinox' and 'Orbiting Body'
  df = df.drop(columns=['Equinox', 'Orbiting Body'], errors='ignore')

  # Converting 'Close Approach Date' to datetime
  df['Close Approach Date'] = pd.to_datetime(df['Close Approach Date'])
  df['Close Approach Year'] = df['Close Approach Date'].dt.year
  df['Close Approach Month'] = df['Close Approach Date'].dt.month

  # Converting 'Epoch Date Close Approach' (milliseconds) to datetime.
  # Vectorised replacement for a row-wise datetime.utcfromtimestamp call, which
  # is deprecated since Python 3.12; missing values become NaT.
  df['Converted Date'] = pd.to_datetime(df['Epoch Date Close Approach'], unit='ms')

  df['Epoch Close Approach Year'] = df['Converted Date'].dt.year
  df['Epoch Close Approach Month'] = df['Converted Date'].dt.month

  # Encoding 'Hazardous' column
  df['Hazardous'] = df['Hazardous'].astype(int)

  # One-Hot Encoding for months (if needed)
  if one_hot_encode_month:
    df = pd.get_dummies(df, columns=['Close Approach Month', 'Epoch Close Approach Month'], prefix=['Close Approach Month no.', 'Epoch Close Approach Month no.'], dtype=int)

  # The intermediate datetime column was only needed to extract year and month.
  df = df.drop(columns=["Converted Date"])
  return df

In [12]:

def data_preprocessing(df):
  """Run the full preprocessing pipeline and return the three scaled splits.

  The frame goes through z-score outlier removal, removal of highly
  correlated columns and numeric conversion. It is then split 60/20/20 into
  train / validation / test with a fixed random state, and the splits are
  min-max scaled with statistics fitted on the training split.

  Args:
    df: Raw input DataFrame.

  Returns:
    Tuple `(train, valid, test)` of scaled DataFrames, each with a fresh
    0..n-1 index.
  """
  prepared = numeric_conversion(remove_highly_correlated(remove_outliers_zscore(df)))

  # 40% is held out first, then halved into validation and test.
  train_part, holdout = train_test_split(prepared, test_size=0.4, random_state=42)
  valid_part, test_part = train_test_split(holdout, test_size=0.5, random_state=42)

  # normalize_minmax takes and returns (train, test, valid); this function returns (train, valid, test).
  scaled_train, scaled_test, scaled_valid = normalize_minmax(train_part, test_part, valid_part)

  return tuple(
    part.reset_index(drop=True)
    for part in (scaled_train, scaled_valid, scaled_test)
  )

In [13]:
# Run the preprocessing pipeline once on the raw frame; it returns the
# train, validation and test frames in that order.
normal_train, normal_valid, normal_test = data_preprocessing(original_dataframe)

  lambda x: datetime.utcfromtimestamp(x / 1000) if pd.notnull(x) else None


In [14]:
# Name of the label column that is separated from the feature columns.
target_column = "Hazardous"


In [15]:

# Training split: take the label column out of a copy so the features and the label are separate objects.
X_train = normal_train.copy()
y_train = X_train.pop(target_column)

In [16]:

# Validation split: take the label column out of a copy so the features and the label are separate objects.
X_valid = normal_valid.copy()
y_valid = X_valid.pop(target_column)

In [17]:
# Test split: take the label column out of a copy so the features and the label are separate objects.
X_test = normal_test.copy()
y_test = X_test.pop(target_column)

In [18]:

# Replace the datetime "Close Approach Date" column with whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second does not depend on the
# column's datetime resolution, and the dtype guard makes re-running this cell a
# no-op instead of dividing already-converted integers by 10**9 a second time.
# NOTE(review): this runs after the min-max scaling in data_preprocessing, so these
# values are not on the same scale as the other features - confirm this is intended.
for split_features in (X_train, X_valid, X_test):
  if pd.api.types.is_datetime64_any_dtype(split_features["Close Approach Date"]):
    split_features["Close Approach Date"] = (
      (split_features["Close Approach Date"] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
    )

In [19]:

# Parse "Orbit Determination Date" and replace it with whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second does not depend on the
# parsed datetime resolution, and the dtype guard makes re-running this cell a
# no-op instead of re-parsing the already-converted integers as timestamps.
# NOTE(review): this runs after the min-max scaling in data_preprocessing, so these
# values are not on the same scale as the other features - confirm this is intended.
for split_features in (X_train, X_valid, X_test):
  if not pd.api.types.is_numeric_dtype(split_features["Orbit Determination Date"]):
    determined_at = pd.to_datetime(split_features["Orbit Determination Date"])
    split_features["Orbit Determination Date"] = (
      (determined_at - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
    )

In [None]:
# Linear-kernel SVM: fit on the training split, then report the same metrics
# for the test split and for the validation split.
linear_svm = SVC(kernel='linear', C=1.0) # You can adjust the C parameter here
linear_svm.fit(X_train, y_train)
y_test_pred = linear_svm.predict(X_test)
y_val_pred = linear_svm.predict(X_valid)  # the validation split is named X_valid / y_valid in this notebook

for split_position, (split_name, y_true, y_pred) in enumerate([
  ("Test", y_test, y_test_pred),
  ("Validation", y_valid, y_val_pred),
]):
  if split_position:
    print(f"\n")  # blank separator between the two reports

  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  conf_matrix = confusion_matrix(y_true, y_pred)

  print(f"{split_name} Data Metrics:")
  print(f"Confusion Matrix:\n{conf_matrix}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-Score: {f1:.4f}")

In [None]:
# RBF-kernel SVM: fit on the training split, then report the same metrics
# for the test split and for the validation split.
rbf_svm = SVC(kernel='rbf', C=1.0, gamma='scale')
rbf_svm.fit(X_train, y_train)
y_test_pred = rbf_svm.predict(X_test)
y_val_pred = rbf_svm.predict(X_valid)  # the validation split is named X_valid / y_valid in this notebook

for split_position, (split_name, y_true, y_pred) in enumerate([
  ("Test", y_test, y_test_pred),
  ("Validation", y_valid, y_val_pred),
]):
  if split_position:
    print(f"\n")  # blank separator between the two reports

  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  conf_matrix = confusion_matrix(y_true, y_pred)

  print(f"{split_name} Data Metrics:")
  print(f"Confusion Matrix:\n{conf_matrix}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-Score: {f1:.4f}")

In [None]:
# Degree-3 polynomial-kernel SVM: fit on the training split, then report the
# same metrics for the test split and for the validation split.
poly_svm = SVC(kernel='poly', degree=3, C=1.0, gamma='scale')
poly_svm.fit(X_train, y_train)
y_test_pred = poly_svm.predict(X_test)
y_val_pred = poly_svm.predict(X_valid)  # the validation split is named X_valid / y_valid in this notebook

for split_position, (split_name, y_true, y_pred) in enumerate([
  ("Test", y_test, y_test_pred),
  ("Validation", y_valid, y_val_pred),
]):
  if split_position:
    print(f"\n")  # blank separator between the two reports

  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)
  conf_matrix = confusion_matrix(y_true, y_pred)

  print(f"{split_name} Data Metrics:")
  print(f"Confusion Matrix:\n{conf_matrix}")
  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision:.4f}")
  print(f"Recall: {recall:.4f}")
  print(f"F1-Score: {f1:.4f}")