In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# load dataset
df = pd.read_csv('neo.csv')

# choose features and target
features = ['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'absolute_magnitude']
X = df[features]
y = df['hazardous']

print("Data loaded and features are selected.")

Data loaded and features are selected.


In [22]:
# split data - provide 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data has been split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Data has been split into training and testing sets.
Training set size: 72668 samples
Test set size: 18168 samples


In [23]:
print("--- Model 1: Decision Tree ---")

# create the decision tree model
decision_tree_model = DecisionTreeClassifier(random_state=42)

# train model
decision_tree_model.fit(X_train, y_train)

# predict and evaluate performance
y_pred_tree = decision_tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred_tree)

print(f"Accuracy: {accuracy_tree:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tree))

--- Model 1: Decision Tree ---
Accuracy: 0.893

Classification Report:
              precision    recall  f1-score   support

       False       0.94      0.94      0.94     16439
        True       0.44      0.48      0.46      1729

    accuracy                           0.89     18168
   macro avg       0.69      0.71      0.70     18168
weighted avg       0.90      0.89      0.89     18168



In [24]:
print("--- Model 2: Random Forest ---")

# create a random forest with 100 trees
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# train model
random_forest_model.fit(X_train, y_train)

# predict and evaluate performance
y_pred_rf = random_forest_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f"Accuracy: {accuracy_rf:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

--- Model 2: Random Forest ---
Accuracy: 0.921

Classification Report:
              precision    recall  f1-score   support

       False       0.94      0.97      0.96     16439
        True       0.62      0.43      0.51      1729

    accuracy                           0.92     18168
   macro avg       0.78      0.70      0.73     18168
weighted avg       0.91      0.92      0.91     18168



In [26]:
print("--- Model 3: AdaBoost ---")

# create base classifier (shallow tree)
base_estimator = DecisionTreeClassifier(max_depth=1)

# create adaboost model with 50 estimators
adaboost_model = AdaBoostClassifier(estimator=base_estimator, n_estimators=50, random_state=42)

# train model
adaboost_model.fit(X_train, y_train)

# predict and evaluate performance
y_pred_adaboost = adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)

print(f"Accuracy: {accuracy_adaboost:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_adaboost))

--- Model 3: AdaBoost ---
Accuracy: 0.914

Classification Report:
              precision    recall  f1-score   support

       False       0.91      1.00      0.95     16439
        True       0.83      0.12      0.21      1729

    accuracy                           0.91     18168
   macro avg       0.87      0.56      0.58     18168
weighted avg       0.91      0.91      0.88     18168

