In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from pathlib import Path


In [16]:
# Training data lives two directory levels above the notebook's working
# directory, under data/clean_data/.
csv_path = Path.cwd().parent.parent.joinpath(
    'data',
    'clean_data',
    'cleaned_df_data_training_2025_05_07.csv',
)
df = pd.read_csv(filepath_or_buffer=csv_path)

In [17]:
# Frequency of every raw booster-version label, most frequent first.
df.loc[:, 'version_booster'].value_counts()

version_booster
F9 B5                    17
F9 FT [                  13
F9 B4 [                   6
F9 B5 [                   6
F9 B5 ♺                   5
F9 B5 ♺ [                 5
F9 v1.1 [                 4
F9 v1.1                   2
F9 v1.0 7 B0003.1 8       1
F9 B5 B1049.6 544         1
F9 B5 B1060.2 563         1
F9 B5 B1058.3 565         1
F9 B5 B1051.6 568         1
F9 B5 B1058.5 613         1
F9 B5 B1051.8 609         1
F9 B5 B1056.3 482         1
F9 B5 B1060.6 643         1
F9 B5 B1061.2 647         1
F9 B5 B1060.7 652         1
F9 B5 B1049.9 655         1
F9 B5 B1051.10 657        1
F9 B5 B1063.2 665         1
F9 B5 B1058.2 544         1
F9 B5 [ ] 413             1
F9 B5 B1056.2 465         1
F9 B5 311 B1046.1 268     1
F9 FT ♺ [                 1
F9 FT B1029.2 195         1
F9 FT B1031.2 220         1
F9 FT B1035.2 227         1
F9 FT B1036.2 227         1
F9 FT B1032.2 245         1
F9 B5 349 B1048 [         1
F9 B5 B1051.2 420         1
F9 B5 B1046.2 354         1
F9 B

In [18]:
# Collapse rare booster-version labels into a single 'other' bucket: only
# labels that occur more than 5 times keep their own value.
# NOTE(review): the frequencies are computed on the full frame, i.e. before the
# train/test split performed further down.
counts = df['version_booster'].value_counts()
keep_values = counts.index[(counts > 5).to_numpy()]
df['version_booster'] = df['version_booster'].mask(
    ~df['version_booster'].isin(keep_values),
    'other',
)


In [19]:
# Cast the label-like columns (including the target) to pandas' categorical
# dtype and parse the launch date into a datetime column.
category_columns = [
    'launch_site',
    'version_booster',
    'outcome',
    'gridfins',
    'reused',
    'landingpad',
    'block',
]
df[category_columns] = df[category_columns].astype('category')
df['date'] = pd.to_datetime(df['date'])

In [20]:
# Display the dtype of every column to confirm the conversions applied above.
df.dtypes

launch_site              category
payload_mass              float64
version_booster          category
date               datetime64[ns]
gridfins                 category
reused                   category
block                    category
reusedcount                 int64
landingpad               category
outcome                  category
dtype: object

In [21]:
# Separate the target from the features, then hold out 20% of the rows for
# testing. The split is stratified on the target and seeded for repeatability.
Y = df['outcome']
X = df.drop(columns='outcome')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [22]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_feature = ['payload_mass', 'reusedcount']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [23]:
# Categorical columns are one-hot encoded; categories not seen while fitting
# are ignored at transform time instead of raising.
categorical_features = [
    'launch_site',
    'version_booster',
    'gridfins',
    'reused',
    'block',
    'landingpad',
]
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [24]:
# Route each group of columns through its own transformer.
# With ColumnTransformer's default `remainder`, columns that appear in neither
# list (such as `date`) are dropped from the model input.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feature),
    ('cat', categorical_transformer, categorical_features),
])

In [25]:
# Full modelling pipeline: the shared preprocessing step followed by a
# decision-tree classifier.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes candidate features at each split; without a fixed seed,
# ties between equally good splits may be resolved differently on every run,
# making the fitted tree and the grid-search scores non-reproducible.
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [26]:
# Hyperparameter grid for the pipeline's 'classifier' step
# (4 depths x 3 split thresholds x 2 criteria = 24 candidates).
param_grid = dict(
    classifier__max_depth=[3, 5, 10, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__criterion=['gini', 'entropy'],
)

In [27]:
# Exhaustive 5-fold cross-validated search over `param_grid`, selecting the
# candidate with the highest recall.
# NOTE(review): the 'recall' scorer measures recall of the positive class;
# confirm `outcome` is binary and encoded so that this is the class of interest.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the pipeline refitted with the winning hyperparameters, then report
# those hyperparameters and their mean cross-validated recall.
best_model = grid_search.best_estimator_
print(
    "Best hyperparameters:",
    grid_search.best_params_,
)
print(
    "Best cross-validated recall:",
    grid_search.best_score_,
)

Best hyperparameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 3, 'classifier__min_samples_split': 2}
Best cross-validated accuracy: 0.9230769230769231
