In [401]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from imblearn.pipeline import Pipeline  as ImbleanPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from pathlib import Path


In [402]:
csv_path = Path.cwd().parent.parent / 'data' / 'clean_data' / 'cleaned_df_data_training_2025_05_07.csv'
df = pd.read_csv(csv_path)

In [403]:
df['outcome'].value_counts()

outcome
0    82
1     9
Name: count, dtype: int64

In [404]:
df['version_booster'].value_counts()

version_booster
F9 B5                    17
F9 FT [                  13
F9 B4 [                   6
F9 B5 [                   6
F9 B5 ♺                   5
F9 B5 ♺ [                 5
F9 v1.1 [                 4
F9 v1.1                   2
F9 v1.0 7 B0003.1 8       1
F9 B5 B1049.6 544         1
F9 B5 B1060.2 563         1
F9 B5 B1058.3 565         1
F9 B5 B1051.6 568         1
F9 B5 B1058.5 613         1
F9 B5 B1051.8 609         1
F9 B5 B1056.3 482         1
F9 B5 B1060.6 643         1
F9 B5 B1061.2 647         1
F9 B5 B1060.7 652         1
F9 B5 B1049.9 655         1
F9 B5 B1051.10 657        1
F9 B5 B1063.2 665         1
F9 B5 B1058.2 544         1
F9 B5 [ ] 413             1
F9 B5 B1056.2 465         1
F9 B5 311 B1046.1 268     1
F9 FT ♺ [                 1
F9 FT B1029.2 195         1
F9 FT B1031.2 220         1
F9 FT B1035.2 227         1
F9 FT B1036.2 227         1
F9 FT B1032.2 245         1
F9 B5 349 B1048 [         1
F9 B5 B1051.2 420         1
F9 B5 B1046.2 354         1
F9 B

In [405]:
counts = df['version_booster'].value_counts()

keep_values = counts[counts > 5].index

df['version_booster'] = df['version_booster'].where(df['version_booster'].isin(keep_values),'other')


In [406]:
df[['launch_site', 'version_booster', 'outcome', 'gridfins', 'reused', 'landingpad', 'block']] = df[['launch_site', 'version_booster', 'outcome', 'gridfins', 'reused', 'landingpad', 'block']].astype('category')
df = df.drop(['version_booster', 'gridfins'], axis=1)
df['date'] = pd.to_datetime(df['date'])

In [407]:
df.dtypes

launch_site           category
payload_mass           float64
date            datetime64[ns]
reused                category
block                 category
reusedcount              int64
landingpad            category
outcome               category
dtype: object

In [408]:
X = df.drop('outcome', axis=1)
Y = df['outcome']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2
                                                    , random_state=42, stratify=Y)

In [409]:
numeric_feature = ['payload_mass', 'reusedcount']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [410]:
categorical_features = ['launch_site','reused', 'block', 'landingpad']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [411]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feature),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [412]:
smote = SMOTE(k_neighbors=2, random_state=42)

cv_nonsmote = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_smote = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [413]:
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])

In [414]:
param_grid_dt = {
    'classifier__max_depth': [3, 5, 10, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__criterion': ['gini', 'entropy']
}

In [415]:
grid_search_dt = GridSearchCV(dt_pipeline, param_grid_dt, cv=cv_nonsmote, scoring='recall', error_score='raise')
grid_search_dt.fit(X_train, y_train)

In [416]:
print('Best hyperparameters:', grid_search_dt.best_params_)
print('Best cross-validated recall:', grid_search_dt.best_score_)
best_model_dt = grid_search_dt.best_estimator_

Best hyperparameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__min_samples_split': 2}
Best cross-validated recall: 0.7


In [417]:
y_pred_dt = best_model_dt.predict(X_test)



print('Test recall:', recall_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Test recall: 0.5
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        17
           1       0.50      0.50      0.50         2

    accuracy                           0.89        19
   macro avg       0.72      0.72      0.72        19
weighted avg       0.89      0.89      0.89        19



In [418]:
logr_pipeline = ImbleanPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', LogisticRegression())
])

In [419]:
param_grid_log = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],       # Regularization strength
    'classifier__penalty': ['l2'],                 # L1 can be used with 'liblinear'
    'classifier__solver': ['lbfgs'],               # Solver must match the penalty
    'classifier__class_weight': [None, 'balanced'] # Try with and without balancing
}

In [420]:
grid_search_log = GridSearchCV(logr_pipeline, param_grid_log, cv=cv_smote, scoring='recall', n_jobs=-1)
grid_search_log.fit(X_train, y_train)

In [421]:
print("Best hyperparameters:", grid_search_log.best_params_)
print("Best cross-validated recall:", grid_search_log.best_score_)
best_model_log = grid_search_log.best_estimator_


Best hyperparameters: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best cross-validated recall: 0.7222222222222222


In [422]:
y_pred_log = best_model_log.predict(X_test)
print('Test recall:', recall_score(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

Test recall: 0.5
              precision    recall  f1-score   support

           0       0.93      0.76      0.84        17
           1       0.20      0.50      0.29         2

    accuracy                           0.74        19
   macro avg       0.56      0.63      0.56        19
weighted avg       0.85      0.74      0.78        19



In [423]:
svc_pipeline = ImbleanPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', SVC())
])

In [424]:
param_grid_svc = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__gamma': ['scale', 'auto']
}

In [425]:
grid_search_svc = GridSearchCV(svc_pipeline, param_grid=param_grid_svc, scoring='recall', cv=cv_smote, n_jobs=-1)
grid_search_svc.fit(X_train, y_train)

In [426]:
print("Best hyperparameters:", grid_search_svc.best_params_)
print("Best cross-validated recall:", grid_search_svc.best_score_)
best_model_svc = grid_search_svc.best_estimator_
y_pred_svc = best_model_svc.predict(X_test)



Best hyperparameters: {'classifier__C': 0.1, 'classifier__class_weight': None, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}
Best cross-validated recall: 0.7222222222222222


In [427]:
print("Test recall:", recall_score(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

Test recall: 0.5
              precision    recall  f1-score   support

           0       0.93      0.76      0.84        17
           1       0.20      0.50      0.29         2

    accuracy                           0.74        19
   macro avg       0.56      0.63      0.56        19
weighted avg       0.85      0.74      0.78        19



RANDOM FOREST PIPELINE

In [428]:
rf_pipeline = ImbleanPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [429]:
rf_params = {
    'classifier__n_estimators':[50, 100],
    'classifier__max_depth': [3, 5, 10]
}

In [430]:
grid_search_rf = GridSearchCV(rf_pipeline, rf_params, scoring='recall', cv=cv_smote)
grid_search_rf.fit(X_train, y_train)

In [431]:
print("Best hyperparameters:", grid_search_rf.best_params_)
print("Best cross-validated recall:", grid_search_rf.best_score_)
best_model_rf = grid_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)

Best hyperparameters: {'classifier__max_depth': 5, 'classifier__n_estimators': 50}
Best cross-validated recall: 0.3333333333333333


In [432]:
print("Test recall:", recall_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Test recall: 0.0
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        17
           1       0.00      0.00      0.00         2

    accuracy                           0.79        19
   macro avg       0.44      0.44      0.44        19
weighted avg       0.79      0.79      0.79        19



In [433]:
xgb_pipeline = ImbleanPipeline([
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', XGBClassifier(scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
                          use_label_encoder=False, eval_metric='logloss', random_state=42))
])

In [434]:
xgb_params = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__n_estimators': [50, 100],
    'classifier__learning_rate': [0.01, 0.1]
}

In [435]:
grid_search_xgb = GridSearchCV(xgb_pipeline, xgb_params, scoring='recall', cv=cv_smote)
grid_search_xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


In [436]:
print("Best hyperparameters:", grid_search_xgb.best_params_)
print("Best cross-validated recall:", grid_search_xgb.best_score_)
best_model_xgb = grid_search_xgb.best_estimator_
y_probs_xgb = best_model_xgb.predict_proba(X_test)[:, 1]
y_pred_thresh_xgb = (y_probs_xgb > .3).astype(int)
print(y_probs_xgb)


Best hyperparameters: {'classifier__learning_rate': 0.01, 'classifier__max_depth': 3, 'classifier__n_estimators': 50}
Best cross-validated recall: 1.0
[0.61383307 0.8643307  0.5349111  0.5349111  0.8391803  0.5349111
 0.5349111  0.5349111  0.61383307 0.89462876 0.5349111  0.61383307
 0.5349111  0.5349111  0.5349111  0.8643307  0.89462876 0.5349111
 0.89462876]


In [437]:
print("Test recall:", recall_score(y_test, y_pred_thresh_xgb))
print(classification_report(y_test, y_pred_thresh_xgb))

Test recall: 1.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        17
           1       0.11      1.00      0.19         2

    accuracy                           0.11        19
   macro avg       0.05      0.50      0.10        19
weighted avg       0.01      0.11      0.02        19



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
