# SpaceX Falcon 9 First Stage Landing Prediction — Machine Learning Lab

**Tasks 1–12 completed**

This notebook follows the IBM Capstone lab structure: preprocessing, train/test split, model training with GridSearchCV (Logistic Regression, SVM, Decision Tree, KNN), evaluation, and confusion matrices.

## Task 1 — Import libraries & define helper function

In [None]:

# Core
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Helper: pretty confusion matrix
def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Draw a 2x2 confusion-matrix heatmap for the binary landing outcome.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    title : str, optional
        Text shown above the heatmap.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Pin the label set and order. Without `labels`, confusion_matrix only
    # includes classes that actually occur in y_true/y_pred, so a single-class
    # case would yield a 1x1 matrix that no longer matches the two tick labels.
    # NOTE(review): assumes the classes are encoded 0 = did not land,
    # 1 = landed (the order the tick labels imply) — confirm against 'Class'.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    # Rows are true labels, columns are predicted labels, both in `labels` order.
    ax.xaxis.set_ticklabels(['did not land','landed'])
    ax.yaxis.set_ticklabels(['did not land','landed'])
    plt.show()


## Task 2 — Load the dataframe

In [None]:

# Public CSV files hosted by IBM Skills Network
URL2 = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_2.csv"
URL3 = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_3.csv"

# Read both files in one pass: `data` comes from part_2 (it carries the
# 'Class' column used as the target), `data_part3` from part_3.
data, data_part3 = (pd.read_csv(url) for url in (URL2, URL3))

# Quick sanity check on what was downloaded, then preview part_2.
print("Shapes — part_2:", data.shape, " part_3:", data_part3.shape)
display(data.head())


## Task 3 — Create `Y` from `Class` and `X` as features

In [None]:

# Target: the 'Class' column of part_2, as a numpy array per the lab.
y = data['Class'].to_numpy()

# Features: the part_3 table rather than "part_2 minus 'Class'".
# NOTE(review): in the published lab data, part_2 holds the raw launch records
# (text columns such as dates and booster names, plus the 'Outcome' column the
# label is derived from). StandardScaler cannot convert those to float, and
# 'Outcome' would leak the label. part_3 is the one-hot encoded, all-numeric
# feature table for the same launches. Confirm with data.dtypes and
# data_part3.dtypes.
X = data_part3.drop(columns=['Class'], errors='ignore')  # drop the label if part_3 happens to carry it

# Features and target must describe the same launches, row for row.
assert len(X) == len(y), "part_2 and part_3 have different row counts"
print("X shape:", X.shape, " y shape:", y.shape)


## Task 4 — Standardize `X`

In [None]:

# Learn per-column mean and standard deviation from every row of X, then apply
# them. The fitted `scaler` is kept so the same transform can be reused.
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Preview the first three standardized rows.
X_scaled[:3]


## Task 5 — Split into train and test sets

In [None]:

# Split the *unscaled* features first, then standardize using statistics
# learned from the training rows only. Splitting `X_scaled` (fit on all rows)
# would let the held-out rows influence the preprocessing, which makes the
# test accuracy optimistic. `X_scaled` itself is left untouched.
# The row assignment depends only on y, test_size and random_state, so the
# same launches end up in train and test as when splitting X_scaled.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2, stratify=y
)

# Fit on train, apply to both: the test set is transformed with the training
# mean/std, exactly as unseen data would be.
train_scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

print("Train shapes:", X_train.shape, y_train.shape)
print("Test  shapes:", X_test.shape,  y_test.shape)


## Task 6 — Logistic Regression with GridSearchCV

In [None]:

# Search space: only the inverse regularization strength C varies; the
# penalty/solver pair is fixed.
parameters_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['lbfgs'],  # works with l2
)

# Exhaustive 10-fold cross-validated search over the grid, fitted on the
# training split only.
logreg = LogisticRegression(max_iter=2000)
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=parameters_lr,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions of the refit best estimator on the held-out split.
yhat_lr = logreg_cv.predict(X_test)

print("tuned hyperparameters (best parameters):", logreg_cv.best_params_)
print("validation accuracy:", logreg_cv.best_score_)
print("test accuracy:", logreg_cv.score(X_test, y_test))

plot_confusion_matrix(y_test, yhat_lr, title='LogReg — Confusion Matrix')
print(classification_report(y_test, yhat_lr, digits=3))


## Task 7 — Support Vector Machine with GridSearchCV

In [None]:

# Search space: four kernels crossed with five log-spaced values (1e-3 .. 1e3)
# for each of C and gamma.
parameters_svm = dict(
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    C=np.logspace(-3, 3, 5),
    gamma=np.logspace(-3, 3, 5),
)

# Exhaustive 10-fold cross-validated search, fitted on the training split only.
svm = SVC()
svm_cv = GridSearchCV(
    estimator=svm,
    param_grid=parameters_svm,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions of the refit best estimator on the held-out split.
yhat_svm = svm_cv.predict(X_test)

print("tuned hyperparameters (best parameters):", svm_cv.best_params_)
print("validation accuracy:", svm_cv.best_score_)
print("test accuracy:", svm_cv.score(X_test, y_test))

plot_confusion_matrix(y_test, yhat_svm, title='SVM — Confusion Matrix')
print(classification_report(y_test, yhat_svm, digits=3))


## Task 8 — Decision Tree with GridSearchCV

In [None]:

# Search space for the tree: split criterion and strategy, depth 2..12,
# feature subsampling, and the two minimum-sample constraints.
parameters_tree = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    splitter=['best', 'random'],
    max_depth=[*range(2, 13)],
    max_features=['sqrt', None],
    min_samples_leaf=[1, 2, 4],
    min_samples_split=[2, 5, 10],
)

# random_state is fixed so the randomized parts of tree building repeat
# between runs. Exhaustive 10-fold search, fitted on the training split only.
tree = DecisionTreeClassifier(random_state=2)
tree_cv = GridSearchCV(
    estimator=tree,
    param_grid=parameters_tree,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions of the refit best estimator on the held-out split.
yhat_tree = tree_cv.predict(X_test)

print("tuned hyperparameters (best parameters):", tree_cv.best_params_)
print("validation accuracy:", tree_cv.best_score_)
print("test accuracy:", tree_cv.score(X_test, y_test))

plot_confusion_matrix(y_test, yhat_tree, title='Decision Tree — Confusion Matrix')
print(classification_report(y_test, yhat_tree, digits=3))


## Task 9 — K-Nearest Neighbors with GridSearchCV

In [None]:

# Search space: 1..10 neighbours, every neighbour-search algorithm, and the
# Minkowski power p.
parameters_knn = dict(
    n_neighbors=[*range(1, 11)],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],  # Manhattan (1) or Euclidean (2)
)

# Exhaustive 10-fold cross-validated search, fitted on the training split only.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid=parameters_knn,
    cv=10,
    n_jobs=-1,
).fit(X_train, y_train)

# Predictions of the refit best estimator on the held-out split.
yhat_knn = knn_cv.predict(X_test)

print("tuned hyperparameters (best parameters):", knn_cv.best_params_)
print("validation accuracy:", knn_cv.best_score_)
print("test accuracy:", knn_cv.score(X_test, y_test))

plot_confusion_matrix(y_test, yhat_knn, title='KNN — Confusion Matrix')
print(classification_report(y_test, yhat_knn, digits=3))


## Task 10 — Compare models, pick the best, and show summary

In [None]:

# One entry per model: best cross-validation score, accuracy of the stored
# test-set predictions, and the refit best estimator. Built from a single
# table of (name, fitted search, predictions) instead of four copy-pasted
# entries, so the three fields cannot drift apart between models.
results = {
    name: {
        'cv_best': search.best_score_,
        'test_acc': accuracy_score(y_test, preds),
        'estimator': search.best_estimator_,
    }
    for name, search, preds in [
        ('Logistic Regression', logreg_cv, yhat_lr),
        ('SVM', svm_cv, yhat_svm),
        ('Decision Tree', tree_cv, yhat_tree),
        ('KNN', knn_cv, yhat_knn),
    ]
}

# Transposing a dict-of-dicts that also holds estimator objects leaves every
# column as dtype=object, so cast the two score columns back to float.
# Rank by test accuracy and break ties with the cross-validation score:
# several models can tie exactly on test accuracy, and a single-key sort
# would leave the "best" model among them to arbitrary ordering.
summary = (
    pd.DataFrame(results)
    .T
    .astype({'cv_best': float, 'test_acc': float})
    .sort_values(["test_acc", "cv_best"], ascending=False)
)
display(summary)

# NOTE(review): choosing the winner on the test split makes its reported test
# accuracy an optimistic estimate; `cv_best` is the less biased comparison.
best_model_name = summary.index[0]
print(f"\nBest model by test accuracy: {best_model_name}")


## Task 11 — Output confusion matrix for the best model

In [None]:

# Test-set predictions stored by each model's cell, keyed by the same display
# names used in the summary table.
preds_by_model = {
    'Logistic Regression': yhat_lr,
    'SVM': yhat_svm,
    'Decision Tree': yhat_tree,
    'KNN': yhat_knn,
}

# Look up the winner's predictions (KeyError if the name is not one of the four).
best_preds = preds_by_model[best_model_name]

plot_confusion_matrix(
    y_test,
    best_preds,
    title=f'{best_model_name} — Confusion Matrix (Best)',
)


## Task 12 — Final answer

The summary table in Task 10 lists, for each algorithm with its tuned hyperparameters, the cross-validation accuracy (`cv_best`) and the held-out test accuracy (`test_acc`). Task 10 prints the best model (by **test accuracy**), and Task 11 plots its confusion matrix.