<a href="https://colab.research.google.com/github/Pop-brian/Capstone-DS/blob/main/SpaceX_Data_Splitting_and_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install seaborn if not already present in the environment
# This command is necessary in many lab environments to ensure the module is installed.
# Using '!' prefix allows running shell commands from a Python environment like a Jupyter cell.
!pip install seaborn

# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
# NumPy is a library for the Python programming language that adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
import numpy as np
# Matplotlib is a plotting library for Python, and pyplot gives us a MATLAB-like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
# Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
# Preprocessing allows us to standardize our data
from sklearn import preprocessing
# Allows us to split our data into training and testing data
from sklearn.model_selection import train_test_split # Imported for Task 3
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier

# --- Data Loading URLs ---
from js import fetch
import io

# Both CSV files live in the same course dataset folder; only the file name differs.
_DATASET_BASE = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud'
    '/IBM-DS0321EN-SkillsNetwork/datasets'
)
# Features (X): dataset_part_3.csv holds the one-hot encoded feature columns.
URL_X = _DATASET_BASE + '/dataset_part_3.csv'
# Target source (Y): dataset_part_2.csv contains the 'Class' column.
URL_Y_Source = _DATASET_BASE + '/dataset_part_2.csv'


# Function to load data from URL
async def load_data(url):
    """Fetch the CSV at *url* and return it as a pandas DataFrame.

    The download goes through the `fetch` function imported from `js`; the
    response body is read as an array buffer, converted to Python bytes and
    handed to `pd.read_csv` via an in-memory binary stream.
    """
    response = await fetch(url)
    raw_buffer = await response.arrayBuffer()
    csv_stream = io.BytesIO(raw_buffer.to_py())
    return pd.read_csv(csv_stream)

# Main asynchronous function to perform all tasks: data loading, preprocessing, splitting, and modeling
def _make_grid_search(estimator, param_grid):
    """Return an unfitted GridSearchCV with the settings shared by every model."""
    return GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring='accuracy',
        cv=10,
        verbose=0,  # keep output quiet while fitting
        n_jobs=-1,  # use all available cores
    )


def _print_search_summary(label, search):
    """Print the best parameters and best cross-validation score of a fitted search."""
    print(f"\n{label} GridSearchCV completed.")
    print("Tuned hpyerparameters (best parameters): ", search.best_params_)
    print("Accuracy on validation data (best score): ", search.best_score_)


def _format_accuracy_table(accuracy_df):
    """Render the accuracy table as Markdown, or as plain text without `tabulate`."""
    try:
        return accuracy_df.to_markdown(index=False)
    except ImportError:
        # DataFrame.to_markdown needs the optional `tabulate` package; do not
        # lose the finished results just because it is not installed.
        return accuracy_df.to_string(index=False)


# Main asynchronous function to perform all tasks: data loading, preprocessing, splitting, and modeling
async def run_data_preparation_and_modeling():
    """Load the data, standardize it, split it, and tune four classifiers.

    Steps (task numbers follow the notebook):
      1.  Build Y from the 'Class' column of the target source file.
      2.  Standardize the feature matrix X with StandardScaler.
      3.  Split into train/test sets (test_size=0.2, random_state=2).
      4-11. Grid-search Logistic Regression, SVM, Decision Tree and KNN with
            10-fold cross-validation and score each on the test set.
      12. Print a table comparing the test accuracies and name the best model.

    Returns:
        tuple: (logreg_cv, svm_cv, tree_cv, knn_cv,
                X_train, X_test, Y_train, Y_test) -- the four fitted
        GridSearchCV objects followed by the split data.
    """
    print("--- Starting Data Preparation and Modeling ---")

    # Load X (Features)
    X = await load_data(URL_X)
    print("X (Features) loaded successfully. Shape:", X.shape)

    # --- TASK 1: Create Y (Target Variable) ---
    data = await load_data(URL_Y_Source)
    Y = data['Class'].to_numpy()
    print("\n--- Task 1 Results (Y) ---")
    print("Y shape:", Y.shape)

    # --- TASK 2: Standardize the Feature Data X ---
    # NOTE(review): the scaler is fitted on the full dataset before the split,
    # so test rows contribute to the scaling statistics. Kept as-is because it
    # is the notebook's task order; fit on X_train only if strict train/test
    # isolation is required.
    transform = preprocessing.StandardScaler()
    X = transform.fit_transform(X)
    print("\n--- Task 2 Results (Standardized X) ---")
    print("Standardized X shape:", X.shape)

    # --- TASK 3: Split the Data into Training and Test Sets ---
    X_train, X_test, Y_train, Y_test = train_test_split(
        X,
        Y,
        test_size=0.2,
        random_state=2,
    )

    print("\n--- Task 3 Results (Train-Test Split) ---")
    print(f"X_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
    print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}")

    # Test accuracy per model, collected for the Task 12 comparison
    model_accuracies = {}

    # --- TASK 4: Logistic Regression Model Training and Hyperparameter Tuning ---
    print("\n--- TASK 4: Logistic Regression (GridSearchCV) ---")
    parameters_lr = {'C': [0.01, 0.1, 1],
                     'penalty': ['l2'],
                     'solver': ['lbfgs']}
    logreg_cv = _make_grid_search(LogisticRegression(), parameters_lr)
    logreg_cv.fit(X_train, Y_train)
    _print_search_summary("LogReg", logreg_cv)

    # --- TASK 5: Calculate Accuracy on Test Data (Logistic Regression) ---
    print("\n--- TASK 5: Test Data Accuracy (Logistic Regression) ---")
    lr_test_accuracy = logreg_cv.score(X_test, Y_test)
    print(f"Accuracy on test data (Logistic Regression): {lr_test_accuracy}")
    model_accuracies['Logistic Regression'] = lr_test_accuracy

    # --- TASK 6: Support Vector Machine (SVM) Model Training and Hyperparameter Tuning ---
    print("\n--- TASK 6: Support Vector Machine (GridSearchCV) ---")
    parameters_svm = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
                      'C': np.logspace(-3, 3, 5),
                      'gamma': np.logspace(-3, 3, 5)}
    svm_cv = _make_grid_search(SVC(), parameters_svm)
    svm_cv.fit(X_train, Y_train)
    _print_search_summary("SVM", svm_cv)

    # --- TASK 7: Calculate Accuracy on Test Data (SVM) ---
    print("\n--- TASK 7: Test Data Accuracy (SVM) ---")
    svm_test_accuracy = svm_cv.score(X_test, Y_test)
    print(f"Accuracy on test data (SVM): {svm_test_accuracy}")
    model_accuracies['SVM'] = svm_test_accuracy

    # --- TASK 8: Decision Tree Classifier Model Training and Hyperparameter Tuning ---
    print("\n--- TASK 8: Decision Tree Classifier (GridSearchCV) ---")
    # 'auto' was an alias of 'sqrt' for classifiers; it was deprecated in
    # scikit-learn 1.1 and is rejected by later releases, which made half of
    # the candidate fits fail. 'sqrt' alone covers the same search space.
    parameters_tree = {'criterion': ['gini', 'entropy'],
                       'splitter': ['best', 'random'],
                       'max_depth': [2*n for n in range(1, 10)],
                       'max_features': ['sqrt'],
                       'min_samples_leaf': [1, 2, 4],
                       'min_samples_split': [2, 5, 10]}
    # NOTE(review): no random_state is set on the tree, so the selected
    # parameters and scores may vary between runs.
    tree_cv = _make_grid_search(DecisionTreeClassifier(), parameters_tree)
    # This is the largest grid in the notebook, so this fit can take a while.
    tree_cv.fit(X_train, Y_train)
    _print_search_summary("Decision Tree", tree_cv)

    # --- TASK 9: Calculate Accuracy on Test Data (Decision Tree) ---
    print("\n--- TASK 9: Test Data Accuracy (Decision Tree) ---")
    tree_test_accuracy = tree_cv.score(X_test, Y_test)
    print(f"Accuracy on test data (Decision Tree): {tree_test_accuracy}")
    model_accuracies['Decision Tree'] = tree_test_accuracy

    # --- TASK 10: K Nearest Neighbors (KNN) Model Training and Hyperparameter Tuning ---
    print("\n--- TASK 10: K Nearest Neighbors (GridSearchCV) ---")
    parameters_knn = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                      'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                      'p': [1, 2]}
    knn_cv = _make_grid_search(KNeighborsClassifier(), parameters_knn)
    knn_cv.fit(X_train, Y_train)
    _print_search_summary("KNN", knn_cv)

    # --- TASK 11: Calculate Accuracy on Test Data (KNN) ---
    print("\n--- TASK 11: Test Data Accuracy (KNN) ---")
    knn_test_accuracy = knn_cv.score(X_test, Y_test)
    print(f"Accuracy on test data (KNN): {knn_test_accuracy}")
    model_accuracies['K-Nearest Neighbors'] = knn_test_accuracy

    # --- TASK 12: Find the Best Performing Model ---
    print("\n--- TASK 12: Model Performance Summary ---")

    # On ties, max() keeps the first model inserted into the dictionary.
    best_model_name = max(model_accuracies, key=model_accuracies.get)
    best_accuracy = model_accuracies[best_model_name]

    # Table of models sorted from highest to lowest test accuracy
    accuracy_df = pd.DataFrame(
        list(model_accuracies.items()),
        columns=['Model', 'Test Accuracy']
    ).sort_values(by='Test Accuracy', ascending=False).reset_index(drop=True)

    print("\nComparison of Model Test Accuracies:")
    print(_format_accuracy_table(accuracy_df))

    print(f"\nConclusion: The best performing model on the test data is '{best_model_name}' with an accuracy of {best_accuracy:.4f}.")

    # Return all necessary objects: the trained GridSearchCV objects and the split data
    return logreg_cv, svm_cv, tree_cv, knn_cv, X_train, X_test, Y_train, Y_test

# Execute the main data preparation and modeling function
logreg_cv_result, svm_cv_result, tree_cv_result, knn_cv_result, X_train, X_test, Y_train, Y_test = await run_data_preparation_and_modeling()

In [None]:
bb