In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Replace 'path/to/yourfile.csv' with your actual file path in Google Drive
file_path = '/content/drive/My Drive/Breast_Cancer_dataset.csv'
df = pd.read_csv(file_path)

# Display the first few rows to check if it's loaded correctly
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,50.0,White,,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,58.0,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,58.0,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,,Positive,Positive,2.0,1,84,Alive
4,47.0,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,,Positive,3.0,1,50,Alive


In [3]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Filter columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

# Print the results
print("Columns with missing values:")
print(columns_with_missing_values)

Columns with missing values:
Age                       201
Race                      402
Marital Status            321
Tumor Size                402
Estrogen Status           201
Regional Node Examined    603
dtype: int64


In [6]:
for column in df.select_dtypes(include=['number']).columns:
    if df[column].isnull().any():  # Check for missing values in the column
        df[column] = df[column].fillna(df[column].median())

for column in df.select_dtypes(include=['object', 'bool']).columns:
    if df[column].isnull().any():  # Check for missing values in the column
        df[column] = df[column].fillna(df[column].mode()[0])

In [7]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Filter columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]

# Print the results
print("Columns with missing values:")
print(columns_with_missing_values)

Columns with missing values:
Series([], dtype: int64)


In [10]:
for column in df.select_dtypes(include=['number']).columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    outliers = df[(df[column] < (Q1 - 1.2 * IQR)) | (df[column] > (Q3 + 1.2 * IQR))]

    # Print or store the outliers for each column
    print(f"Outliers in {column}:")
    print(outliers)

Outliers in Age:
       Age   Race Marital Status T Stage  N Stage 6th Stage  \
702   30.0  White        Single        T2      N2      IIIA   
938   30.0  White       Divorced       T3      N2      IIIA   
1544  30.0  White        Married       T1      N1       IIA   
1888  30.0  White        Single        T2      N1       IIB   
3310  30.0  Other        Married       T2      N1       IIB   

                  differentiate Grade   A Stage  Tumor Size Estrogen Status  \
702   Moderately differentiated     2  Regional        28.0        Positive   
938       Poorly differentiated     3  Regional       105.0        Negative   
1544  Moderately differentiated     2  Regional         9.0        Positive   
1888      Poorly differentiated     3  Regional        50.0        Negative   
3310  Moderately differentiated     2  Regional        25.0        Positive   

     Progesterone Status  Regional Node Examined  Reginol Node Positive  \
702             Positive                    19.0      

In [9]:
outlier_counts = {}  # Dictionary to store outlier counts for each feature

for column in df.select_dtypes(include=['number']).columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    outliers = df[(df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))]

    outlier_counts[column] = len(outliers)  # Store the outlier count for the current feature

# Print the outlier counts for each feature
for feature, count in outlier_counts.items():
    print(f"Feature: {feature}, Outlier Count: {count}")

Feature: Age, Outlier Count: 0
Feature: Tumor Size, Outlier Count: 306
Feature: Regional Node Examined, Outlier Count: 116
Feature: Reginol Node Positive, Outlier Count: 344
Feature: Survival Months, Outlier Count: 18


In [11]:
import numpy as np
from scipy import stats

outlier_counts = {}  # Dictionary to store outlier counts for each feature

for column in df.select_dtypes(include=['number']).columns:
    z_scores = np.abs(stats.zscore(df[column]))  # Calculate Z-scores
    outliers = df[(z_scores > 3)]  # Identify outliers with Z-score > 3

    outlier_counts[column] = len(outliers)  # Store outlier count for the current feature

# Print the outlier counts for each feature
for feature, count in outlier_counts.items():
    print(f"Feature: {feature}, Outlier Count: {count}")

Feature: Age, Outlier Count: 0
Feature: Tumor Size, Outlier Count: 78
Feature: Regional Node Examined, Outlier Count: 42
Feature: Reginol Node Positive, Outlier Count: 101
Feature: Survival Months, Outlier Count: 4


In [17]:
df_with_outliers = df.copy()

import numpy as np
from scipy import stats

df_without_outliers = df_with_outliers.copy()  # Create a copy outside the loop

for column in df_without_outliers.select_dtypes(include=['number']).columns:
    z_scores = np.abs(stats.zscore(df_without_outliers[column]))  # Calculate Z-scores
    df_without_outliers = df_without_outliers[(z_scores <= 3)]  # Keep data points with Z-score <= 3

In [18]:
# Select the numerical features you want to normalize
numerical_features = df_with_outliers.select_dtypes(include=['number']).columns

# Calculate the minimum and maximum values for each feature
min_values = df_with_outliers[numerical_features].min()
max_values = df_with_outliers[numerical_features].max()

# Apply min-max normalization using pandas and NumPy
df_with_outliers[numerical_features] = (df_with_outliers[numerical_features] - min_values) / (max_values - min_values)
df_without_outliers[numerical_features] = (df_without_outliers[numerical_features] - min_values) / (max_values - min_values)

In [33]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import OneHotEncoder

# Function to process a DataFrame (with or without outliers)
def process_dataframe(df):
    df_processed = df.copy()  # Create a copy for processing

    status_column = df_processed['Status']
    df_processed = df_processed.drop('Status', axis=1)

    # Identify categorical features
    categorical_features = df_processed.select_dtypes(include=['object']).columns

    # Create a OneHotEncoder object
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    # Fit the encoder to the categorical features
    encoder.fit(df_processed[categorical_features])

    # Transform the categorical features into one-hot encoded features
    encoded_features = encoder.transform(df_processed[categorical_features])

    # Create a DataFrame with the encoded features
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_features))

    # Concatenate the encoded features with the numerical features
    df_processed = pd.concat([df_processed.drop(categorical_features, axis=1), encoded_df], axis=1)

    # Select all numerical features (including one-hot encoded)
    numerical_features = df_processed.select_dtypes(include=['number']).columns

    # Apply min-max normalization
    min_values = df_processed[numerical_features].min()
    max_values = df_processed[numerical_features].max()
    df_processed[numerical_features] = (df_processed[numerical_features] - min_values) / (max_values - min_values)

    df_processed['Status'] = status_column
    return df_processed

# Process both DataFrames
df_with_outliers_processed = process_dataframe(df_with_outliers)
df_without_outliers_processed = process_dataframe(df_without_outliers)



In [34]:
# ... (previous code)

# Print all columns of df_with_outliers_processed
print("All Columns (with outliers):", df_with_outliers_processed.columns.tolist())

# Print all columns of df_without_outliers_processed
print("All Columns (without outliers):", df_without_outliers_processed.columns.tolist())

All Columns (with outliers): ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Race_Black', 'Race_Other', 'Race_White', 'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Separated', 'Marital Status_Single ', 'Marital Status_Widowed', 'T Stage _T1', 'T Stage _T2', 'T Stage _T3', 'T Stage _T4', 'N Stage_N1', 'N Stage_N2', 'N Stage_N3', '6th Stage_IIA', '6th Stage_IIB', '6th Stage_IIIA', '6th Stage_IIIB', '6th Stage_IIIC', 'differentiate_Moderately differentiated', 'differentiate_Poorly differentiated', 'differentiate_Undifferentiated', 'differentiate_Well differentiated', 'Grade_ anaplastic; Grade IV', 'Grade_1', 'Grade_2', 'Grade_3', 'A Stage_Distant', 'A Stage_Regional', 'Estrogen Status_Negative', 'Estrogen Status_Positive', 'Progesterone Status_Negative', 'Progesterone Status_Positive', 'Status']
All Columns (without outliers): ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive', 'Survival Months', 'Rac

In [35]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

df_with_outliers_processed.head()



Unnamed: 0,Age,Tumor Size,Regional Node Examined,Reginol Node Positive,Survival Months,Race_Black,Race_Other,Race_White,Marital Status_Divorced,Marital Status_Married,...,Grade_1,Grade_2,Grade_3,A Stage_Distant,A Stage_Regional,Estrogen Status_Negative,Estrogen Status_Positive,Progesterone Status_Negative,Progesterone Status_Positive,Status
0,0.974359,0.021583,0.383333,0.0,0.556604,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
1,0.512821,0.244604,0.216667,0.088889,0.575472,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
2,0.717949,0.446043,0.216667,0.133333,0.698113,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
3,0.717949,0.172662,0.016667,0.0,0.783019,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive
4,0.435897,0.28777,0.033333,0.0,0.462264,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,Alive


In [45]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Function to perform RFECV and print selected features
def rfecv_selection(df_processed):
    X = df_processed.drop('Status', axis=1)  # Features
    y = df_processed['Status']  # Target variable

    model = LogisticRegression()

    selector = RFECV(model, step=1, cv=5, scoring='accuracy')

    selector.fit(X, y)

    selected_features = X.columns[selector.support_]
    print("Selected Features:", selected_features)
    print("Optimal Number of Features:", selector.n_features_)
    return selected_features

# Remove rows with NaN values from both DataFrames
df_with_outliers_processed_na = df_with_outliers_processed.dropna()
df_without_outliers_processed_na = df_without_outliers_processed.dropna()


# Apply RFECV to both DataFrames
selected_features_with_outliers = rfecv_selection(df_with_outliers_processed_na)
selected_features_without_outliers = rfecv_selection(df_without_outliers_processed_na)

Selected Features: Index(['Age', 'Regional Node Examined', 'Reginol Node Positive',
       'Survival Months', 'Race_Black', 'Race_Other',
       'Marital Status_Separated', 'T Stage _T1', 'T Stage _T2', 'T Stage _T4',
       'N Stage_N1', 'N Stage_N2', 'N Stage_N3', '6th Stage_IIB',
       '6th Stage_IIIB', '6th Stage_IIIC',
       'differentiate_Poorly differentiated', 'differentiate_Undifferentiated',
       'differentiate_Well differentiated', 'Grade_ anaplastic; Grade IV',
       'Grade_1', 'Estrogen Status_Negative', 'Progesterone Status_Negative'],
      dtype='object')
Optimal Number of Features: 23
Selected Features: Index(['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive',
       'Survival Months', 'Race_Black', 'Race_Other',
       'Marital Status_Divorced', 'Marital Status_Married',
       'Marital Status_Separated', 'Marital Status_Single ',
       'Marital Status_Widowed', 'T Stage _T4', 'N Stage_N3', '6th Stage_IIIB',
       'A Stage_Distant', 'Estrog

In [49]:
import pandas as pd

def create_dataframe_iteratively(original_df, selected_features, target_variable='Status'):
    new_df = pd.DataFrame(original_df[target_variable])  # Start with the target variable

    for feature in selected_features:
        new_df[feature] = original_df[feature]  # Add each selected feature iteratively

        # You can perform analysis or visualization here with the current set of features
        # For example, print the current DataFrame shape:
        print(f"Added feature: {feature}, DataFrame shape: {new_df.shape}")

    return new_df  # Return the final DataFrame with all selected features

# Example usage:
df_with_outliers_selected = create_dataframe_iteratively(df_with_outliers_processed, selected_features_with_outliers)
df_without_outliers_selected = create_dataframe_iteratively(df_without_outliers_processed, selected_features_without_outliers)

Added feature: Age, DataFrame shape: (4024, 2)
Added feature: Regional Node Examined, DataFrame shape: (4024, 3)
Added feature: Reginol Node Positive, DataFrame shape: (4024, 4)
Added feature: Survival Months, DataFrame shape: (4024, 5)
Added feature: Race_Black, DataFrame shape: (4024, 6)
Added feature: Race_Other, DataFrame shape: (4024, 7)
Added feature: Marital Status_Separated, DataFrame shape: (4024, 8)
Added feature: T Stage _T1, DataFrame shape: (4024, 9)
Added feature: T Stage _T2, DataFrame shape: (4024, 10)
Added feature: T Stage _T4, DataFrame shape: (4024, 11)
Added feature: N Stage_N1, DataFrame shape: (4024, 12)
Added feature: N Stage_N2, DataFrame shape: (4024, 13)
Added feature: N Stage_N3, DataFrame shape: (4024, 14)
Added feature: 6th Stage_IIB, DataFrame shape: (4024, 15)
Added feature: 6th Stage_IIIB, DataFrame shape: (4024, 16)
Added feature: 6th Stage_IIIC, DataFrame shape: (4024, 17)
Added feature: differentiate_Poorly differentiated, DataFrame shape: (4024, 18)

In [53]:
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def knn(X_train, y_train, X_test, k=5):
    predictions = []
    for test_point in X_test:
        distances = [euclidean_distance(test_point, train_point) for train_point in X_train]
        k_nearest_indices = np.argsort(distances)[:k]
        k_nearest_labels = [y_train[i] for i in k_nearest_indices]
        # Predict the most frequent label among the k-nearest neighbors
        prediction = Counter(k_nearest_labels).most_common(1)[0][0]
        predictions.append(prediction)
    return predictions

# Example usage:
# predictions = knn(X_train, y_train, X_test, k=5)

for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values  # Features as NumPy array
    y = df['Status'].values  # Target variable as NumPy array

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test_size and random_state as needed

    # Apply KNN and print predictions
    predictions = knn(X_train, y_train, X_test, k=5)  # Adjust k as needed
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name}: {accuracy:.2f}")


Accuracy for DataFrame with outliers: 0.88
Accuracy for DataFrame without outliers: 0.87


In [55]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Apply Naive Bayes to both DataFrames and print accuracy
for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values
    y = df['Status'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the Naive Bayes model
    model = GaussianNB()
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name} (Naive Bayes): {accuracy:.2f}")

Accuracy for DataFrame with outliers (Naive Bayes): 0.84
Accuracy for DataFrame without outliers (Naive Bayes): 0.84


In [56]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Apply C4.5 Decision Tree to both DataFrames and print accuracy
for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values
    y = df['Status'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the C4.5 Decision Tree model
    model = DecisionTreeClassifier(criterion='entropy')  # Use 'entropy' for C4.5
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name} (C4.5 Decision Tree): {accuracy:.2f}")

Accuracy for DataFrame with outliers (C4.5 Decision Tree): 0.85
Accuracy for DataFrame without outliers (C4.5 Decision Tree): 0.84


In [58]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Apply Random Forest to both DataFrames and print accuracy
for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values
    y = df['Status'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators and random_state as needed
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name} (Random Forest): {accuracy:.2f}")

Accuracy for DataFrame with outliers (Random Forest): 0.91
Accuracy for DataFrame without outliers (Random Forest): 0.89


In [59]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# Apply Gradient Boosting to both DataFrames and print accuracy
for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values
    y = df['Status'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the Gradient Boosting model
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)  # Adjust hyperparameters as needed
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name} (Gradient Boosting): {accuracy:.2f}")

Accuracy for DataFrame with outliers (Gradient Boosting): 0.91
Accuracy for DataFrame without outliers (Gradient Boosting): 0.90


In [60]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Apply Neural Network to both DataFrames and print accuracy
for df, df_name in [(df_with_outliers_selected, 'with outliers'), (df_without_outliers_selected, 'without outliers')]:
    X = df.drop('Status', axis=1).values
    y = df['Status'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the Neural Network model
    model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=42, max_iter=2000)  # Adjust hyperparameters as needed
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = np.sum(predictions == y_test) / len(y_test)
    print(f"Accuracy for DataFrame {df_name} (Neural Network): {accuracy:.2f}")

Accuracy for DataFrame with outliers (Neural Network): 0.90
Accuracy for DataFrame without outliers (Neural Network): 0.89


In [61]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Data preparation (using df_without_outliers_selected as an example)
X = df_with_outliers_selected.drop('Status', axis=1).values
y = df_with_outliers_selected['Status'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

# Create the Random Forest model
model = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Print the results
print("Random Forest - Best Hyperparameters:", best_params)
print("Random Forest - Best Cross-Validation Score:", best_score)
print("Random Forest - Test Accuracy:", test_accuracy)

Random Forest - Best Hyperparameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Random Forest - Best Cross-Validation Score: 0.9027626710972442
Random Forest - Test Accuracy: 0.9105590062111801


In [62]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Data preparation (using df_without_outliers_selected as an example)
X = df_with_outliers_selected.drop('Status', axis=1).values
y = df_with_outliers_selected['Status'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# Create the Gradient Boosting model
model = GradientBoostingClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Print the results
print("Gradient Boosting - Best Hyperparameters:", best_params)
print("Gradient Boosting - Best Cross-Validation Score:", best_score)
print("Gradient Boosting - Test Accuracy:", test_accuracy)

Gradient Boosting - Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Gradient Boosting - Best Cross-Validation Score: 0.9012103590506457
Gradient Boosting - Test Accuracy: 0.906832298136646
