# Best Model  Selection in Python
To select the best model when using multiple models in a pipeline,you can use techniques like  cross-validation and evalution metrics to campare their performence. Here's an example of how to accomplish this on the titanic data.

In [11]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline  import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier


# import the data
df = sns.load_dataset('titanic')

#select fratures and target variable
X = df[['pclass', 'sex', 'age', 'fare', 'embarked']]
y = df['survived']

# split the data into train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
   
]

best_model = None
best_accuracy = 0.0

# Iterate over the models and evaluate their performance

for name, model in models:
    # Create a pipeline for each model
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])

    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)

    # Calculate mean accuracy
    mean_accuracy = scores.mean()

    # fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # make predictions on the test data
    y_pred = pipeline.predict(X_test)

    # calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print('Test Accuracy',accuracy)
    print()


    # Compare to the best accuracy
    if mean_accuracy > best_accuracy:
        best_model = pipeline
        best_accuracy = mean_accuracy

# Print the best model
print('Best Model',best_model)

Model: Random Forest
Cross-validation Accuracy: 0.7991529597163399
Test Accuracy 0.8379888268156425

Model: Gradient Boosting
Cross-validation Accuracy: 0.8076135132473162
Test Accuracy 0.7988826815642458

Best Model Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', GradientBoostingClassifier(random_state=42))])
