# Pipeline with hyperparameter tuning

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as pltng
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import accuracy_score,r2_score


# Load seaborn's example "tips" dataset and print its column summary.
tip = sns.load_dataset("tips")
tip.info()

# Feature columns (x1) and the regression target (y1). The target is the
# float-valued tip amount and is used as-is.
FEATURE_COLUMNS = ['total_bill', 'sex', 'smoker']
TARGET_COLUMN = 'tip'
x1 = tip[FEATURE_COLUMNS]
y1 = tip[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x1,
    y1,
    test_size=0.2,
    random_state=42,
)

# Columns routed to each preprocessing branch.
numeric_features1 = ['total_bill']
categorical_features1 = ['sex', 'smoker']

# Numeric branch: fill missing values with the column median.
num_transformation1 = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on a category that was not seen during fit.
cat_transformation1 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own columns and concatenate the results.
preprocessor1 = ColumnTransformer([
    ('num1', num_transformation1, numeric_features1),
    ('cat1', cat_transformation1, categorical_features1),
])

# Full modelling pipeline: preprocessing followed by a random-forest
# regressor. The step name 'model' is what the "model__" prefixes in the
# hyperparameter grid refer to.
pipeline1 = Pipeline([
    ('preprocessor1', preprocessor1),
    ('model', RandomForestRegressor(random_state=42)),
])

# Search space. Each key is "<step name>__<parameter>", so "model__..."
# targets the pipeline step named 'model'.
hyperparameters = dict(
    model__n_estimators=[50, 100, 200, 400, 500],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[5, 10, 20],
)

# Exhaustive grid search: 5 * 3 * 3 = 45 candidates, each scored by R^2 with
# 3-fold cross-validation, using all available cores.
grid_search = GridSearchCV(
    pipeline1,
    hyperparameters,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(x_train1, y_train1)

# Best pipeline found by the search (GridSearchCV refits it on the whole
# training set by default, refit=True).
best_model = grid_search.best_estimator_

# The fitted pipeline runs its own preprocessing inside predict(), so the raw
# test features are passed in directly; no separate preprocessing step is
# needed here.
y_pred1 = best_model.predict(x_test1)

# Coefficient of determination (R^2) on the held-out test set. This is a
# regression metric, not a classification accuracy; `accuracy` is kept only
# as an alias for any later code that still reads the old name.
test_r2 = r2_score(y_test1, y_pred1)
accuracy = test_r2
print("The R2 score:", test_r2)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB
Fitting 3 folds for each of 45 candidates, totalling 135 fits
The R2 score: 0.1977348609288555
