In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
num_features = ["age", "income"]
cat_features = ["gender", "city"]

In [12]:
num_transformer = StandardScaler()
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [13]:
preprocessor = ColumnTransformer(([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
]))

In [14]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [15]:
param_grid = {
 'regressor__n_estimators': [50, 100, 200],
 'regressor__max_depth': [None, 10, 20],
 'regressor__min_samples_split': [2, 5]
}

In [25]:
from sklearn.model_selection import train_test_split
X_train_df, X_test_df, y_train, y_test = train_test_split(X_df, y, test_size=0.33, random_state=42)

In [27]:
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=20, n_informative=5, random_state=42)

# Create feature names
feature_names = [f'feature_{i}' for i in range(X.shape[1])]
X_df = pd.DataFrame(X, columns=feature_names)

# Define numeric and categorical columns
# Let's assume first 10 are numeric, next 10 are categorical just for demo
num_features = feature_names[:10]
cat_features = feature_names[10:]

# Create preprocessing pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

# Create pipeline
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

# Define hyperparameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20],
    'classifier__min_samples_split': [2, 5]
}

# Perform grid search
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=1)
grid_search.fit(X_train, y_train)

# Print results
print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)


Best parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best accuracy: 0.6842857142857143


In [28]:
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Best Parameters: {'classifier__max_depth': 20, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best R2 Score: 0.6842857142857143


In [29]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns

In [30]:
# Load dataset
df = sns.load_dataset("tips")

In [31]:
# Split features and target
X = df.drop("total_bill", axis=1)
y = df["total_bill"]

In [32]:
# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Identify column types
categorical_cols = X.select_dtypes(include="object").columns
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns

In [34]:
# Define preprocessor
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

In [35]:
# Create pipeline
pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42))
])

In [36]:
pipe

In [37]:
pipe.fit(X_train, y_train)

In [38]:
y_pred = pipe.predict(X_test)

In [39]:
y_pred

array([27.86647917, 13.50768614, 21.86007167, 33.8086    , 13.50768614,
       13.50768614, 15.42566769, 13.90268983, 20.50314917, 21.13048012,
       21.13048012, 11.14779678, 15.17117511, 13.50768614,  9.9104    ,
       14.40573   , 23.66018571, 17.35757484, 13.90268983, 24.433     ,
       24.16683333, 23.688115  , 17.35757484, 11.14779678, 27.61494357,
       17.09858333, 11.5768    , 25.509925  , 21.86007167, 26.5451    ,
       23.66018571, 11.14779678, 20.87563333, 26.14090583, 27.61494357,
       28.10826357, 15.17117511, 32.2197    , 18.17035714, 13.50768614,
       10.3392    , 13.50768614, 16.31225348, 14.57425   , 13.50768614,
        8.174075  , 12.6251    , 20.27120833, 11.14779678])