In [2]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import seaborn as sns
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
X=df.iloc[:,1:]
y=df['total_bill']

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [5]:
## Pieplining
numeric_preprocessor = Pipeline(steps=[
 ("imputation_mean", SimpleImputer(missing_values=np.nan,
strategy="mean")),
 ("scaler", StandardScaler())
])

In [8]:
from sklearn import set_config
set_config(display='diagram')
numeric_preprocessor

categorical_preprocessor = Pipeline(steps=[
 ("imputation_constant", SimpleImputer(missing_values=np.nan,
strategy="constant", fill_value="missing")),
 ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
categorical_preprocessor

0,1,2
,steps,"[('imputation_constant', ...), ('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [9]:
from sklearn.compose import ColumnTransformer

# Define categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Correct: Use ColumnTransformer to apply preprocessors column-wise
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_cols),
        ("numerical", numeric_preprocessor, numerical_cols),
    ]
)

pipe = Pipeline(
    [("preprocessor", preprocessor), ("regressor", RandomForestRegressor())]
)

pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
X_train

Unnamed: 0,tip,sex,smoker,day,time,size
10,1.71,Male,No,Sun,Dinner,2
15,3.92,Male,No,Sun,Dinner,2
169,2.00,Female,Yes,Sat,Dinner,2
99,1.50,Male,No,Fri,Dinner,2
58,1.76,Male,Yes,Sat,Dinner,2
...,...,...,...,...,...,...
211,5.16,Male,Yes,Sat,Dinner,4
209,2.23,Female,Yes,Sat,Dinner,2
41,2.54,Male,No,Sun,Dinner,2
102,2.50,Female,Yes,Sat,Dinner,3


In [12]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [13]:
df = sns.load_dataset('tips')

In [14]:
X = df.drop('total_bill', axis=1) 
y = df['total_bill'] > df['total_bill'].median()  # Convert to binary classification for demo

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
random_state=42) 

In [16]:
# Identify columns 
categorical_cols = X.select_dtypes(include='object').columns.tolist() 
numerical_cols = X.select_dtypes(include=['int64', 
'float64']).columns.tolist() 
# Pipelines 
numeric_preprocessor = Pipeline([ 
('imputer', SimpleImputer(strategy='mean')), 
('scaler', StandardScaler()) 
]) 
categorical_preprocessor = Pipeline([ 
('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
('encoder', OneHotEncoder(handle_unknown='ignore')) 
]) 
preprocessor = ColumnTransformer([ 
('num', numeric_preprocessor, numerical_cols), 
('cat', categorical_preprocessor, categorical_cols) 
]) 
# Define models 
models = { 
'Decision Tree': DecisionTreeClassifier(), 
    'Random Forest': RandomForestClassifier(), 
    'Logistic Regression': LogisticRegression(max_iter=200) 
}

In [17]:
# Dictionary to store results 
results = {} 
 
# Training and Evaluation with Pipeline 
for name, model in models.items(): 
    print(f"\nTraining Model: {name}") 
     
    # Create pipeline 
    pipe = Pipeline(steps=[ 
        ('preprocessor', preprocessor), 
        ('classifier', model) 
    ]) 
     
    # Fit 
    pipe.fit(X_train, y_train) 
     
    # Predict 
    train_pred = pipe.predict(X_train) 
    test_pred = pipe.predict(X_test) 
     
    # Metrics 
    train_acc = accuracy_score(y_train, train_pred) 
    test_acc = accuracy_score(y_test, test_pred) 
     
    results[name] = { 
        'model': pipe, 
        'train_accuracy': train_acc, 
        'test_accuracy': test_acc, 
        'train_predictions': train_pred, 
        'test_predictions': test_pred, 
        'confusion_matrix': confusion_matrix(y_test, test_pred), 
        'classification_report': classification_report(y_test, test_pred) 
    }


Training Model: Decision Tree

Training Model: Random Forest

Training Model: Logistic Regression


In [19]:
print("\nMODEL PERFORMANCE SUMMARY:") 
for name, info in results.items(): 
    print(f"\n=== {name} ===") 
    print(f"Training Accuracy: {info['train_accuracy']:.4f}") 
    print(f"Testing Accuracy: {info['test_accuracy']:.4f}") 
    print("Confusion Matrix (Test):") 
    print(info['confusion_matrix']) 
    print("Classification Report (Test):") 
    print(info['classification_report']) 
 
# Best model by test accuracy 
best_model = max(results.items(), key=lambda x: x[1]['test_accuracy']) 
print(f"\nBest Model: {best_model[0]} with Test Accuracy: {best_model[1]['test_accuracy']:.4f}")


MODEL PERFORMANCE SUMMARY:

=== Decision Tree ===
Training Accuracy: 0.9231
Testing Accuracy: 0.7551
Confusion Matrix (Test):
[[21  5]
 [ 7 16]]
Classification Report (Test):
              precision    recall  f1-score   support

       False       0.75      0.81      0.78        26
        True       0.76      0.70      0.73        23

    accuracy                           0.76        49
   macro avg       0.76      0.75      0.75        49
weighted avg       0.76      0.76      0.75        49


=== Random Forest ===
Training Accuracy: 0.9231
Testing Accuracy: 0.7551
Confusion Matrix (Test):
[[21  5]
 [ 7 16]]
Classification Report (Test):
              precision    recall  f1-score   support

       False       0.75      0.81      0.78        26
        True       0.76      0.70      0.73        23

    accuracy                           0.76        49
   macro avg       0.76      0.75      0.75        49
weighted avg       0.76      0.76      0.75        49


=== Logistic Regressi

In [20]:
# define categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [23]:
preprocessor = ColumnTransformer(
    transformers=[ 
        ("categorical", categorical_preprocessor, categorical_cols),
        ("numerical", numeric_preprocessor, numerical_cols),
    ])
preprocessor

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True
