In [2]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


In [3]:
import seaborn as sns

In [4]:
df=sns.load_dataset('tips')

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
X=df.iloc[:,1:]
y=df['total_bill']

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

In [8]:
## Pieplining
numeric_preprocessor = Pipeline( steps=[ ("imputation_mean", SimpleImputer(missing_values=np.nan,strategy="mean")),("scaler", StandardScaler()), ])


In [9]:
from sklearn import set_config
set_config(display='diagram')
numeric_preprocessor

0,1,2
,steps,"[('imputation_mean', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [10]:
categorical_preprocessor = Pipeline(steps=[ ( "imputation_constant", SimpleImputer(fill_value="missing", strategy="constant"), ),("onehot", OneHotEncoder(handle_unknown="ignore")),])


In [11]:
preprocessor=Pipeline(steps=[("categorical",categorical_preprocessor),("numerical",numeric_preprocessor)])


In [12]:
preprocessor

0,1,2
,steps,"[('categorical', ...), ('numerical', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputation_constant', ...), ('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,steps,"[('imputation_mean', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [13]:
pipe=Pipeline([("preprocessor",preprocessor),("regressor",RandomForestRegressor())])
pipe

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('categorical', ...), ('numerical', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('imputation_constant', ...), ('onehot', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,steps,"[('imputation_mean', ...), ('scaler', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
from sklearn.compose import ColumnTransformer

# Define categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Correct: Use ColumnTransformer to apply preprocessors column-wise
preprocessor = ColumnTransformer(transformers=[    ("categorical", categorical_preprocessor, categorical_cols),  ("numerical", numeric_preprocessor, numerical_cols), ])

pipe = Pipeline( [("preprocessor", preprocessor), ("regressor", RandomForestRegressor())])

pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
X_train

Unnamed: 0,tip,sex,smoker,day,time,size
92,1.00,Female,Yes,Fri,Dinner,2
163,2.00,Male,No,Sun,Dinner,2
174,4.00,Male,Yes,Sun,Dinner,2
201,2.01,Female,Yes,Thur,Lunch,2
185,5.00,Male,No,Sun,Dinner,5
...,...,...,...,...,...,...
132,1.50,Female,No,Thur,Lunch,2
68,2.01,Male,No,Sat,Dinner,2
173,3.18,Male,Yes,Sun,Dinner,2
113,2.55,Male,No,Sun,Dinner,2


In [17]:
import numpy as np
import pandas as pd
import seaborn as sns


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [22]:
from sklearn.metrics import confusion_matrix


In [23]:
from sklearn.metrics import accuracy_score, classification_report
confusion_matrix

<function sklearn.metrics._classification.confusion_matrix(y_true, y_pred, *, labels=None, sample_weight=None, normalize=None)>

In [24]:
# Load dataset
df = sns.load_dataset('tips')

In [26]:
# Features and target
X = df.drop('total_bill', axis=1)
y = df['total_bill'] > df['total_bill'].median() # Convert to binaryclassification for demo

In [27]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)

In [28]:
# Identify columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64',
'float64']).columns.tolist()

In [29]:
# Pipelines
numeric_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='mean')),
 ('scaler', StandardScaler())
])

In [30]:
categorical_preprocessor = Pipeline([
 ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
 ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [31]:
preprocessor = ColumnTransformer([
 ('num', numeric_preprocessor, numerical_cols),
 ('cat', categorical_preprocessor, categorical_cols)
])

In [32]:
# Define models
models = {
 'Decision Tree': DecisionTreeClassifier(),
 'Random Forest': RandomForestClassifier(),
 'Logistic Regression': LogisticRegression(max_iter=200)
}
# Dictionary to store results
results = {}


In [33]:
# Dictionary to store results
results = {}

In [34]:
# Training and Evaluation with Pipeline
for name, model in models.items():
 print(f"\nTraining Model: {name}")

 # Create pipeline
 pipe = Pipeline(steps=[
 ('preprocessor', preprocessor),
 ('classifier', model)
 ])


Training Model: Decision Tree

Training Model: Random Forest

Training Model: Logistic Regression


In [36]:
# Fit
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,200


In [39]:
# Predict
train_pred = pipe.predict(X_train)
test_pred = pipe.predict(X_test)

In [41]:
# Metrics
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

In [42]:
results[name] = {
 'model': pipe,
 'train_accuracy': train_acc,
 'test_accuracy': test_acc,
 'train_predictions': train_pred,
 'test_predictions': test_pred,
 'confusion_matrix': confusion_matrix(y_test, test_pred),
 'classification_report': classification_report(y_test, test_pred)
 }

In [44]:
# Summary
print("\nMODEL PERFORMANCE SUMMARY:")
for name, info in results.items():
 print(f"\n=== {name} ===")
 print(f"Training Accuracy: {info['train_accuracy']:.4f}")
 print(f"Testing Accuracy: {info['test_accuracy']:.4f}")
 print("Confusion Matrix (Test):")
 print(info['confusion_matrix'])
 print("Classification Report (Test):")
 print(info['classification_report'])


MODEL PERFORMANCE SUMMARY:

=== Logistic Regression ===
Training Accuracy: 0.7487
Testing Accuracy: 0.7755
Confusion Matrix (Test):
[[23  3]
 [ 8 15]]
Classification Report (Test):
              precision    recall  f1-score   support

       False       0.74      0.88      0.81        26
        True       0.83      0.65      0.73        23

    accuracy                           0.78        49
   macro avg       0.79      0.77      0.77        49
weighted avg       0.78      0.78      0.77        49



In [46]:
# Best model by test accuracy
best_model = max(results.items(), key=lambda x: x[1]['test_accuracy'])
print(f"\nBest Model: {best_model[0]} with Test Accuracy:{best_model[1]['test_accuracy']:.4f}")


Best Model: Logistic Regression with Test Accuracy:0.7755
