In [83]:
import pickle

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [84]:
# Fetch the bundled Iris dataset; as_frame=True returns pandas objects
data = load_iris(as_frame=True)
# The "frame" entry combines the feature columns and the target column
df = data["frame"]
# Preview the first five rows
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [95]:
# Write the full frame to disk; the row index is written as an unnamed first column
df.to_csv('iris.csv', index=True)

In [85]:

# Inspect DataFrame columns and structure
print("Columns in DataFrame:", df.columns.tolist())
# DataFrame.info() prints its summary itself and returns None, so call it
# directly -- wrapping it in print() appends a stray "None" line to the output.
df.info()

Columns in DataFrame: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None


In [86]:
# Separate the class labels from the feature columns
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [87]:
# All columns of X are handed to the numeric transformer
numeric_features = X.columns

# Single-step pipeline that applies StandardScaler
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Apply the numeric transformer to the selected columns
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
])

In [88]:
# Chain preprocessing and the decision-tree classifier into a single estimator.
# The step names matter: 'classifier' is the prefix used for hyperparameter names.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed so repeated fits build the same tree
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

In [89]:
# Candidate hyperparameters; the `classifier__` prefix routes each value to the
# pipeline step named 'classifier'
param_grid = dict(
    classifier__max_depth=[3, 5, 10],          # maximum depth of the tree
    classifier__min_samples_split=[2, 5, 10],  # samples a node needs before it may split
    classifier__min_samples_leaf=[1, 2, 4],    # samples required in every leaf
)

# Exhaustive search over the grid: 3-fold cross-validation, scored on accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,  # run fits in parallel on all available cores
    verbose=2,
)

# Fit every candidate on the training split
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters found: ", grid_search.best_params_)

# Keep the best pipeline found by the search for prediction
best_model = grid_search.best_estimator_


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters found:  {'classifier__max_depth': 3, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2}


In [90]:
# Predict with the tuned pipeline on the training split first, then the test split
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [91]:
# Per-class precision / recall / F1 on the training split
print(
    "Classification Report on Training Data:",
    classification_report(y_train, y_train_pred, target_names=data.target_names),
    sep="\n",
)

Classification Report on Training Data:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        40
  versicolor       0.91      0.98      0.94        41
   virginica       0.97      0.90      0.93        39

    accuracy                           0.96       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.96      0.96      0.96       120



In [92]:
# Per-class precision / recall / F1 on the held-out test split
print("Classification Report on Testing Data:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        target_names=data.target_names,
    )
)

Classification Report on Testing Data:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [93]:
# NOTE(review): this cell prints the same test-set classification report as the
# previous cell; it looks like a leftover and can probably be removed.
print("Classification Report on Testing Data:")
print(classification_report(y_test, y_test_pred, target_names=data.target_names))

Classification Report on Testing Data:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00         9
   virginica       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [94]:
# Persist the tuned pipeline (preprocessing + classifier) so it can be reloaded
# for inference without retraining.
# NOTE(review): the 'xgb_' prefix is misleading -- the estimator is a
# DecisionTreeClassifier. The file name is kept unchanged in case other code
# loads the model by this path; rename both sides together if desired.
MODEL_PATH = 'xgb_model_pipeline.pkl'

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(best_model, f)

# Report the path that was actually written; the previous message named a
# different file ('model_pipeline.pkl') from the one created.
print(f"Model pipeline saved as '{MODEL_PATH}'.")


Model pipeline saved as 'model_pipeline.pkl'.
