In [14]:
import pandas as pd
import numpy as np

In [15]:

iris = pd.read_csv("data\\Iris.csv")

In [16]:
iris

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [17]:

X=iris.drop(labels=["Species"],axis=1)

In [18]:
# There are 4 numeric independent variable
numerical_cols=X.select_dtypes(exclude='object').columns
numerical_cols

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

In [19]:
y=iris[["Species"]]

In [20]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:

# Creating the pipeline for preprocessing for input numeric variables
num_pipeline=Pipeline(
    steps=[
        
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
)  

In [21]:
# Creating the column transformation for the variables (here numeric variables only as they are dependent variables)
preprocessor=ColumnTransformer(
    [
        
        ('num_pipeline',num_pipeline,numerical_cols)
    ]
)

In [22]:
## Train test split

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [23]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [24]:
## Model Training

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    precision_score, 
    recall_score,
    #roc_auc_score
)

# If confusion_matrix,classification_report are taken then we can use only them

In [25]:

import warnings
warnings.filterwarnings("ignore")

In [26]:
def evaluate_model(models_clf, param_grids):
    
    model_list_clf = []
    results_clf = []
    
    
    for name, model in models_clf.items():
        print("--"*20 + f"Evaluating {name}"+"--"*20)

        # Grid Search multi class
        grid_search = GridSearchCV(model, param_grid=param_grids[name], scoring='f1_weighted', cv=10) 
        grid_search.fit(X_train, y_train)

        # Best Model & Predictions
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)

        # Evaluation for multiclass classification
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted') 
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        print("Best Parameters:", grid_search.best_params_)
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1)
        
        # Store results for comparison
        model_list_clf.append(name)
        results_clf.append({'Model': name, 'Best Parameters': grid_search.best_params_, 
                        'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1})
    
    # Create a DataFrame for easy comparison
    results_df = pd.DataFrame(results_clf)    
        
    # Find the best classification model
    best_clf_model_idx = results_df['F1 Score'].idxmax()
    best_clf_model = results_df.iloc[best_clf_model_idx] 
    
    # Get predictions from the best model
    best_model_name = best_clf_model['Model']
    best_model = models_clf[best_model_name].set_params(**best_clf_model['Best Parameters'])
    best_model.fit(X_train, y_train)
    best_model_predictions = best_model.predict(X_test)   
    
    return {"model_list_clf": model_list_clf, 
            "results_clf": results_clf, 
            "results_df": results_df, 
            "best_clf_model_idx": best_clf_model_idx, 
            "best_clf_model": best_clf_model,
            "best_model_predictions": best_model_predictions }  

In [36]:
model_from_grid_srch["best_clf_model"]["Best Parameters"]

{'max_depth': None, 'n_estimators': 100, 'random_state': 30}

In [38]:
model_from_grid_srch["best_clf_model_idx"]

1

In [37]:
model_from_grid_srch["best_clf_model_idx"].set_params(**model_from_grid_srch["best_clf_model"]["Best Parameters"])

AttributeError: 'int' object has no attribute 'set_params'

In [51]:
models_clf[1]

KeyError: 1

In [27]:
models_clf = {
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier()
}

param_grids = {
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    'RandomForestClassifier': {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10], 'random_state': [30]}
}

In [28]:
evaluate_model(models_clf, param_grids)

----------------------------------------Evaluating SVC----------------------------------------
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.9111111111111111
Precision: 0.9134199134199135
Recall: 0.9111111111111111
F1 Score: 0.9096296296296297
----------------------------------------Evaluating RandomForestClassifier----------------------------------------
Best Parameters: {'max_depth': None, 'n_estimators': 100, 'random_state': 30}
Accuracy: 0.9333333333333333
Precision: 0.9352733686067018
Recall: 0.9333333333333333
F1 Score: 0.933667000333667


{'model_list_clf': ['SVC', 'RandomForestClassifier'],
 'results_clf': [{'Model': 'SVC',
   'Best Parameters': {'C': 1, 'gamma': 'scale', 'kernel': 'linear'},
   'Accuracy': 0.9111111111111111,
   'Precision': 0.9134199134199135,
   'Recall': 0.9111111111111111,
   'F1 Score': 0.9096296296296297},
  {'Model': 'RandomForestClassifier',
   'Best Parameters': {'max_depth': None,
    'n_estimators': 100,
    'random_state': 30},
   'Accuracy': 0.9333333333333333,
   'Precision': 0.9352733686067018,
   'Recall': 0.9333333333333333,
   'F1 Score': 0.933667000333667}],
 'results_df':                     Model                                    Best Parameters  \
 0                     SVC     {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}   
 1  RandomForestClassifier  {'max_depth': None, 'n_estimators': 100, 'rand...   
 
    Accuracy  Precision    Recall  F1 Score  
 0  0.911111   0.913420  0.911111  0.909630  
 1  0.933333   0.935273  0.933333  0.933667  ,
 'best_clf_model_idx': 1,
 'best_c

In [29]:

model_from_grid_srch = evaluate_model(models_clf, param_grids)

----------------------------------------Evaluating SVC----------------------------------------
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy: 0.9111111111111111
Precision: 0.9134199134199135
Recall: 0.9111111111111111
F1 Score: 0.9096296296296297
----------------------------------------Evaluating RandomForestClassifier----------------------------------------
Best Parameters: {'max_depth': None, 'n_estimators': 100, 'random_state': 30}
Accuracy: 0.9333333333333333
Precision: 0.9352733686067018
Recall: 0.9333333333333333
F1 Score: 0.933667000333667


In [31]:
model_from_grid_srch["best_clf_model_idx"]

1

In [50]:
models_clf[model_from_grid_srch["best_clf_model_idx"]]

KeyError: 1

In [44]:
model_clf

NameError: name 'model_clf' is not defined

In [53]:
best_model = models_clf[model_from_grid_srch["best_clf_model"]["Model"]]

In [55]:
best_model(**model_from_grid_srch["best_clf_model"]["Best Parameters"])

TypeError: 'RandomForestClassifier' object is not callable

In [48]:
model_from_grid_srch["best_clf_model"]["Best Parameters"]

{'max_depth': None, 'n_estimators': 100, 'random_state': 30}

In [47]:
models_clf[model_from_grid_srch["best_clf_model"]["Model"]].set_params(**set_params(**model_from_grid_srch["best_clf_model"]["Best Parameters"]))

NameError: name 'set_params' is not defined

In [42]:
a = {'A':10, 'B':20}

In [43]:
a['A']

10

In [19]:

model_from_grid_srch["results_df"]

Unnamed: 0,Model,Best Parameters,Accuracy,Precision,Recall,F1 Score
0,SVC,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.911111,0.91342,0.911111,0.90963
1,RandomForestClassifier,"{'max_depth': None, 'n_estimators': 100, 'rand...",0.933333,0.935273,0.933333,0.933667


In [20]:
model_from_grid_srch["best_clf_model"]["Best Parameters"]

{'max_depth': None, 'n_estimators': 100, 'random_state': 30}

In [21]:
y_pred = pd.DataFrame(model_from_grid_srch["best_model_predictions"])

In [22]:

result = pd.concat([y_pred,y_test.reset_index()],axis=1)

In [None]:
result

In [56]:
import pickle
with open('D:/HCL_IKEA_4_Excercise/Exercise_1_Mlflow/Ex_1_2_Python_mlflow_docker/Python_mlflow_evidently_ai_optuna/artifacts/model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [59]:
# extracting the best model
best_model = models_clf[model_from_grid_srch["best_clf_model"]["Model"]]
            
# extracting the best parameters
best_param = model_from_grid_srch["best_clf_model"]["Best Parameters"]

In [61]:
best_param

{'max_depth': None, 'n_estimators': 100, 'random_state': 30}

In [62]:
best_model

In [60]:
best_model.set_params(**best_param)

In [63]:
print(best_model.get_params()['n_estimators']) 

100


In [57]:
loaded_model

In [58]:
loaded_model.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(random_state=30)>

In [65]:
import os

In [66]:
os.path.join("artifacts","preprocessor.pkl")

'artifacts\\preprocessor.pkl'

In [70]:
from src.irisclassification.logger import logging

In [72]:
from src.irisclassification.exception import customexception

In [74]:
import sys

In [None]:
import pickle
with open('D:/HCL_IKEA_4_Excercise/Exercise_1_Mlflow/Ex_1_2_Python_mlflow_docker/Python_mlflow_evidently_ai_optuna/artifacts/model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [68]:
def load_object(file_path):
    try:
        with open(file_path,'rb') as file_obj:
            return pickle.load(file_obj)
    except Exception as e:
        logging.info('Exception Occured in load_object function utils')
        raise customexception(e,sys)

In [81]:
preprocessor_path = os.path.join("D:/HCL_IKEA_4_Excercise/Exercise_1_Mlflow/Ex_1_2_Python_mlflow_docker/Python_mlflow_evidently_ai_optuna", "artifacts","preprocessor.pkl")
model_path = os.path.join("D:/HCL_IKEA_4_Excercise/Exercise_1_Mlflow/Ex_1_2_Python_mlflow_docker/Python_mlflow_evidently_ai_optuna", "artifacts","model.pkl")
            
preprocessor = load_object(preprocessor_path)
model = load_object(model_path)

In [101]:
#preprocessor_path = os.path.join("..","artifacts","preprocessor.pkl")
#model_path = os.path.join("..","artifacts","model.pkl")
            
preprocessor = load_object('..\\artifacts\\preprocessor.pkl')
#model = load_object(model_path)

In [102]:
preprocessor

In [84]:
preprocessor

In [85]:
model

In [None]:
model.predict()

In [86]:
SepalLengthCm = 10.0
SepalWidthCm = 22.6
PetalLengthCm = 33.4
PetalWidthCm = 9.6

In [88]:
dict_data = {'SepalLengthCm':10.0,'SepalWidthCm':22.6, 'PetalLengthCm':33.4,'PetalWidthCm':9.6}

In [92]:
df = pd.DataFrame([dict_data])

In [93]:
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,10.0,22.6,33.4,9.6


In [104]:
preprocessor.transform(df)

array([[0., 0., 0., 0.]])

In [94]:
def predict(features):
    try:
        preprocessor_path = os.path.join("..", "artifacts", "preprocessor.pkl")
        model_path = os.path.join("..", "artifacts", "model.pkl")
            
        preprocessor = load_object(preprocessor_path)
        model = load_object(model_path)
            
        scaled_data = preprocessor.transform(features)
            
        pred = model.predict(scaled_data)
            
        return pred                                     # This should return string
        
    except Exception as e:
        raise customexception(e, sys)

In [100]:
predict(df)[0]

'virginica'

In [None]:
'..\\artifacts\\preprocessor.pkl'

In [107]:
preprocessor_path = os.path.join("..", "artifacts", "preprocessor.pkl")
model_path = os.path.join("..", "artifacts", "model.pkl")
            
preprocessor = load_object(preprocessor_path)
model = load_object(model_path)

In [114]:
params = model.get_params()

In [115]:
params

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 30,
 'verbose': 0,
 'warm_start': False}

In [108]:
scaled_data = preprocessor.transform(df) 

In [109]:
scaled_data

array([[ 5.1608619 , 44.09409381, 16.96147885, 11.17469958]])

In [110]:
pred = model.predict(scaled_data)

In [112]:
pred[0]

'virginica'