# Import Libraries

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow 
import mlflow.sklearn
import joblib,pickle

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report

# Data Loading

In [5]:
# Load MNIST data: features and labels of the OpenML dataset 'mnist_784' (version 1)
X, y = fetch_openml(name='mnist_784', version=1, return_X_y=True)

# Normalize the pixel values: cast to float32 first, then divide by 255.0
X = X.astype(np.float32)
X = X / 255.0

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check (displayed as the cell output): training features and labels
# must contain the same number of rows
y_train.shape[0] == X_train.shape[0]

True

# Integrating MLflow to track the experiments

In [6]:
# Select the experiment under which the runs of this notebook are recorded
mlflow.set_experiment(experiment_name="MLops_digit_recognizer")

# Turn on MLflow's automatic logging for scikit-learn estimators
mlflow.sklearn.autolog()

# Building and training the Ensemble

In [9]:
# Train a soft-voting ensemble inside a single MLflow run, evaluate it on the
# held-out test split, and attach the evaluation report and the serialized
# model files to that run as artifacts.
with mlflow.start_run():

    # Base learners; random_state is fixed on both so repeated runs are comparable.
    # NOTE: an SVC(probability=True) member is intentionally not part of the
    # ensemble in this version of the notebook.
    rf = RandomForestClassifier(random_state=42)
    lr = LogisticRegression(max_iter=1000, random_state=42)

    # Soft voting over the two base learners
    ensemble = VotingClassifier(
        estimators=[
            ('rf', rf),
            ('lr', lr),
        ],
        voting='soft',
    )

    # Hyperparameter grid. Keys use the "<estimator name>__<parameter>" form to
    # address parameters of the named members of the VotingClassifier.
    param_grid = {
        'rf__n_estimators': [50],
        'lr__C': [0.01, 0.1, 1.0, 10.0],
    }

    # Exhaustive search over the grid with 3-fold cross-validation, using all cores
    search = GridSearchCV(ensemble, param_grid=param_grid, cv=3, n_jobs=-1)
    search.fit(X_train, y_train)

    # Evaluate the refit best estimator on the held-out test split
    y_pred = search.best_estimator_.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Record the held-out accuracy on the run explicitly so the tracked
    # experiment carries the same number that is printed below.
    mlflow.log_metric("test_accuracy", acc)
    print(f"Accuracy: {acc:.4f}")

    report = classification_report(y_test, y_pred)
    print(report)

    # The report and model files are written to the current working directory
    # first and then attached to the run as artifacts.
    with open("classification_report.txt", "w", encoding="utf-8") as f:
        f.write(report)

    mlflow.log_artifact("classification_report.txt")

    # Dump the model in joblib format
    joblib.dump(search.best_estimator_, "model.joblib")
    mlflow.log_artifact("model.joblib")

    # Dump the same model in plain pickle format as well
    with open("model.pkl", "wb") as f:
        pickle.dump(search.best_estimator_, f)

    mlflow.log_artifact("model.pkl")
    
    


2025/07/04 21:16:19 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Accuracy: 0.9450
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1343
           1       0.96      0.98      0.97      1600
           2       0.93      0.93      0.93      1380
           3       0.93      0.92      0.92      1433
           4       0.94      0.95      0.94      1295
           5       0.92      0.92      0.92      1273
           6       0.96      0.97      0.96      1396
           7       0.95      0.96      0.95      1503
           8       0.93      0.92      0.92      1357
           9       0.94      0.93      0.93      1420

    accuracy                           0.94     14000
   macro avg       0.94      0.94      0.94     14000
weighted avg       0.94      0.94      0.94     14000

