## Assignment 1

Use the iris dataset to write an MLflow program that classifies the 'variety' of each iris flower and meets the requirements below.



1. Experiments, Runs, Parameters, Metrics shall be logged and tracked.
2. Input dataset, Train and Test data, Model shall be logged as artifacts.
3. Set multiple tags to the run.
4. Print the evaluation metrics.
5. Print the last active run.

Questions for this assignment
Which library, module, and class can be used for this problem statement?

In [None]:
# Import libraries 
import warnings, os, joblib
import pandas as pd
import numpy as np 
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics, svm


if __name__ == "__main__":
    # Train a decision tree and a linear SVM on the iris dataset and track the
    # whole experiment (datasets, models, parameters, metrics, tags) in MLflow.
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Set the experiment
    exp = mlflow.set_experiment(experiment_name="assignment_1")

    # Encode the classes to codes and load the dataset, change the variety
    # data from categorical to numerical codes
    class_codes = {"Setosa": 1, "Versicolor": 2, "Virginica": 3}
    feature_columns = ["sepal.length", "sepal.width", "petal.length", "petal.width"]
    split_random_state = 1
    svm_kernel = "linear"

    # Using the run as a context manager guarantees it is ended even when a
    # step below raises, instead of leaving a dangling active run.
    with mlflow.start_run():
        data = pd.read_csv("data/iris.csv")
        actual_data = data.replace({"variety": class_codes})
        features = actual_data[feature_columns]
        labels = actual_data["variety"]

        # Create the train-test split
        train_data, test_data, train_labels, test_labels = train_test_split(
            features.to_numpy(), labels.to_numpy(), random_state=split_random_state
        )
        print(f"The number of data rows in the dataset is: {len(actual_data)} and after the train-test split we have {len(train_data)} train rows and {len(test_data)} test rows")

        # Log the parameters that define this run
        mlflow.log_params(
            {
                "numpy_seed": 40,
                "split_random_state": split_random_state,
                "svm_kernel": svm_kernel,
                "n_train_rows": len(train_data),
                "n_test_rows": len(test_data),
            }
        )

        # Log the input dataset, train and test datasets
        mlflow.log_artifact("data/iris.csv", "data")

        train_df = pd.DataFrame(train_data, columns=feature_columns)
        test_df = pd.DataFrame(test_data, columns=feature_columns)
        # One loop for both splits so each file is written from its own frame
        # (the test file used to be written from the training frame).
        for file_name, frame in (("train.csv", train_df), ("test.csv", test_df)):
            frame.to_csv(file_name)
            try:
                mlflow.log_artifact(file_name, "data")
            finally:
                # Always clean up the temporary file, even if logging fails.
                os.remove(file_name)

        # Create all the classifiers we want to test on the data
        # [Decision Tree classifier object, Support Vector Machine]
        decision_tree_classifier = DecisionTreeClassifier()
        svm_classifier = svm.SVC(kernel=svm_kernel)

        # Train the models
        decision_tree_classifier = decision_tree_classifier.fit(train_data, train_labels)
        svm_classifier = svm_classifier.fit(train_data, train_labels)

        # Predict the response for test dataset
        decision_tree_predictions = decision_tree_classifier.predict(test_data)
        svm_predictions = svm_classifier.predict(test_data)

        # Each accuracy is computed from the matching model's own predictions.
        decision_tree_accuracy = metrics.accuracy_score(test_labels, decision_tree_predictions)
        svm_accuracy = metrics.accuracy_score(test_labels, svm_predictions)
        print("Decision Tree classifier Accuracy:", decision_tree_accuracy)
        print("Support Vector Machine Accuracy:", svm_accuracy)

        # Track the evaluation metrics in MLflow as well as printing them
        mlflow.log_metric("decision_tree_accuracy", decision_tree_accuracy)
        mlflow.log_metric("svm_accuracy", svm_accuracy)

        # Save the two models and log them as artifacts
        trained_models = {
            "decision_tree_classifier.sav": decision_tree_classifier,
            "svm_classifier.sav": svm_classifier,
        }
        for file_name, model in trained_models.items():
            joblib.dump(model, file_name)
            try:
                mlflow.log_artifact(file_name, "model")
            finally:
                os.remove(file_name)

        # Dict keys must be unique: the previous literal repeated "Model", so
        # only one tag was ever set. Use distinct keys to set multiple tags.
        tags = {
            "Model": "Decision Tree, Support Vector Machine",
            "Dataset": "iris",
            "Task": "classification",
        }
        mlflow.set_tags(tags=tags)

    print(f"The last active run is: {mlflow.last_active_run()}")