In [38]:

import mlflow
import logging
import pandas as pd
import numpy as np
from urllib.parse import urlparse
from markupsafe import escape

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,precision_recall_curve,auc
from sklearn.ensemble import RandomForestClassifier

In [3]:
train = pd.read_csv("C:/Users/pragy/Applied-Machine-Learning/Assignment2/Data/Training Data.csv")
val = pd.read_csv("C:/Users/pragy/Applied-Machine-Learning/Assignment2/Data/Validation Data.csv")
test = pd.read_csv("C:/Users/pragy/Applied-Machine-Learning/Assignment2/Data/Test Data.csv")

In [8]:
train.head()

Unnamed: 0,Text,Label
0,http//tms. widelive.com/index. wml?id=820554ad...,1
1,Its so common hearin How r u? Wat r u doing? H...,0
2,I want snow. It's just freezing and windy.,0
3,Its worse if if uses half way then stops. Its ...,0
4,Lol for real. She told my dad I have cancer,0


In [9]:
train_Y,train_X=train["Label"],train["Text"]
val_Y,val_X=val["Label"],val["Text"]
test_Y,test_X=test["Label"],test["Text"]

In [10]:
count = CountVectorizer().fit(train_X)
train_X = count.transform(train_X)
val_X = count.transform(val_X)
test_X = count.transform(test_X)

In [11]:
tfidf_transformer = TfidfTransformer()
tfidf_train = tfidf_transformer.fit_transform(train_X)
tfidf_val = tfidf_transformer.fit_transform(val_X)
tfidf_test = tfidf_transformer.fit_transform(test_X)

In [12]:
tfidf_train.shape,tfidf_val.shape,tfidf_test.shape

((4512, 7817), (502, 7817), (558, 7817))

In [13]:

def eval_metrics(actual, pred):
    precision, recall, thresholds = precision_recall_curve(actual, pred)
    auc_precision_recall = auc(recall, precision)
    return (auc_precision_recall)

In [25]:
mlflow.sklearn.autolog()

n_estimators = 200
max_depth = 5
clf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth,random_state=195)
clf.fit(tfidf_train, train_Y)

pred_Y = clf.predict(tfidf_test)
aucpr = eval_metrics(test_Y, pred_Y)
acc = accuracy_score(test_Y, pred_Y)
conf_1=confusion_matrix(test_Y, pred_Y)

with mlflow.start_run(run_name=f"n_estimators : {n_estimators}, max_depth : {max_depth}"):
  
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.log_dict(np.array(conf_1).tolist(), "confusion_matrix.json")
    mlflow.sklearn.log_model(clf, "model")

    print("\nRandom Classifier Model (no_of_estimator={:f}, max_depth={:f}):".format(n_estimators, max_depth))
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr} ")
    print(f"Confusion Matrix: \n{conf_1} \n \n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="random-forest-classification-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="Randomclassifier")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/27 20:07:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c316ad919c59479497d035208c2e36a8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Random Classifier Model (no_of_estimator=200.000000, max_depth=5.000000):
Accuracy: 0.8745519713261649
AUCPR: 0.5697662678580443 
Confusion Matrix: 
[[487   0]
 [ 70   1]] 
 



Registered model 'random-forest-classification-model' already exists. Creating a new version of this model...
2023/02/27 20:08:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: random-forest-classification-model, version 2
Created version '2' of model 'random-forest-classification-model'.


In [18]:
mlflow.tracking.MlflowClient().get_model_version("random-forest-classification-model","1")

<ModelVersion: creation_timestamp=1677507776170, current_stage='None', description=None, last_updated_timestamp=1677507776170, name='random-forest-classification-model', run_id='a9454735996946aeac232a5971a7c58e', run_link=None, source='file:///c:/Users/pragy/Applied-Machine-Learning/Assignment2/mlruns/0/a9454735996946aeac232a5971a7c58e/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>

Model 2

In [26]:
mlflow.sklearn.autolog()

clf = MultinomialNB()
clf.fit(tfidf_train, train_Y)

pred_Y = clf.predict(tfidf_test)
aucpr = eval_metrics(test_Y, pred_Y)
acc = accuracy_score(test_Y, pred_Y)
conf_2=confusion_matrix(test_Y, pred_Y)

with mlflow.start_run(run_name="Multinomial Naive Bayes"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    mlflow.log_dict(np.array(conf_2).tolist(), "confusion_matrix.json")

    print("\nMultinomial Naive Bayes")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix: \n{conf_2} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="multinomial-nb-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="multinomial-nb-model")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/27 20:08:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '85d1716e5da643d29530bb759682115f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Multinomial Naive Bayes
Accuracy: 0.9551971326164874
AUCPR: 0.8463450956635873
Confusion Matrix: 
[[487   0]
 [ 25  46]] 




Registered model 'multinomial-nb-model' already exists. Creating a new version of this model...
2023/02/27 20:08:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: multinomial-nb-model, version 2
Created version '2' of model 'multinomial-nb-model'.


In [28]:
print(mlflow.tracking.MlflowClient().get_model_version("multinomial-nb-model", '1'))


<ModelVersion: creation_timestamp=1677508343641, current_stage='None', description=None, last_updated_timestamp=1677508343641, name='multinomial-nb-model', run_id='036d1c65c2934416a2ab68c358d58433', run_link=None, source='file:///c:/Users/pragy/Applied-Machine-Learning/Assignment2/mlruns/0/036d1c65c2934416a2ab68c358d58433/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>


Model 3


In [36]:
mlflow.sklearn.autolog()

clf = SVC(C=500)
clf.fit(tfidf_train, train_Y)

pred_Y = clf.predict(tfidf_test)
aucpr = eval_metrics(test_Y, pred_Y)
acc = accuracy_score(test_Y, pred_Y)
conf_3=confusion_matrix(test_Y, pred_Y)

with mlflow.start_run(run_name="Support Vector Machine"):

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("AUCPR",aucpr)
    mlflow.sklearn.log_model(clf, "model")
    
    print("\nSupport Vector Machine")
    print(f"Accuracy: {acc}")
    print(f"AUCPR: {aucpr}")
    print(f"Confusion Matrix \n{conf_3} \n\n")
    

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="sklearn-model",
        registered_model_name="svm-model"
    )
    if tracking_url_type_store != "file":
      mlflow.sklearn.log_model(clf, "model", registered_model_name="svm-model")
    else:
      mlflow.sklearn.log_model(clf, "model")

2023/02/27 20:19:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a61591f23444439296d0a0efd364c7bd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow



Support Vector Machine
Accuracy: 0.989247311827957
AUCPR: 0.963122822959261
Confusion Matrix 
[[487   0]
 [  6  65]] 




Successfully registered model 'svm-model'.
2023/02/27 20:20:29 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: svm-model, version 1
Created version '1' of model 'svm-model'.


In [37]:
print(mlflow.tracking.MlflowClient().get_model_version("svm-model", '1'))


<ModelVersion: creation_timestamp=1677509429910, current_stage='None', description=None, last_updated_timestamp=1677509429910, name='svm-model', run_id='bd18db12799241499ddf0fe91332d9ca', run_link=None, source='file:///c:/Users/pragy/Applied-Machine-Learning/Assignment2/mlruns/0/bd18db12799241499ddf0fe91332d9ca/artifacts/sklearn-model', status='READY', status_message=None, tags={}, user_id=None, version=1>
