In [1]:
import os

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

import mlflow
import mlflow.sklearn
from  mlflow.tracking import MlflowClient
# Newsgroup categories used for both the training and the test split.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Accumulators filled in by later cells and sent to MLflow in the final run.
mlflow_params = dict()
mlflow_metrics = dict()

In [3]:
# Point both the low-level client and the fluent API at the tracking server
# named by the TRACKING_URL environment variable (None when it is unset).
tracking_uri = os.environ.get("TRACKING_URL")
client = MlflowClient(tracking_uri=tracking_uri)
mlflow.set_tracking_uri(tracking_uri)

# set_experiment activates the named experiment and creates it first when it
# does not exist, so no manual list/check/create sequence is needed. This also
# avoids MlflowClient.list_experiments(), which was removed in MLflow 2.0.
experiment_name = "nlp_demo"
mlflow.set_experiment(experiment_name)

In [4]:
# Training split, restricted to the selected categories and shuffled with a
# fixed seed so repeated runs see the documents in the same order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Show the category names attached to the loaded training set.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [6]:
# Turn the raw training documents into a document-term count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# The matrix shape is (rows, columns); record both as run parameters.
mlflow_params["samples"], mlflow_params["features"] = X_train_counts.shape


In [7]:
# Re-weight the raw counts with TF-IDF and remember the setting for MLflow.
use_idf = True
mlflow_params["use_idf"] = use_idf

tf_transformer = TfidfTransformer(use_idf=use_idf).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Keep the shape as the cell's final expression so the notebook displays it;
# as a bare expression in the middle of the cell its value was discarded.
X_train_tf.shape

In [8]:
from sklearn.pipeline import Pipeline
# Count vectorizer -> TF-IDF -> SGD classifier (hinge loss, L2 penalty),
# chained into one estimator that is fitted directly on raw documents.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    # Reuse the notebook-level use_idf flag so the "use_idf" value logged to
    # MLflow describes the model that is actually trained.
    ('tfidf', TfidfTransformer(use_idf=use_idf)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Look the classifier up by step name rather than by position so the logged
# class name stays correct if steps are inserted or reordered.
mlflow_params["classifier"] = type(text_clf.named_steps['clf']).__name__

In [9]:
# Fit every pipeline step on the raw training documents and their labels.
text_clf.fit(X=twenty_train.data, y=twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [10]:
import numpy as np
# Held-out split, loaded with the same categories and seed as the training set.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy is the fraction of test documents whose predicted label matches.
accuracy = np.mean(np.equal(predicted, twenty_test.target))
mlflow_metrics["accuracy"] = accuracy

In [11]:
from sklearn import metrics

In [12]:
# Text report of the test-set predictions, labelled with the category names.
report = metrics.classification_report(
    y_true=twenty_test.target,
    y_pred=predicted,
    target_names=twenty_test.target_names,
)

In [13]:
# Persist the classification report to disk. The context manager closes the
# file even if the write raises, which the manual open/close pair did not.
with open("report.txt", "w") as fp:
    fp.write(report)

In [14]:
# Record the collected parameters, metrics, fitted pipeline and report in a
# single MLflow run under the active experiment.
with mlflow.start_run():
    mlflow.log_params(mlflow_params)
    mlflow.log_metrics(mlflow_metrics)
    mlflow.sklearn.log_model(text_clf, "model")
    # Attach the classification report written to report.txt earlier in the
    # notebook; the previous path pointed at an unrelated audio file.
    mlflow.log_artifact("report.txt")