In [1]:
!pip install mlflow databricks-sdk databricks-cli

Collecting mlflow
  Downloading mlflow-2.13.2-py3-none-any.whl.metadata (29 kB)
Collecting databricks-sdk
  Downloading databricks_sdk-0.28.0-py3-none-any.whl.metadata (35 kB)
Collecting databricks-cli
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.3-py3-none-any.whl.metadata (10 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Collecting aniso8601<10,

In [4]:
import numpy as np
import pandas as pd
import os
import mlflow
import boto3

In [13]:
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'https://s3.eu-central-1.amazonaws.com'

In [5]:
s3 = boto3.client('s3')
s3.download_file('mlops-news-trends', 'data/train.json', 'train.json')

In [6]:
mlflow.set_tracking_uri('databricks')
mlflow.set_experiment("/Users/palamariuk.pn@ucu.edu.ua/experiment")

<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/3206751157090435', creation_time=1717879026587, experiment_id='3206751157090435', last_update_time=1717882069583, lifecycle_stage='active', name='/Users/palamariuk.pn@ucu.edu.ua/experiment', tags={'mlflow.experiment.sourceName': '/Users/palamariuk.pn@ucu.edu.ua/experiment',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'palamariuk.pn@ucu.edu.ua',
 'mlflow.ownerId': '7973479790873778'}>

In [7]:
df = pd.read_json('/kaggle/working/train.json', lines=True)
df.dropna(inplace=True)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


In [10]:
X = df['short_description']
y = df['category']

vectorizer = TfidfVectorizer(stop_words='english')
X_vect = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

In [11]:
model = LogisticRegression()

In [19]:
expr_name = "/Users/palamariuk.pn@ucu.edu.ua/experiment_2"
s3_bucket = "s3://mlops-news-trends/artifacts"

mlflow.create_experiment(expr_name, s3_bucket)
mlflow.set_experiment(expr_name)

with mlflow.start_run(run_name=f'Run {type(model).__name__} - 2'):
    mlflow.set_tag("model", type(model).__name__)
    mlflow.set_tag("owner", "maksym palamariuk")
    mlflow.set_tag("description", "The second run of the model (accuracy is corrected)")

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), 4)
    
    print(classification_report(y_test, y_pred))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.sklearn.log_model(model, "model")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                precision    recall  f1-score   support

          ARTS       0.38      0.02      0.04       246
ARTS & CULTURE       0.29      0.01      0.02       224
  BLACK VOICES       0.44      0.15      0.22       711
      BUSINESS       0.40      0.24      0.30       982
       COLLEGE       0.30      0.05      0.08       184
        COMEDY       0.33      0.09      0.14       844
         CRIME       0.41      0.20      0.27       547
CULTURE & ARTS       0.32      0.08      0.12       155
       DIVORCE       0.79      0.51      0.62       556
     EDUCATION       0.31      0.10      0.15       166
 ENTERTAINMENT       0.35      0.49      0.41      2805
   ENVIRONMENT       0.63      0.11      0.19       237
         FIFTY       0.14      0.00      0.01       209
  FOOD & DRINK       0.51      0.57      0.54       959
     GOOD NEWS       0.00      0.00      0.00       213
         GREEN       0.35      0.08      0.13       421
HEALTHY LIVING       0.26      0.04      0.07  



In [43]:
import joblib

joblib.dump(model, 'model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']