In [1]:
!pip show mlflow

Name: mlflow
Version: 2.4.1
Summary: MLflow: A Platform for ML Development and Productionization
Home-page: https://mlflow.org/
Author: Databricks
Author-email: 
License: Apache License 2.0
Location: c:\users\gaikw\anaconda3\lib\site-packages
Requires: alembic, click, cloudpickle, databricks-cli, docker, entrypoints, Flask, gitpython, importlib-metadata, Jinja2, markdown, matplotlib, numpy, packaging, pandas, protobuf, pyarrow, pytz, pyyaml, querystring-parser, requests, scikit-learn, scipy, sqlalchemy, sqlparse, waitress
Required-by: 


In [2]:
from argparse import Namespace
import mlflow
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [3]:
import utils
import json

In [4]:
# Specify arguments
args = Namespace(
    lower=True,
    stem=False,
    analyzer="char",
    ngram_max_range=7,
    alpha=1e-4,
    learning_rate=1e-1,
    power_t=0.1,
    num_epochs=50,
    max_features='log2',
    min_samples_split=3,
    n_estimators=264
)

In [5]:
# Set tracking URI
MODEL_REGISTRY = Path("../mlruns")    #name of experiment folder
Path(MODEL_REGISTRY).mkdir(exist_ok=True) # create experiments dir
mlflow.set_tracking_uri("file:///" + str(MODEL_REGISTRY.absolute()))

In [6]:
import pandas as pd
df = pd.read_csv(r"C:\Users\gaikw\Desktop\Data Science\projects\mlopsDQ\data\sample_adv_features.csv",nrows=10000)
df = df.drop(['Unnamed: 0.1','Unnamed: 0','qid1','qid2'],axis=1)

In [21]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support
from tqdm.notebook import tqdm
import optuna

In [9]:
def train(args,df,trial=None,clean=False):
    #setup
    utils.set_seeds()
    df = pd.read_csv(r"C:\Users\gaikw\Desktop\Data Science\projects\mlopsDQ\data\sample_adv_features.csv",nrows=10000)
    df = df.drop(['Unnamed: 0.1','Unnamed: 0','qid1','qid2'],axis=1)
    if clean:
        df.questio1 = df.question1.apply(utlis.prerocess)  ##data is already preprocessed
        df.questio1 = df.question1.apply(utlis.prerocess)  ##data is already preprocessed
    
    ##Feature extraction and tfidf Vectorization
    data = []
    for q1,q2 in tqdm(zip(df.question1.values,df.question2.values)):
        data.append(utils.query_point_creator(q1,q2))
    data=np.array(data)
    data=data.reshape(10000,6022)
    X = pd.DataFrame(data)
    y = df.is_duplicate
    
    #train_test_split
    X_train, X_val, X_test, y_train, y_val, y_test=utils.get_data_splits(X,y)
    
    #model
    model = SGDClassifier(
            loss="log", penalty="l2", alpha=args.alpha, max_iter=1,
            learning_rate="constant", eta0=args.learning_rate, power_t=args.power_t, warm_start=True)
#     model = RandomForestClassifier(max_features=args.max_features, min_samples_split=args.min_samples_split,
#                        n_estimators=args.n_estimators)
    
    #training
    for epoch in tqdm(range(args.num_epochs)):
        model.fit(X_train, y_train)
        train_loss = log_loss(y_train, model.predict_proba(X_train))
        val_loss = log_loss(y_val, model.predict_proba(X_val))
        if not epoch%10:
            print(
                f"Epoch: {epoch:02d} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}"
            )
        # ml-flow Log
        if not trial:
            mlflow.log_metrics({"train_loss": train_loss, "val_loss": val_loss}, step=epoch)
            
#         Pruning (for optimization )
        if trial:
            trial.report(val_loss, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
    
    # Threshold
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)
    args.threshold = np.quantile(
        [y_prob[i][j] for i, j in enumerate(y_pred)], q=0.25)  # Q1
    
    # Evaluation
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
#     y_pred = utils.custom_predict(y_prob=y_prob, threshold=args.threshold, index=0)
    metrics = precision_recall_fscore_support(y_test, y_pred, average="binary")
    performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
    print (json.dumps(performance, indent=2))
    
    return {
        "args": args,
        "model": model,
        "performance": performance
    }   

**Tracking**

In [10]:
import joblib
import tempfile

In [11]:
# Set experiment
mlflow.set_experiment(experiment_name="baselines")

<Experiment: artifact_location=('file:///C:\\Users\\gaikw\\Desktop\\Data '
 'Science\\projects\\mlopsDQ\\notebooks\\..\\experiments/165876016812443962'), creation_time=1686997405361, experiment_id='165876016812443962', last_update_time=1686997405361, lifecycle_stage='active', name='baselines', tags={}>

In [12]:
def save_dict(d, filepath):
    """Save dict to a json file."""
    with open(filepath, "w") as fp:
        json.dump(d, indent=2, sort_keys=False, fp=fp)

In [13]:
# Tracking
with mlflow.start_run(run_name="sgd"):

    # Train & evaluate
    artifacts = train(args=args, df=df)

    # Log key metrics
    mlflow.log_metrics({"precision": artifacts["performance"]["precision"]})
    mlflow.log_metrics({"recall": artifacts["performance"]["recall"]})
    mlflow.log_metrics({"f1": artifacts["performance"]["f1"]})
    
    # Log artifacts
    with tempfile.TemporaryDirectory() as dp:
#         artifacts["label_encoder"].save(Path(dp, "label_encoder.json"))
#         joblib.dump(artifacts["vectorizer"], Path(dp, "vectorizer.pkl"))
        joblib.dump(artifacts["model"], Path(dp, "model.pkl"))
        save_dict(artifacts["performance"], Path(dp, "performance.json"))
        mlflow.log_artifacts(dp)
        
    # Log parameters
    mlflow.log_params(vars(artifacts["args"]))

0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 20 | train_loss: 11.25783, val_loss: 11.29327
Epoch: 30 | train_loss: 16.12747, val_loss: 15.89963
Epoch: 40 | train_loss: 17.94459, val_loss: 17.94974
{
  "precision": 0.7407407407407407,
  "recall": 0.02677376171352075,
  "f1": 0.05167958656330749
}


**Loading**

In [14]:
def load_dict(filepath):
    """Load a dict from a json file."""
    with open(filepath, "r") as fp:
        d = json.load(fp)
    return d

In [15]:
# Load all runs from experiment
experiment_id = mlflow.get_experiment_by_name("baselines").experiment_id
all_runs = mlflow.search_runs(experiment_ids=experiment_id, order_by=["metrics.val_loss ASC"])

In [17]:
# # Best run
# best_run_id = all_runs.iloc[0].run_id
# best_run = mlflow.get_run(run_id=best_run_id)
# client = mlflow.tracking.MlflowClient()
# with tempfile.TemporaryDirectory() as dp:
#     client.download_artifacts(run_id=best_run_id, path="", dst_path=dp)
# #     vectorizer = joblib.load(Path(dp, "vectorizer.pkl"))
# #     label_encoder = LabelEncoder.load(fp=Path(dp, "label_encoder.json"))
#     model = joblib.load(Path(dp, "model.pkl"))
#     performance = load_dict(filepath=Path(dp, "performance.json"))

**Optimization**

In [18]:
def objective(args, trial):
    """Objective function for optimization trials."""
    # Parameters to tune
    args.analyzer = trial.suggest_categorical("analyzer", ["word", "char", "char_wb"])
    args.ngram_max_range = trial.suggest_int("ngram_max_range", 3, 10)
    args.learning_rate = trial.suggest_loguniform("learning_rate", 1e-2, 1e0)
    args.power_t = trial.suggest_uniform("power_t", 0.1, 0.5)

    # Train & evaluate
    artifacts = train(args=args, df=df, trial=trial)

    # Set additional attributes
    performance = artifacts["performance"]
    print(json.dumps(performance, indent=2))
    trial.set_user_attr("precision", performance["precision"])
    trial.set_user_attr("recall", performance["recall"])
    trial.set_user_attr("f1", performance["f1"])

    return performance["f1"]

In [19]:
from numpyencoder import NumpyEncoder
from optuna.integration.mlflow import MLflowCallback

In [22]:
NUM_TRIALS = 20  # small sample for now

# Optimize
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=5)
study = optuna.create_study(study_name="optimization", direction="maximize", pruner=pruner)
mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name="f1")
study.optimize(lambda trial: objective(args, trial),n_trials=NUM_TRIALS,callbacks=[mlflow_callback])

[32m[I 2023-06-18 14:35:47,533][0m A new study created in memory with name: optimization[0m


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.93123, val_loss: 17.97377
Epoch: 20 | train_loss: 12.35951, val_loss: 12.56853
Epoch: 30 | train_loss: 14.29423, val_loss: 14.08348
Epoch: 40 | train_loss: 12.18409, val_loss: 12.56635


[32m[I 2023-06-18 14:37:09,181][0m Trial 0 finished with value: 0.3326403326403326 and parameters: {'analyzer': 'char', 'ngram_max_range': 7, 'learning_rate': 0.1745960695757196, 'power_t': 0.24460931148741297}. Best is trial 0 with value: 0.3326403326403326.[0m
2023/06/18 14:37:09 INFO mlflow.tracking.fluent: Experiment with name 'optimization' does not exist. Creating a new experiment.


{
  "precision": 0.7441860465116279,
  "recall": 0.214190093708166,
  "f1": 0.3326403326403326
}
{
  "precision": 0.7441860465116279,
  "recall": 0.214190093708166,
  "f1": 0.3326403326403326
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 12.01719, val_loss: 12.29997
Epoch: 20 | train_loss: 11.53667, val_loss: 11.48882
Epoch: 30 | train_loss: 13.03519, val_loss: 12.93999
Epoch: 40 | train_loss: 17.94459, val_loss: 17.94974


[32m[I 2023-06-18 14:38:28,879][0m Trial 1 finished with value: 0.5751533742331288 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 6, 'learning_rate': 0.21822319443551513, 'power_t': 0.1931773301154395}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.6732495511669659,
  "recall": 0.5020080321285141,
  "f1": 0.5751533742331288
}
{
  "precision": 0.6732495511669659,
  "recall": 0.5020080321285141,
  "f1": 0.5751533742331288
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.91370, val_loss: 17.99780
Epoch: 20 | train_loss: 14.12598, val_loss: 13.74011
Epoch: 30 | train_loss: 13.06407, val_loss: 12.60167
Epoch: 40 | train_loss: 13.84608, val_loss: 13.82923


[32m[I 2023-06-18 14:39:46,852][0m Trial 2 finished with value: 0.0026631158455392807 and parameters: {'analyzer': 'char', 'ngram_max_range': 7, 'learning_rate': 0.20990214536757826, 'power_t': 0.3491611351618473}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.25,
  "recall": 0.0013386880856760374,
  "f1": 0.0026631158455392807
}
{
  "precision": 0.25,
  "recall": 0.0013386880856760374,
  "f1": 0.0026631158455392807
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.47401, val_loss: 17.43606
Epoch: 10 | train_loss: 11.77880, val_loss: 12.17982
Epoch: 20 | train_loss: 11.11228, val_loss: 11.16350
Epoch: 30 | train_loss: 11.73718, val_loss: 11.94485
Epoch: 40 | train_loss: 12.26244, val_loss: 12.39341


[32m[I 2023-06-18 14:41:04,381][0m Trial 3 finished with value: 0.00267379679144385 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 9, 'learning_rate': 0.03322795378389335, 'power_t': 0.153848550130522}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 1.0,
  "recall": 0.0013386880856760374,
  "f1": 0.00267379679144385
}
{
  "precision": 1.0,
  "recall": 0.0013386880856760374,
  "f1": 0.00267379679144385
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 14.51391, val_loss: 13.96997
Epoch: 10 | train_loss: 11.47488, val_loss: 11.70592
Epoch: 20 | train_loss: 11.01094, val_loss: 11.02273
Epoch: 30 | train_loss: 10.48606, val_loss: 10.36098
Epoch: 40 | train_loss: 16.45138, val_loss: 16.23971


[32m[I 2023-06-18 14:42:23,115][0m Trial 4 finished with value: 0.32558139534883723 and parameters: {'analyzer': 'word', 'ngram_max_range': 7, 'learning_rate': 0.01751923479137445, 'power_t': 0.15145060765679014}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.7738693467336684,
  "recall": 0.20615796519410978,
  "f1": 0.32558139534883723
}
{
  "precision": 0.7738693467336684,
  "recall": 0.20615796519410978,
  "f1": 0.32558139534883723
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 11.37481, val_loss: 11.67551
Epoch: 20 | train_loss: 11.83209, val_loss: 11.75754
Epoch: 30 | train_loss: 16.63292, val_loss: 16.48097
Epoch: 40 | train_loss: 17.25424, val_loss: 17.07426


[32m[I 2023-06-18 14:43:42,872][0m Trial 5 finished with value: 0.07848101265822786 and parameters: {'analyzer': 'word', 'ngram_max_range': 5, 'learning_rate': 0.040124674854010806, 'power_t': 0.1358641678878744}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.7209302325581395,
  "recall": 0.041499330655957165,
  "f1": 0.07848101265822786
}
{
  "precision": 0.7209302325581395,
  "recall": 0.041499330655957165,
  "f1": 0.07848101265822786
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 11.29156, val_loss: 11.40454
Epoch: 20 | train_loss: 10.10977, val_loss: 10.29480
Epoch: 30 | train_loss: 15.03288, val_loss: 14.76011
Epoch: 40 | train_loss: 16.82922, val_loss: 16.61386


[32m[I 2023-06-18 14:45:03,660][0m Trial 6 finished with value: 0.0 and parameters: {'analyzer': 'char', 'ngram_max_range': 9, 'learning_rate': 0.015098026826918963, 'power_t': 0.2112766284936033}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 14.97052, val_loss: 14.34679
Epoch: 10 | train_loss: 12.55386, val_loss: 12.78848
Epoch: 20 | train_loss: 10.69441, val_loss: 10.97922
Epoch: 30 | train_loss: 16.51763, val_loss: 16.25635
Epoch: 40 | train_loss: 11.81348, val_loss: 12.31351


[32m[I 2023-06-18 14:46:22,660][0m Trial 7 finished with value: 0.4833782569631626 and parameters: {'analyzer': 'char', 'ngram_max_range': 8, 'learning_rate': 0.031062934312453987, 'power_t': 0.4765917029141259}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.7349726775956285,
  "recall": 0.36010709504685406,
  "f1": 0.4833782569631626
}
{
  "precision": 0.7349726775956285,
  "recall": 0.36010709504685406,
  "f1": 0.4833782569631626
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.45325, val_loss: 17.56656
Epoch: 20 | train_loss: 16.91572, val_loss: 16.79769
Epoch: 30 | train_loss: 9.99552, val_loss: 9.83574
Epoch: 40 | train_loss: 15.66483, val_loss: 15.57290


[32m[I 2023-06-18 14:47:39,636][0m Trial 8 finished with value: 0.3427377220480669 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 9, 'learning_rate': 0.012848840442187318, 'power_t': 0.1105252607445137}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.780952380952381,
  "recall": 0.21954484605087016,
  "f1": 0.3427377220480669
}
{
  "precision": 0.780952380952381,
  "recall": 0.21954484605087016,
  "f1": 0.3427377220480669
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.83612, val_loss: 17.83150
Epoch: 20 | train_loss: 12.05555, val_loss: 12.04450
Epoch: 30 | train_loss: 11.71320, val_loss: 11.91649
Epoch: 40 | train_loss: 12.32135, val_loss: 12.70463


[32m[I 2023-06-18 14:48:57,356][0m Trial 9 finished with value: 0.0 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 4, 'learning_rate': 0.0736281186563863, 'power_t': 0.3117816781482768}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.93944, val_loss: 17.94974
Epoch: 20 | train_loss: 14.71173, val_loss: 14.43125
Epoch: 30 | train_loss: 11.83226, val_loss: 11.84634
Epoch: 40 | train_loss: 11.89010, val_loss: 12.33474


[32m[I 2023-06-18 14:50:15,534][0m Trial 10 finished with value: 0.0 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 3, 'learning_rate': 0.9218922324026961, 'power_t': 0.4156474905718887}. Best is trial 1 with value: 0.5751533742331288.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 16.36384, val_loss: 16.34322
Epoch: 10 | train_loss: 14.25591, val_loss: 14.37261
Epoch: 20 | train_loss: 14.14001, val_loss: 13.68309
Epoch: 30 | train_loss: 11.71405, val_loss: 12.06757
Epoch: 40 | train_loss: 17.00187, val_loss: 16.99828


[32m[I 2023-06-18 14:51:31,873][0m Trial 11 finished with value: 0.5992673992673992 and parameters: {'analyzer': 'char', 'ngram_max_range': 6, 'learning_rate': 0.471002488290957, 'power_t': 0.4930807550799224}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.6618122977346278,
  "recall": 0.5475234270414994,
  "f1": 0.5992673992673992
}
{
  "precision": 0.6618122977346278,
  "recall": 0.5475234270414994,
  "f1": 0.5992673992673992
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.92399, val_loss: 17.94974
Epoch: 20 | train_loss: 12.02139, val_loss: 12.31222
Epoch: 30 | train_loss: 13.06495, val_loss: 13.04027
Epoch: 40 | train_loss: 14.27400, val_loss: 14.25806


[32m[I 2023-06-18 14:52:49,374][0m Trial 12 finished with value: 0.5429936305732486 and parameters: {'analyzer': 'char_wb', 'ngram_max_range': 5, 'learning_rate': 0.5454015416471906, 'power_t': 0.48181622485777337}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.6699410609037328,
  "recall": 0.4564926372155288,
  "f1": 0.5429936305732486
}
{
  "precision": 0.6699410609037328,
  "recall": 0.4564926372155288,
  "f1": 0.5429936305732486
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 11.95878, val_loss: 11.93220
Epoch: 20 | train_loss: 12.51378, val_loss: 12.59527
Epoch: 30 | train_loss: 12.76273, val_loss: 12.82956
Epoch: 40 | train_loss: 12.79690, val_loss: 12.83432


[32m[I 2023-06-18 14:54:10,402][0m Trial 13 finished with value: 0.0 and parameters: {'analyzer': 'char', 'ngram_max_range': 5, 'learning_rate': 0.3381597352138705, 'power_t': 0.24075640598771536}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 12.14028, val_loss: 12.52554
Epoch: 20 | train_loss: 11.69346, val_loss: 11.93547
Epoch: 30 | train_loss: 16.82909, val_loss: 16.60593
Epoch: 40 | train_loss: 17.94459, val_loss: 17.94974


[32m[I 2023-06-18 14:55:28,054][0m Trial 14 finished with value: 0.5829596412556054 and parameters: {'analyzer': 'word', 'ngram_max_range': 6, 'learning_rate': 0.13274837242373805, 'power_t': 0.38486634952190435}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.6598984771573604,
  "recall": 0.5220883534136547,
  "f1": 0.5829596412556054
}
{
  "precision": 0.6598984771573604,
  "recall": 0.5220883534136547,
  "f1": 0.5829596412556054
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 12.03344, val_loss: 12.42403
Epoch: 20 | train_loss: 12.20942, val_loss: 12.40982
Epoch: 30 | train_loss: 13.50576, val_loss: 13.15417
Epoch: 40 | train_loss: 16.24577, val_loss: 16.31127


[32m[I 2023-06-18 14:56:43,826][0m Trial 15 finished with value: 0.005319148936170213 and parameters: {'analyzer': 'word', 'ngram_max_range': 6, 'learning_rate': 0.09464427098304289, 'power_t': 0.40736459200489356}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.4,
  "recall": 0.002677376171352075,
  "f1": 0.005319148936170213
}
{
  "precision": 0.4,
  "recall": 0.002677376171352075,
  "f1": 0.005319148936170213
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.82870, val_loss: 17.82959
Epoch: 20 | train_loss: 15.71403, val_loss: 15.67375
Epoch: 30 | train_loss: 12.65806, val_loss: 12.81032
Epoch: 40 | train_loss: 17.56663, val_loss: 17.43997


[32m[I 2023-06-18 14:57:59,220][0m Trial 16 finished with value: 0.0 and parameters: {'analyzer': 'word', 'ngram_max_range': 3, 'learning_rate': 0.5412364104348623, 'power_t': 0.4162091766041182}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.93102, val_loss: 17.92616
Epoch: 20 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 30 | train_loss: 14.65377, val_loss: 14.32366
Epoch: 40 | train_loss: 11.63718, val_loss: 12.24964


[32m[I 2023-06-18 14:59:18,949][0m Trial 17 finished with value: 0.3017241379310345 and parameters: {'analyzer': 'word', 'ngram_max_range': 10, 'learning_rate': 0.13756852932208657, 'power_t': 0.3720809424219286}. Best is trial 11 with value: 0.5992673992673992.[0m


{
  "precision": 0.7734806629834254,
  "recall": 0.18741633199464525,
  "f1": 0.3017241379310345
}
{
  "precision": 0.7734806629834254,
  "recall": 0.18741633199464525,
  "f1": 0.3017241379310345
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 20 | train_loss: 11.82028, val_loss: 11.96675
Epoch: 30 | train_loss: 12.74236, val_loss: 13.02174
Epoch: 40 | train_loss: 17.94459, val_loss: 17.94974


[32m[I 2023-06-18 15:00:34,644][0m Trial 18 finished with value: 0.6053584359160029 and parameters: {'analyzer': 'char', 'ngram_max_range': 4, 'learning_rate': 0.34073571363906413, 'power_t': 0.45666361540632616}. Best is trial 18 with value: 0.6053584359160029.[0m


{
  "precision": 0.6593059936908517,
  "recall": 0.5595716198125836,
  "f1": 0.6053584359160029
}
{
  "precision": 0.6593059936908517,
  "recall": 0.5595716198125836,
  "f1": 0.6053584359160029
}


0it [00:00, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 00 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 10 | train_loss: 17.94459, val_loss: 17.94974
Epoch: 20 | train_loss: 12.88921, val_loss: 12.78036
Epoch: 30 | train_loss: 14.45855, val_loss: 14.32042
Epoch: 40 | train_loss: 13.70939, val_loss: 13.57781


[32m[I 2023-06-18 15:01:51,242][0m Trial 19 finished with value: 0.0 and parameters: {'analyzer': 'char', 'ngram_max_range': 4, 'learning_rate': 0.37111223010779426, 'power_t': 0.4973282794124489}. Best is trial 18 with value: 0.6053584359160029.[0m


{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}
{
  "precision": 0.0,
  "recall": 0.0,
  "f1": 0.0
}


In [23]:
# All trials
trials_df = study.trials_dataframe()
trials_df = trials_df.sort_values(["user_attrs_f1"], ascending=False)  # sort by metric
trials_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_analyzer,params_learning_rate,params_ngram_max_range,params_power_t,user_attrs_f1,user_attrs_precision,user_attrs_recall,state
18,18,0.605358,2023-06-18 14:59:19.122475,2023-06-18 15:00:34.644248,0 days 00:01:15.521773,char,0.340736,4,0.456664,0.605358,0.659306,0.559572,COMPLETE
11,11,0.599267,2023-06-18 14:50:15.703302,2023-06-18 14:51:31.864945,0 days 00:01:16.161643,char,0.471002,6,0.493081,0.599267,0.661812,0.547523,COMPLETE
14,14,0.58296,2023-06-18 14:54:10.576382,2023-06-18 14:55:28.054649,0 days 00:01:17.478267,word,0.132748,6,0.384866,0.58296,0.659898,0.522088,COMPLETE
1,1,0.575153,2023-06-18 14:37:09.401631,2023-06-18 14:38:28.879261,0 days 00:01:19.477630,char_wb,0.218223,6,0.193177,0.575153,0.67325,0.502008,COMPLETE
12,12,0.542994,2023-06-18 14:51:32.039331,2023-06-18 14:52:49.374642,0 days 00:01:17.335311,char_wb,0.545402,5,0.481816,0.542994,0.669941,0.456493,COMPLETE


In [24]:
# Best trial
print (f"Best value (f1): {study.best_trial.value}")
print (f"Best hyperparameters: {json.dumps(study.best_trial.params, indent=2)}")

Best value (f1): 0.6053584359160029
Best hyperparameters: {
  "analyzer": "char",
  "ngram_max_range": 4,
  "learning_rate": 0.34073571363906413,
  "power_t": 0.45666361540632616
}


In [25]:
# Save best parameter values
args = {**args.__dict__, **study.best_trial.params}
print (json.dumps(args, indent=2, cls=NumpyEncoder))

{
  "lower": true,
  "stem": false,
  "analyzer": "char",
  "ngram_max_range": 4,
  "alpha": 0.0001,
  "learning_rate": 0.34073571363906413,
  "power_t": 0.45666361540632616,
  "num_epochs": 50,
  "max_features": "log2",
  "min_samples_split": 3,
  "n_estimators": 264,
  "threshold": 1.0
}


## Observation

1. Linear Model is has low accuray than baselines therefore we are not using it.
2. Random forrest algorithm gives better accuracy .