In [3]:
import pandas as pd
pd.options.display.max_columns = None
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from xgboost import XGBClassifier
import mlflow
import datetime
import warnings
warnings.filterwarnings("ignore")


* 'schema_extra' has been renamed to 'json_schema_extra'


In [9]:
# Dataset provenance; both values are logged as parameters on each MLflow run.
data_url, version = "/data/weatherAUS.csv", "v1.0"

In [5]:
from data_processing import transform_data

In [6]:
import os
from getpass import getpass

# MLflow reads the tracking-server credentials from these two environment variables.
# The access token must never be committed with the notebook: reuse it when the
# kernel's environment already provides it, otherwise prompt for it interactively.
# NOTE(review): the token that used to be hardcoded in this cell has been exposed
# with the notebook and should be revoked and regenerated.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "Sahar-dev")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("MLflow tracking access token: ")

In [7]:

# Point MLflow at the remote tracking server and select the experiment for the runs.
# set_experiment stays the last expression so the cell still displays the Experiment.
mlflow.set_tracking_uri(uri="https://dagshub.com/Sahar-dev/weather.mlflow")
mlflow.set_experiment(experiment_name="Final-experiment")

2023/12/01 09:50:06 INFO mlflow.tracking.fluent: Experiment with name 'Final-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/22bca6fafa764f5f9b31797f132bcceb', creation_time=1701420606417, experiment_id='8', last_update_time=1701420606417, lifecycle_stage='active', name='Final-experiment', tags={}>

In [18]:
# Read the data. Forward slashes resolve on every OS; the previous "data\weatherAUS.csv"
# relied on "\w" not being a recognised escape sequence and only worked on Windows.
df = pd.read_csv("data/weatherAUS.csv")

In [19]:
# Shape of the data: (rows, columns) of the frame that was just loaded
df.shape

(145460, 23)

In [20]:
# Cleaning and preprocessing: the result of transform_data is unpacked into the
# train/test features and labels used by every model below.
(X_train, X_test, y_train, y_test) = transform_data(df)

In [21]:
# Turn off scikit-learn autologging; the runs below log params, metrics and models explicitly.
mlflow.sklearn.autolog(disable=True)

In [22]:

# Baseline run: fit LogisticRegression with its default hyper-parameters and record
# parameters, test metrics and the fitted model in MLflow.
with mlflow.start_run(run_name='LogisticRegression'):
    mlflow.set_tag(key="model", value="LogisticRegression")

    # Dataset provenance.
    # NOTE(review): input_rows/input_cols describe `df` as it is when this cell runs;
    # if transform_data modified it in place these are not the raw CSV dimensions -- confirm.
    mlflow.log_params({
        "data_url": data_url,
        "data_version": version,
        "input_rows": df.shape[0],
        "input_cols": df.shape[1],
    })

    # Model fitting and training; the estimator's full parameter set is logged too.
    lr = LogisticRegression()
    params = lr.get_params()
    mlflow.log_params(params)
    lr.fit(X_train, y_train)

    # Names of the training inputs, stored as tags. Plain constants: the former
    # f'{X_train=}'.split('=')[0] rendered the repr of the whole training set
    # only to recover the literal variable name.
    train_features_name = "X_train"
    train_label_name = "y_train"
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)

    # Macro-averaged precision / recall / F1 on the test split
    # (support is None whenever an average is requested).
    predicted = lr.predict(X_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Persist the fitted estimator under the run's "ML_models" artifact path.
    mlflow.sklearn.log_model(lr, artifact_path="ML_models")


In [23]:
# Disable scikit-learn autologging (the same call was already issued before the
# LogisticRegression run and is repeated here ahead of the RandomForest run)
mlflow.sklearn.autolog(disable=True)

In [24]:

# RandomForest run: fit a seeded RandomForestClassifier and record parameters,
# test metrics and the fitted model in MLflow.
with mlflow.start_run(run_name='RandomForest'):
    mlflow.set_tag(key="model", value="RandomForest")

    # Dataset provenance.
    # NOTE(review): input_rows/input_cols describe `df` as it is when this cell runs;
    # if transform_data modified it in place these are not the raw CSV dimensions -- confirm.
    mlflow.log_params({
        "data_url": data_url,
        "data_version": version,
        "input_rows": df.shape[0],
        "input_cols": df.shape[1],
    })

    # random_state is fixed so the forest is reproducible between executions.
    rf = RandomForestClassifier(random_state=5)
    params = rf.get_params()
    mlflow.log_params(params)
    rf.fit(X_train, y_train)

    # Names of the training inputs, stored as tags. Plain constants: the former
    # f'{X_train=}'.split('=')[0] rendered the repr of the whole training set
    # only to recover the literal variable name.
    train_features_name = "X_train"
    train_label_name = "y_train"
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)

    # Macro-averaged precision / recall / F1 on the test split
    # (support is None whenever an average is requested).
    predicted = rf.predict(X_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Persist the fitted estimator under the run's "ML_models" artifact path.
    mlflow.sklearn.log_model(rf, artifact_path="ML_models")


In [25]:
# Disable XGBoost autologging; the XGBoost run below logs params, metrics and the model explicitly
mlflow.xgboost.autolog(disable=True)

In [26]:

# XGBoost run: fit an XGBClassifier with its default hyper-parameters and record
# parameters, test metrics and the fitted model in MLflow.
with mlflow.start_run(run_name='XGBoost'):
    mlflow.set_tag(key="model", value="XGBClassifier")

    # Dataset provenance.
    # NOTE(review): input_rows/input_cols describe `df` as it is when this cell runs;
    # if transform_data modified it in place these are not the raw CSV dimensions -- confirm.
    mlflow.log_params({
        "data_url": data_url,
        "data_version": version,
        "input_rows": df.shape[0],
        "input_cols": df.shape[1],
    })

    # Model fitting and training; the estimator's full parameter set is logged too.
    xg = XGBClassifier()
    params = xg.get_params()
    mlflow.log_params(params)
    xg.fit(X_train, y_train)

    # Names of the training inputs, stored as tags. Plain constants: the former
    # f'{X_train=}'.split('=')[0] rendered the repr of the whole training set
    # only to recover the literal variable name.
    train_features_name = "X_train"
    train_label_name = "y_train"
    mlflow.set_tag(key="train_features_name", value=train_features_name)
    mlflow.set_tag(key="train_label_name", value=train_label_name)

    # Macro-averaged precision / recall / F1 on the test split
    # (support is None whenever an average is requested).
    predicted = xg.predict(X_test)
    precision, recall, fscore, support = score(y_test, predicted, average='macro')
    mlflow.log_metric("Precision_test", precision)
    mlflow.log_metric("Recall_test", recall)
    mlflow.log_metric("F1_score_test", fscore)

    # Saved with the xgboost flavor (not sklearn) under the "ML_models" artifact path.
    mlflow.xgboost.log_model(xg, artifact_path="ML_models")


In [27]:

# Read the logged runs into a pandas DataFrame and pick the run with the best test F1.
# NOTE(review): this searches every experiment on the tracking server, not only
# "Final-experiment", and skips runs whose F1 is exactly 1 -- confirm both are intended.
all_experiments = [exp.experiment_id for exp in mlflow.search_experiments()]
df_mlflow = mlflow.search_runs(
    experiment_ids=all_experiments,
    filter_string="metrics.F1_score_test <1",
)

# Fail with an explicit message instead of a less informative pandas error
# when the search returns no run at all.
if df_mlflow.empty:
    raise RuntimeError(
        "No MLflow run matches 'metrics.F1_score_test <1'; log at least one run first."
    )

# Single .loc[row, column] lookup instead of chained indexing.
best_row = df_mlflow['metrics.F1_score_test'].idxmax()
run_id = df_mlflow.loc[best_row, 'run_id']
print(run_id)

b279f77c54524c388f0da64e3b01586a


In [28]:
# Display every run returned by the search, with its metrics, params and tags
df_mlflow

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.F1_score_test,metrics.Recall_test,metrics.Precision_test,params.scale_pos_weight,params.verbosity,params.multi_strategy,params.enable_categorical,params.max_delta_step,params.max_leaves,params.reg_alpha,params.eval_metric,params.colsample_bylevel,params.max_cat_threshold,params.importance_type,params.n_estimators,params.validate_parameters,params.colsample_bytree,params.callbacks,params.subsample,params.grow_policy,params.reg_lambda,params.data_version,params.n_jobs,params.max_cat_to_onehot,params.learning_rate,params.input_rows,params.colsample_bynode,params.objective,params.random_state,params.feature_types,params.booster,params.data_url,params.interaction_constraints,params.missing,params.num_parallel_tree,params.sampling_method,params.gamma,params.early_stopping_rounds,params.monotone_constraints,params.min_child_weight,params.device,params.input_cols,params.base_score,params.max_depth,params.max_bin,params.tree_method,params.bootstrap,params.min_weight_fraction_leaf,params.max_features,params.min_samples_split,params.max_leaf_nodes,params.min_impurity_decrease,params.oob_score,params.criterion,params.verbose,params.class_weight,params.warm_start,params.min_samples_leaf,params.ccp_alpha,params.max_samples,params.penalty,params.multi_class,params.max_iter,params.tol,params.dual,params.l1_ratio,params.C,params.fit_intercept,params.solver,params.intercept_scaling,tags.train_label_name,tags.train_features_name,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.log-model.history,tags.mlflow.source.git.commit,tags.mlflow.source.type,tags.model
0,b279f77c54524c388f0da64e3b01586a,8,FINISHED,mlflow-artifacts:/22bca6fafa764f5f9b31797f132b...,2023-12-01 09:03:04.646000+00:00,2023-12-01 09:03:16.083000+00:00,0.760263,0.73446,0.805663,,,,False,,,,,,,,,,,,,,,v1.0,,,,145460,,binary:logistic,,,,/data/weatherAUS.csv,,,,,,,,,,25,,,,,,,,,,,,,,,,,,,,,,,,,,,,,y_train,X_train,sahar-dev,XGBoost,c:\Users\sahas\anaconda3\envs\mlops\lib\site-p...,"[{""run_id"": ""b279f77c54524c388f0da64e3b01586a""...",f60408ba26398d9ba4ad8adcf2b72d1b9e022965,LOCAL,XGBClassifier
1,c585307d3fdf495683e63a6cbe58987b,8,FINISHED,mlflow-artifacts:/22bca6fafa764f5f9b31797f132b...,2023-12-01 08:57:00.657000+00:00,2023-12-01 09:02:36.007000+00:00,0.756074,0.728926,0.806298,,,,,,,,,,,,100.0,,,,,,,v1.0,,,,145460,,,5.0,,,/data/weatherAUS.csv,,,,,,,,,,25,,,,,True,0.0,sqrt,2.0,,0.0,False,gini,0.0,,False,1.0,0.0,,,,,,,,,,,,y_train,X_train,sahar-dev,RandomForest,c:\Users\sahas\anaconda3\envs\mlops\lib\site-p...,"[{""run_id"": ""c585307d3fdf495683e63a6cbe58987b""...",f60408ba26398d9ba4ad8adcf2b72d1b9e022965,LOCAL,RandomForest
2,a8c2426239c74818a0d598d52a4a778a,8,FINISHED,mlflow-artifacts:/22bca6fafa764f5f9b31797f132b...,2023-12-01 08:55:06.030000+00:00,2023-12-01 08:55:21.285000+00:00,0.738376,0.711099,0.793233,,,,,,,,,,,,,,,,,,,v1.0,,,,145460,,,,,,/data/weatherAUS.csv,,,,,,,,,,25,,,,,,,,,,,,,0.0,,False,,,,l2,auto,100.0,0.0001,False,,1.0,True,lbfgs,1.0,y_train,X_train,sahar-dev,LogisticRegression,c:\Users\sahas\anaconda3\envs\mlops\lib\site-p...,"[{""run_id"": ""a8c2426239c74818a0d598d52a4a778a""...",f60408ba26398d9ba4ad8adcf2b72d1b9e022965,LOCAL,LogisticRegression


In [29]:

# Load the model logged by the best run. The URI addresses that run's "ML_models"
# artifact (runs:/<run_id>/...); it does not go through a model-registry stage.
import mlflow.pyfunc

logged_model = f'runs:/{run_id}/ML_models'

# Load it through the generic PyFuncModel wrapper and show its metadata.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)
print(loaded_model)

# Predict on the test features; kept as the last expression so the result is displayed.
loaded_model.predict(data=X_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

mlflow.pyfunc.loaded_model:
  artifact_path: ML_models
  flavor: mlflow.xgboost
  run_id: b279f77c54524c388f0da64e3b01586a



array([1, 0, 1, ..., 0, 1, 0])