In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier
import mlflow, datetime, os, pickle, random, sys
from joblib import dump

# rcv1 = fetch_rcv1()

In [None]:

# pickle.dump(rcv1.data, open('../data/data.pickle', 'wb'))
# pickle.dump(rcv1.target, open('../data/target.pickle', 'wb'))

In [None]:
# Load the cached feature matrix / target vector when BOTH pickle files are
# present; otherwise build them from the scikit-learn wine dataset and cache
# them. Checking the files themselves (not just the directory) means an empty
# or partially populated ../data/ folder falls through to the rebuild branch
# instead of raising FileNotFoundError.
data_pickle_path = '../data/data.pickle'
target_pickle_path = '../data/target.pickle'

if os.path.exists(data_pickle_path) and os.path.exists(target_pickle_path):
    # NOTE(security): pickle.load can execute arbitrary code; only load cache
    # files that this notebook (or another trusted source) produced.
    with open(data_pickle_path, 'rb') as data_file:
        X = pickle.load(data_file)
    with open(target_pickle_path, 'rb') as target_file:
        y = pickle.load(target_file)
else:
    os.makedirs('../data/', exist_ok=True)
    data = load_wine()
    X, y = data.data, data.target
    # Persist the arrays so later runs take the cached branch above.
    with open(data_pickle_path, 'wb') as data_file:
        pickle.dump(X, data_file)
    with open(target_pickle_path, 'wb') as target_file:
        pickle.dump(y, target_file)

In [None]:
# Track runs in the local ./mlruns store, under an experiment whose name is the
# dataset name followed by a yymmdd_HHMMSS timestamp.
mlflow.set_tracking_uri("./mlruns")
dataset_name = "Wine Dataset Test"
current_time = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
experiment_name = dataset_name + "_" + current_time
experiment_id = mlflow.create_experiment(experiment_name)

# Hyperparameters for the gradient-boosted classifier trained inside the run.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "use_label_encoder": False,
    "eval_metric": 'mlogloss',
}

with mlflow.start_run(experiment_id=experiment_id, run_name=dataset_name):
    # Log the dataset name and the two leading dimensions of X with the run.
    n_datapoints = X.shape[0]
    n_dimensions = X.shape[1]
    params = {
        "dataset_name": dataset_name,
        "number_of_datapoints": n_datapoints,
        "number_of_dimensions": n_dimensions
    }
    mlflow.log_params(params)

    # Hold out 30% of the rows for evaluation, using a fixed shuffle seed.
    split_parts = train_test_split(
        X, y, test_size=0.3, shuffle=True, random_state=42
    )
    train_X, test_X, train_y, test_y = split_parts

    # Fit on the training part and predict the held-out part.
    model = XGBClassifier(**xgb_params)
    model.fit(train_X, train_y)
    y_predict = model.predict(test_X)

    # Score the predictions and attach both metrics to the run.
    acc = accuracy_score(test_y, y_predict)
    f1 = f1_score(test_y, y_predict, average="weighted")
    run_metrics = {"Accuracy": acc, "F1 Score": f1}
    mlflow.log_metrics(run_metrics)

    # Save the fitted model to disk, in a file named after the experiment id.
    os.makedirs('../model/', exist_ok=True)
    model_path = f'../model/{experiment_id}.joblib'
    dump(model, model_path)

In [None]:
def test_data_shape():
    """Check that the wine feature matrix has more than 100 rows and exactly 13 columns."""
    feature_shape = load_wine().data.shape
    row_count = feature_shape[0]
    column_count = feature_shape[1]
    assert row_count > 100
    assert column_count == 13

def test_train_test_split_ratio():
    """Check that a 70/30 split keeps over 60% of the rows for training and loses none.

    The dataset is loaded once and reused, rather than once per expression.
    """
    data = load_wine()
    total_rows = len(data.data)
    X_train, X_test, _, _ = train_test_split(data.data, data.target, test_size=0.3)
    # The training part must hold the majority share requested by test_size=0.3.
    assert len(X_train) > 0.6 * total_rows
    # Train and test parts together must account for every row exactly once.
    assert len(X_train) + len(X_test) == total_rows

def test_xgb_training():
    """Smoke test: an XGBClassifier fits the wine data and predicts one label per held-out row."""
    wine = load_wine()
    features_train, features_test, labels_train, labels_test = train_test_split(
        wine.data, wine.target, test_size=0.3
    )
    classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    # Only the number of predictions is checked here, not their accuracy.
    assert len(predictions) == len(labels_test)

def test_model_performance():
    """Check that XGBClassifier exceeds 85% accuracy on a held-out 30% of the wine data.

    The split and the model are seeded so the accuracy is reproducible from run
    to run; an unseeded split makes a fixed threshold assertion flaky. The split
    is stratified so each class appears in both parts in proportion.
    """
    data = load_wine()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data,
        data.target,
        test_size=0.3,
        random_state=42,
        stratify=data.target,
    )
    model = XGBClassifier(
        use_label_encoder=False, eval_metric='mlogloss', random_state=42
    )
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))
    # Reasonable threshold for this dataset
    assert acc > 0.85, f"accuracy {acc:.3f} is not above the 0.85 threshold"