# Classification Example

This notebook uses purchase order data to train a classifier that predicts whether a purchase order will be accepted or rejected.

## Install

In [None]:
# !pip install fedml-databricks

In [None]:
# !pip install flaml

In [None]:
# !pip install "flaml[automl]"

In [None]:
# If the following error occurs: TypeError: XGBClassifier.fit() got an unexpected keyword argument 'callbacks'
# it is caused by a version conflict between FLAML and xgboost: https://github.com/microsoft/FLAML/issues/1314

# please run the following pip install command to fix the above error:
# !pip install --force-reinstall xgboost==2.0.3 "numpy<2"

## Imports

In [None]:
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split

from helpers.fedml import get_data
from helpers.automl import infer_problem_type, get_accuracy, get_predictions, BINARY, MULTICLASS, CLASSIFICATION

## Data Loading and Prep

In [None]:
# inputs for this run (hard-coded in the notebook)
# name of the view that get_data loads into a DataFrame
view_name = "sampled_purchase_requisition_view"
# column of the DataFrame used as the prediction target
label = "DecisionFlag"
# config file handed to get_data
# NOTE(review): presumably holds the database connection settings -- confirm in helpers.fedml
config_path = "./config/db_connection.json"

In [None]:
# create df from view name
# get_data also returns `le` (passed to get_predictions further down) and
# `encoded_cols` (checked further down to see whether the label column was encoded)
# NOTE(review): `le` is presumably a fitted label encoder -- confirm in helpers.fedml
df, le, encoded_cols = get_data(view_name, config_path)

# if you are using the csv provided:
# df, le, encoded_cols = get_data(view_name, config_path, csv_path=f"./data/{view_name}.csv")

In [None]:
# remove the extra "column0" column from the dataset
# errors="ignore" keeps this cell safe to re-run (or to run on data that
# never had the column) instead of raising KeyError on the second execution
df.drop(columns=["column0"], errors="ignore", inplace=True)

In [None]:
# preview the first rows of df
df.head()

In [None]:
# separate the target column from the feature columns
y = df[label]
X = df.drop(label, axis="columns")

In [None]:
# for reverse translation on predictions:
# record whether the label column is one of the encoded columns
label_encoded = label in encoded_cols

In [None]:
# split the data into training and testing sets
# test_size=0.2 holds out 20% of the rows for testing;
# random_state=42 fixes the shuffle so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## AutoML

In [None]:
# infer the problem type from the training labels;
# binary and multiclass problems are both handled as the classification task
task = infer_problem_type(y_train)
if task in (BINARY, MULTICLASS):
    task = CLASSIFICATION

In [None]:
# build the AutoML searcher and fit it on the training split
# (time_budget caps the search; FLAML interprets it in seconds)
automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task=task,
    time_budget=10,
)

In [None]:
# name of best model
# (read from the fitted AutoML object; the bare expression below displays it as the cell output)
best_model_name = automl.best_estimator
best_model_name

In [None]:
# grab model accuracy, evaluated on the held-out test split
# NOTE(review): task is passed to the helper, presumably so it can pick a
# task-appropriate metric -- confirm in helpers.automl
accuracy = get_accuracy(task, automl, X_test, y_test)
accuracy

In [None]:
# predict on the held-out test split
# `le` and `label_encoded` are passed along, presumably so the helper can
# reverse-translate encoded labels -- confirm in helpers.automl
predictions, actual_values = get_predictions(automl, X_test, y_test, le, label_encoded)

In [None]:
# line up predictions against the actual values and flag the rows where they match
display_df = pd.DataFrame(
    {"predictions": predictions, "actual_values": actual_values}
).assign(Correct=lambda frame: frame["predictions"] == frame["actual_values"])
display_df.head(5)