In [0]:
from pyspark.sql.types import DoubleType, StringType, StructType, StructField
# Column layout of the Adult census CSV, in file order.
# Each entry is (column name, is_numeric): numeric columns are read as doubles,
# all others as strings. Every column is declared nullable.
adult_columns = [
    ("age", True),
    ("workclass", False),
    ("fnlwgt", True),
    ("education", False),
    ("education_num", True),
    ("marital_status", False),
    ("occupation", False),
    ("relationship", False),
    ("race", False),
    ("sex", False),
    ("capital_gain", True),
    ("capital_loss", True),
    ("hours_per_week", True),
    ("native_country", False),
    ("income", False),
]

schema = StructType([
    StructField(col_name, DoubleType() if is_numeric else StringType(), True)
    for col_name, is_numeric in adult_columns
])

# Read the CSV with the explicit schema above instead of relying on inference.
census_df = (
    spark.read
    .format("csv")
    .schema(schema)
    .load("/databricks-datasets/adult/adult.data")
)

In [0]:
# Render the loaded census DataFrame as a table for a quick visual sanity check.
census_df.display()

In [0]:
# Number of rows loaded from the CSV.
census_df.count()

In [0]:
# Column names, which should match the field names declared in the schema.
census_df.columns

In [0]:
# (column, type) pairs, to confirm the explicit schema was applied.
census_df.dtypes

In [0]:
# Show the Python class of the loaded object (returned by spark.read above).
type(census_df)

In [0]:
# Split into a large training set and a small hold-out set.
# The fixed seed keeps the split reproducible across runs.
split_weights = [0.99, 0.01]
train_df, test_df = census_df.randomSplit(split_weights, seed=42)

In [0]:
from databricks import automl

In [0]:
# Run an AutoML classification experiment on the training split with
# `income` as the label. The search is given a 5-minute budget.
summary = automl.classify(
    train_df,
    target_col="income",
    timeout_minutes=5,
)

In [0]:
# Print the text representation of the AutoML run summary.
print(summary)

In [0]:
# Model path of the best trial; this is the URI loaded for inference later on.
print(summary.best_trial.model_path)

Let's test the model

In [0]:
import mlflow

# Bring the hold-out split into pandas and separate the label column
# from the feature columns.
test_pdf = test_df.toPandas()
X_test = test_pdf.drop(columns=["income"])
y_test = test_pdf["income"]

In [0]:
# Load the best AutoML trial through MLflow's generic pyfunc interface
# and score the hold-out features.
model_uri = summary.best_trial.model_path
model = mlflow.pyfunc.load_model(model_uri)

predictions = model.predict(X_test)

# Store the predictions next to the true `income` column so both can be
# compared row by row in the rendered table.
test_pdf["income_predicted"] = predictions
display(test_pdf)

In [0]:
import sklearn.metrics

# Reload the best trial using its native scikit-learn flavor; the confusion-matrix
# helpers below take a fitted estimator together with the features and true labels.
model = mlflow.sklearn.load_model(model_uri)

# `sklearn.metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2, so calling it unconditionally fails on current runtimes.
# `ConfusionMatrixDisplay.from_estimator` is the replacement; fall back to the
# legacy helper only on versions that predate it.
confusion_display_cls = getattr(sklearn.metrics, "ConfusionMatrixDisplay", None)
if hasattr(confusion_display_cls, "from_estimator"):
    confusion_display_cls.from_estimator(model, X_test, y_test)
else:
    sklearn.metrics.plot_confusion_matrix(model, X_test, y_test)