***GENERATED CODE FOR regressionmedicare PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read CSV data from HDFS into Spark and write DataFrames back out as CSV.

    Both entry points are called on the class itself
    (``HDFSConnector.fetch(spark, config)``) and take no ``self``, so they are
    declared as static methods.
    """

    @staticmethod
    def _parse_config(config):
        """Return the connector settings as a dict without executing any code.

        Args:
            config: Either a dict, or a string holding a Python dict literal
                such as ``"{'url': '/path/file.csv', 'delimiter': ','}"``.

        Returns:
            The settings dict (the same object when a dict was passed in).

        Raises:
            ValueError, SyntaxError: If the string is not a plain literal.
        """
        # Local import keeps this block self-contained.
        import ast

        if isinstance(config, dict):
            return config
        # NOTE: this deliberately replaces the former ``eval(config)``.
        # literal_eval accepts the same dict-literal strings but refuses
        # function calls, attribute access and any other executable expression.
        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Load the CSV file named by ``config['url']`` from HDFS.

        The HDFS host and port come from the ``HDFS_SERVER`` and ``HDFS_PORT``
        environment variables.

        Args:
            spark: Active SparkSession.
            config: Connector settings (see ``_parse_config``); only ``url``
                is read here.

        Returns:
            The loaded DataFrame (header row used, schema inferred).

        Raises:
            KeyError: If an environment variable or the ``url`` key is missing.
        """
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        # Preview the first two rows; `display` is not defined in this file and
        # is presumably supplied by the notebook environment.
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed path.

        Args:
            df: DataFrame to write.
            spark: Unused; kept so existing callers keep working.
            config: Connector settings (see ``_parse_config``); reads
                ``is_header``, ``delimiter`` and ``url``.

        Returns:
            Whatever ``DataFrameWriter.save`` returns.
        """
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
        # NOTE(review): the target is "<timestamp>_ <url>" with a space before
        # the url. That looks unintentional but is preserved so output
        # locations do not move; confirm before changing.
        target = "%s %s" % (stamp + "_", settings['url'])
        return df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"]).save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Fill missing values in the columns of a Spark DataFrame.

    Every ``replaceBy*`` method takes ``feature`` (an object with a ``name``
    attribute, e.g. an entry of ``df.schema.fields``) and a DataFrame, and
    returns a DataFrame in which nulls of that column have been filled.
    ``replaceNullValues`` selects the method from a per-feature configuration.
    """

    # (keyword searched for in a feature's "replaceby" setting, handler name).
    # Checked in this order; the first keyword found wins.
    _REPLACERS = (
        ("mean", "replaceByMean"),
        ("max", "replaceByMax"),
        ("min", "replaceByMin"),
        ("stddev", "replaceByStandardDeviation"),
        ("random", "replaceDateRandomly"),
    )

    def cleanValueForFE(self, value):
        """Normalise a collected value before it is used as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` for anything whose
        ``str()`` is ``'nan'``, and the value itself otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _statOverCompleteRows(self, feature, df, aggregate):
        """Apply ``aggregate`` to the column, using only rows kept by ``df.dropna()``."""
        row = df.dropna().select(
            aggregate(col(feature.name)).alias('value')).collect()[0]
        return self.cleanValueForFE(row['value'])

    def _replaceBlank(self, feature, df, fillValue):
        """Replace cells equal to a single space (" ") with ``fillValue``."""
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill nulls of the column with its mean. ``mean_`` is unused."""
        meanValue = self._statOverCompleteRows(feature, df, mean)
        # Unlike the other replaceBy* methods, no blank-string replacement is
        # applied here: the previous implementation built one (with an Integer
        # cast) but discarded its result, so the output is unchanged.
        return df.fillna(meanValue, subset=[feature.name])

    def replaceByMax(self, feature, df, max_=-1):
        """Fill nulls and " " cells of the column with its maximum. ``max_`` is unused."""
        maxValue = self._statOverCompleteRows(feature, df, max)
        df = df.fillna(maxValue, subset=[feature.name])
        return self._replaceBlank(feature, df, maxValue)

    def replaceByMin(self, feature, df, min_=-1):
        """Fill nulls and " " cells of the column with its minimum. ``min_`` is unused."""
        minValue = self._statOverCompleteRows(feature, df, min)
        df = df.fillna(minValue, subset=[feature.name])
        return self._replaceBlank(feature, df, minValue)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill nulls and " " cells with the column's standard deviation. ``stddev_`` is unused."""
        stddevValue = self._statOverCompleteRows(feature, df, stddev)
        df = df.fillna(stddevValue, subset=[feature.name])
        return self._replaceBlank(feature, df, stddevValue)

    def replaceDateRandomly(self, feature, df):
        """Fill nulls and " " cells with the first non-null value of the column.

        Despite the name, nothing random happens: the fill value is whatever
        ``head(1)`` returns for the non-null rows. If the column has no
        non-null value there is nothing to fill with, so ``df`` is returned
        unchanged instead of raising ``IndexError``.
        """
        firstRows = df.where(col(feature.name).isNotNull()).head(1)
        if not firstRows:
            return df
        fillValue = self.cleanValueForFE(firstRows[0][feature.name])
        # Nulls are filled with the string form; blanks with the raw value.
        df = df.fillna(str(fillValue), subset=[feature.name])
        return self._replaceBlank(feature, df, fillValue)

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement to every matching column.

        Args:
            fList: Iterable of dicts with a ``"feature"`` name and a
                ``"replaceby"`` strategy string.
            df: DataFrame to clean.

        Returns:
            The cleaned DataFrame.

        A configured feature matches every column whose name *contains* it
        (substring test), and the strategy is likewise found by substring.
        The column list is read once, before any replacement runs.
        """
        schemaFields = df.schema.fields
        for featureObj in fList:
            for field in schemaFields:
                if featureObj["feature"] not in field.name:
                    continue
                for keyword, handlerName in self._REPLACERS:
                    if keyword in featureObj["replaceby"]:
                        df = getattr(self, handlerName)(field, df)
                        break
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column holding the label index.

    Args:
        df: Input Spark DataFrame.
        params: Dict whose ``"feature"`` entry names the column to index.
        transformationData: Unused; accepted so existing callers keep working.

    Returns:
        The DataFrame with the index column appended. Nulls in the source
        column are replaced by ``''`` before fitting. When the index column
        has at most four distinct values it is cast to ``IntegerType``.
    """
    # Largest number of distinct indices for which the column is cast to int.
    maxDistinctForIntCast = 4

    feature = params["feature"]
    outcol = feature + "_stringindexer"

    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count on the cluster instead of collecting every distinct value to the
    # driver just to take its length.
    distinctCount = indexed.select(outcol).distinct().count()
    if distinctCount <= maxDistinctForIntCast:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


def BinarizerTransform(df, params, transformationData={}):
    """Add a ``<feature>_binarizer`` column by thresholding a numeric feature.

    Args:
        df: Input Spark DataFrame.
        params: Dict whose ``'feature'`` entry names the column to binarize.
        transformationData: Dict providing the ``'threshold'`` to split on.

    Returns:
        The DataFrame with the source column cast to double, nulls replaced
        by 0.0 and rounded to two decimals, plus the binarized output column.
    """
    feature = params['feature']
    outcol = feature + "_binarizer"

    # Cast to double through a temporary column, then give it the original name.
    casted = df.withColumn("feature_cast", df[feature].cast("double"))
    casted = casted.drop(feature)
    casted = casted.withColumnRenamed("feature_cast", feature)

    # The binarizer needs a value in every row.
    casted = casted.fillna({feature: 0.0})

    threshold = float(transformationData['threshold'])
    binarized = Binarizer(
        threshold=threshold, inputCol=feature, outputCol=outcol).transform(casted)

    return binarized.withColumn(feature, round(binarized[feature], 2))


class TransformationMain:
    """Entry point of the generated feature-engineering stage.

    ``run`` takes no ``self`` and is invoked on the class itself
    (``TransformationMain.run(df, config)``).
    """
    # TODO: change df argument in run with following
    def run(transformationDF, config):
        """Impute nulls, then apply the hard-coded String Indexer / Binarizer steps.

        Args:
            transformationDF: Spark DataFrame to transform.
            config: JSON string; its "FE" entry is the list of per-feature
                settings passed to ``CleanseData.replaceNullValues``.

        Returns:
            The transformed DataFrame. Each transform appends a
            ``<feature>_stringindexer`` or ``<feature>_binarizer`` column and
            the source column is dropped right after, so only the derived
            column remains for each listed feature.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        # Null handling is driven by the runtime config; the transforms below
        # are baked in at generation time.
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)
        # String-valued features: index, then drop the original column.
        # Of each metadata dict, StringIndexerTransform reads only 'feature'.
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Brnd_Name', 'transformation_label': 'String Indexer'}], 'feature': 'Brnd_Name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
                                                  'count': '500', 'mean': '', 'stddev': '', 'min': 'Reditrex', 'max': 'Sildenafil Citrate', 'missing': '0', 'distinct': '156'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'Brnd_Name'}, {'feature_label': 'Brnd_Name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('Brnd_Name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Gnrc_Name', 'transformation_label': 'String Indexer'}], 'feature': 'Gnrc_Name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'Amifampridine', 'max': 'Zidovudine', 'missing': '0', 'distinct': '136'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'Gnrc_Name'}, {'feature_label': 'Gnrc_Name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('Gnrc_Name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Mftr_Name', 'transformation_label': 'String Indexer'}], 'feature': 'Mftr_Name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'AHP', 'max': 'Zydus Pharmaceu', 'missing': '0', 'distinct': '161'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'Mftr_Name'}, {'feature_label': 'Mftr_Name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('Mftr_Name')
        # Real-valued features: binarize against the generated threshold, then
        # drop the original column. BinarizerTransform reads 'feature' from the
        # first dict and 'threshold' from the second.
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Spndng_2018', 'threshold': 30630580.598, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Spndng_2018', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '344', 'mean': '60110197.87', 'stddev': '326910796.59', 'min': '728.01', 'max': '4.0648719156E9', 'missing': '156'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Spndng_2018'}, {'feature_label': 'Tot_Spndng_2018', 'threshold': 30630580.598, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Spndng_2018')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Dsg_Unts_2018', 'threshold': 13991445.868, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Dsg_Unts_2018', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '372', 'mean': '13873302.98', 'stddev': '64817161.52', 'min': '94.0', 'max': '8.1386729166E8', 'missing': '128'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Dsg_Unts_2018'}, {'feature_label': 'Tot_Dsg_Unts_2018', 'threshold': 13991445.868, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Dsg_Unts_2018')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Spndng_2019', 'threshold': 34374080.624, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Spndng_2019', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '375', 'mean': '61150178.07', 'stddev': '363129978.81', 'min': '204.15', 'max': '4.6730950516E9', 'missing': '125'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Spndng_2019'}, {'feature_label': 'Tot_Spndng_2019', 'threshold': 34374080.624, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Spndng_2019')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Dsg_Unts_2019', 'threshold': 12462559.202, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Dsg_Unts_2019', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '375', 'mean': '15256592.29', 'stddev': '70309312.78', 'min': '281.0', 'max': '8.6966374658E8', 'missing': '90'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Dsg_Unts_2019'}, {'feature_label': 'Tot_Dsg_Unts_2019', 'threshold': 12462559.202, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Dsg_Unts_2019')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Spndng_2020', 'threshold': 26826835.094, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Spndng_2020', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '448', 'mean': '26826835.6', 'stddev': '108913441.27', 'min': '100.65', 'max': '1.1604749027E9', 'missing': '88'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Spndng_2020'}, {'feature_label': 'Tot_Spndng_2020', 'threshold': 26826835.094, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Spndng_2020')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Dsg_Unts_2020', 'threshold': 14701030.716, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Dsg_Unts_2020', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '412', 'mean': '15374680.61', 'stddev': '77160421.57', 'min': '337.0', 'max': '1.090258137E9', 'missing': '88'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Dsg_Unts_2020'}, {'feature_label': 'Tot_Dsg_Unts_2020', 'threshold': 14701030.716, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Dsg_Unts_2020')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Avg_Spnd_Per_Bene_2020', 'threshold': 10986.966, 'transformation_label': 'Binarizer'}], 'feature': 'Avg_Spnd_Per_Bene_2020', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '438', 'mean': '9109.54', 'stddev': '43990.13', 'min': '1.7768061367', 'max': '457905.58608', 'missing': '99'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Avg_Spnd_Per_Bene_2020'}, {'feature_label': 'Avg_Spnd_Per_Bene_2020', 'threshold': 10986.966, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Avg_Spnd_Per_Bene_2020')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Spndng_2021', 'threshold': 41645685.852, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Spndng_2021', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '462', 'mean': '59173897.67', 'stddev': '407762257.58', 'min': '92.79', 'max': '5.893547689E9', 'missing': '25'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Spndng_2021'}, {'feature_label': 'Tot_Spndng_2021', 'threshold': 41645685.852, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Spndng_2021')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Dsg_Unts_2021', 'threshold': 14551274.198, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Dsg_Unts_2021', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '462', 'mean': '14870273.72', 'stddev': '82764971.76', 'min': '101.0', 'max': '1.321691809E9', 'missing': '38'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Dsg_Unts_2021'}, {'feature_label': 'Tot_Dsg_Unts_2021', 'threshold': 14551274.198, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Dsg_Unts_2021')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Avg_Spnd_Per_Bene_2021', 'threshold': 12806.038, 'transformation_label': 'Binarizer'}], 'feature': 'Avg_Spnd_Per_Bene_2021', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '464', 'mean': '12202.59', 'stddev': '64402.04', 'min': '2.1564227642', 'max': '704565.9', 'missing': '36'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Avg_Spnd_Per_Bene_2021'}, {'feature_label': 'Avg_Spnd_Per_Bene_2021', 'threshold': 12806.038, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Avg_Spnd_Per_Bene_2021')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Spndng_2022', 'threshold': 45679210.582, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Spndng_2022', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '500', 'mean': '25178948.99', 'stddev': '100729563.62', 'min': '33.06', 'max': '9.4742593983E8', 'missing': '0'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Spndng_2022'}, {'feature_label': 'Tot_Spndng_2022', 'threshold': 45679210.582, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Spndng_2022')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Tot_Dsg_Unts_2022', 'threshold': 10188193.582, 'transformation_label': 'Binarizer'}], 'feature': 'Tot_Dsg_Unts_2022', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '500', 'mean': '10188193.71', 'stddev': '52195799.71', 'min': '224.0', 'max': '7.3983309035E8', 'missing': '0'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Tot_Dsg_Unts_2022'}, {'feature_label': 'Tot_Dsg_Unts_2022', 'threshold': 10188193.582, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Tot_Dsg_Unts_2022')
        transformationDF = BinarizerTransform(transformationDF, {'transformationsData': [{'feature_label': 'Avg_Spnd_Per_Bene_2022', 'threshold': 13774.69, 'transformation_label': 'Binarizer'}], 'feature': 'Avg_Spnd_Per_Bene_2022', 'type': 'real', 'selected': 'True', 'replaceby': 'mean', 'stats': {
            'count': '475', 'mean': '13775.18', 'stddev': '46062.99', 'min': '10.154306476', 'max': '488333.77705', 'missing': '13'}, 'transformation': [{'transformation': 'Binarizer', 'selectedAsDefault': 1}], 'updatedLabel': 'Avg_Spnd_Per_Bene_2022'}, {'feature_label': 'Avg_Spnd_Per_Bene_2022', 'threshold': 13774.69, 'transformation_label': 'Binarizer'})
        transformationDF = transformationDF.drop('Avg_Spnd_Per_Bene_2022')
        # Preview the first two rows; `display` is not defined in this file and
        # is presumably supplied by the notebook environment.
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pyspark


def functionClassification(sparkDF, listOfFeatures, label):
    """Fit a TPOT classifier on a Spark DataFrame and report held-out accuracy.

    Args:
        sparkDF: Spark DataFrame holding the feature columns and the label.
        listOfFeatures: Names of the columns used as model inputs.
        label: Name of the target column.

    Returns:
        Dict with the fitted ``model``, the held-out ``X_test`` / ``y_test``
        arrays, the ``label`` name and ``columnNames`` (the feature list).
    """
    # The frame is persisted before conversion and is not unpersisted here.
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    frame = sparkDF.toPandas()

    features = frame.drop(label, axis=1)[listOfFeatures].values
    target = frame[label].values

    # 10% of the rows are held out, with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=1, test_size=0.1)

    searchSettings = dict(
        verbosity=3,
        n_jobs=-1,
        generations=10,
        max_time_mins=5,
        population_size=15,
        use_dask=True,
    )
    tpotModel = TPOTClassifier(**searchSettings)
    tpotModel.fit(X_train, y_train)

    accuracy = tpotModel.score(X_test, y_test)
    display(" Accuracy of Model : %s" % accuracy)

    return {
        'model': tpotModel,
        'X_test': X_test,
        'y_test': y_test,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# As generated, the session runs against the 'local[1]' master; edit the
# master URL here to target a real cluster.
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run regressionmedicareHooks.ipynb
try:
	#sourcePreExecutionHook()

	# The connector config is passed as a Python dict-literal string;
	# HDFSConnector.fetch reads only its 'url' entry, and takes the HDFS host
	# and port from the HDFS_SERVER / HDFS_PORT environment variables.
	medicarepartdspendingbydrug = HDFSConnector.fetch(spark, "{'url': '/FileStore/platform/uploadedSourceFiles/Medicare_Part_D_Spending_by_Drug_2022.csv', 'filename': 'Medicare_Part_D_Spending_by_Drug_2022.csv', 'delimiter': ',', 'file_type': 'Delimeted', 'is_header': 'Use Header Line', 'domain': 'http://172.31.59.158', 'port': '40070', 'dirPath': '/FileStore/platform', 'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}")
	#sourcePostExecutionHook(medicarepartdspendingbydrug)

# Failures are logged, not re-raised: if the fetch fails,
# `medicarepartdspendingbydrug` is never assigned by this cell.
except Exception as ex: 
	logging.error(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run regressionmedicareHooks.ipynb
# Apply the generated feature-engineering spec to the fetched DataFrame.
try:
	#transformationPreExecutionHook()

	# The spec is a dict serialised with json.dumps. Each entry under "FE"
	# names one source column ("feature"), its type, the "replaceby" strategy,
	# the profiling "stats" captured at generation time, and the
	# transformation selected for it: "String Indexer" for the three name
	# columns, "Binarizer" with a per-column "threshold", or "novalue".
	# NOTE(review): how TransformationMain.run interprets these fields is not
	# visible in this cell -- confirm against its definition before editing.
	regressionmedicareautofe = TransformationMain.run(medicarepartdspendingbydrug,json.dumps( {"FE": [{"transformationsData": [{"feature_label": "Brnd_Name", "transformation_label": "String Indexer"}], "feature": "Brnd_Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Reditrex", "max": "Sildenafil Citrate", "missing": "0", "distinct": "156"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Brnd_Name"}, {"transformationsData": [{"feature_label": "Gnrc_Name", "transformation_label": "String Indexer"}], "feature": "Gnrc_Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Amifampridine", "max": "Zidovudine", "missing": "0", "distinct": "136"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Gnrc_Name"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Mftr", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.49", "stddev": "2.45", "min": "1", "max": "34", "missing": "0"}, "updatedLabel": "Tot_Mftr"}, {"transformationsData": [{"feature_label": "Mftr_Name", "transformation_label": "String Indexer"}], "feature": "Mftr_Name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AHP", "max": "Zydus Pharmaceu", "missing": "0", "distinct": "161"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Mftr_Name"}, {"transformationsData": [{"feature_label": "Tot_Spndng_2018", "threshold": 30630580.598, "transformation_label": "Binarizer"}], "feature": "Tot_Spndng_2018", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "344", "mean": "60110197.87", 
"stddev": "326910796.59", "min": "728.01", "max": "4.0648719156E9", "missing": "156"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Spndng_2018"}, {"transformationsData": [{"feature_label": "Tot_Dsg_Unts_2018", "threshold": 13991445.868, "transformation_label": "Binarizer"}], "feature": "Tot_Dsg_Unts_2018", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "372", "mean": "13873302.98", "stddev": "64817161.52", "min": "94.0", "max": "8.1386729166E8", "missing": "128"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Dsg_Unts_2018"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Clms_2018", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "344", "mean": "240669.67", "stddev": "1045291.78", "min": "13", "max": "13699428", "missing": "156"}, "updatedLabel": "Tot_Clms_2018"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Benes_2018", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "374", "mean": "64854.97", "stddev": "245992.14", "min": "11", "max": "2522897", "missing": "126"}, "updatedLabel": "Tot_Benes_2018"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Dsg_Unt_Wghtd_2018", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "344", "mean": "134.73", "stddev": "599.42", "min": "0.0993199294", "max": "5883.2772926", "missing": "156"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Dsg_Unt_Wght..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Clm_2018", "type": "real", "selected": "True", "replaceby": 
"mean", "stats": {"count": "372", "mean": "1775.81", "stddev": "7371.41", "min": "1.3838099174", "max": "60858.160656", "missing": "156"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Clm_2018"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Bene_2018", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "336", "mean": "8181.56", "stddev": "26711.67", "min": "11.555714286", "max": "232096.95711", "missing": "164"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Bene_2018"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Outlier_Flag_2018", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "372", "mean": "0.05", "stddev": "0.23", "min": "0", "max": "1", "missing": "156"}, "updatedLabel": "Outlier_Flag_2018"}, {"transformationsData": [{"feature_label": "Tot_Spndng_2019", "threshold": 34374080.624, "transformation_label": "Binarizer"}], "feature": "Tot_Spndng_2019", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "375", "mean": "61150178.07", "stddev": "363129978.81", "min": "204.15", "max": "4.6730950516E9", "missing": "125"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Spndng_2019"}, {"transformationsData": [{"feature_label": "Tot_Dsg_Unts_2019", "threshold": 12462559.202, "transformation_label": "Binarizer"}], "feature": "Tot_Dsg_Unts_2019", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "375", "mean": "15256592.29", "stddev": "70309312.78", "min": "281.0", "max": "8.6966374658E8", "missing": "90"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Dsg_Unts_2019"}, {"transformationsData": 
[{"transformation_label": "novalue"}], "feature": "Tot_Clms_2019", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "375", "mean": "258297.0", "stddev": "1175361.45", "min": "12", "max": "13854316", "missing": "125"}, "updatedLabel": "Tot_Clms_2019"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Benes_2019", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "367", "mean": "80456.45", "stddev": "382160.06", "min": "12", "max": "3853082", "missing": "98"}, "updatedLabel": "Tot_Benes_2019"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Dsg_Unt_Wghtd_2019", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "410", "mean": "158.06", "stddev": "898.41", "min": "0.0057110463", "max": "9377.56106", "missing": "90"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Dsg_Unt_Wght..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Clm_2019", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "410", "mean": "1687.26", "stddev": "6898.05", "min": "1.3625274177", "max": "60985.811474", "missing": "90"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Clm_2019"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Bene_2019", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "364", "mean": "10144.23", "stddev": "40915.4", "min": "5.987", "max": "477066.32724", "missing": "136"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Bene_2019"}, {"transformationsData": 
[{"transformation_label": "novalue"}], "feature": "Outlier_Flag_2019", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "375", "mean": "0.07", "stddev": "0.26", "min": "0", "max": "1", "missing": "90"}, "updatedLabel": "Outlier_Flag_2019"}, {"transformationsData": [{"feature_label": "Tot_Spndng_2020", "threshold": 26826835.094, "transformation_label": "Binarizer"}], "feature": "Tot_Spndng_2020", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "448", "mean": "26826835.6", "stddev": "108913441.27", "min": "100.65", "max": "1.1604749027E9", "missing": "88"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Spndng_2020"}, {"transformationsData": [{"feature_label": "Tot_Dsg_Unts_2020", "threshold": 14701030.716, "transformation_label": "Binarizer"}], "feature": "Tot_Dsg_Unts_2020", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "412", "mean": "15374680.61", "stddev": "77160421.57", "min": "337.0", "max": "1.090258137E9", "missing": "88"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Dsg_Unts_2020"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Clms_2020", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "448", "mean": "226201.41", "stddev": "1030819.18", "min": "11", "max": "12298580", "missing": "52"}, "updatedLabel": "Tot_Clms_2020"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Benes_2020", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "402", "mean": "75829.24", "stddev": "373469.43", "min": "11", "max": "4009284", "missing": "98"}, 
"updatedLabel": "Tot_Benes_2020"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Dsg_Unt_Wghtd_2020", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "448", "mean": "190.21", "stddev": "1032.97", "min": "0.00680288", "max": "9168.8986612", "missing": "88"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Dsg_Unt_Wght..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Clm_2020", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "448", "mean": "1739.94", "stddev": "6830.28", "min": "1.4935169988", "max": "63090.611893", "missing": "88"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Clm_2020"}, {"transformationsData": [{"feature_label": "Avg_Spnd_Per_Bene_2020", "threshold": 10986.966, "transformation_label": "Binarizer"}], "feature": "Avg_Spnd_Per_Bene_2020", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "438", "mean": "9109.54", "stddev": "43990.13", "min": "1.7768061367", "max": "457905.58608", "missing": "99"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Bene_2020"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Outlier_Flag_2020", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "448", "mean": "0.07", "stddev": "0.25", "min": "0", "max": "1", "missing": "52"}, "updatedLabel": "Outlier_Flag_2020"}, {"transformationsData": [{"feature_label": "Tot_Spndng_2021", "threshold": 41645685.852, "transformation_label": "Binarizer"}], "feature": "Tot_Spndng_2021", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "462", "mean": "59173897.67", "stddev": "407762257.58", "min": 
"92.79", "max": "5.893547689E9", "missing": "25"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Spndng_2021"}, {"transformationsData": [{"feature_label": "Tot_Dsg_Unts_2021", "threshold": 14551274.198, "transformation_label": "Binarizer"}], "feature": "Tot_Dsg_Unts_2021", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "462", "mean": "14870273.72", "stddev": "82764971.76", "min": "101.0", "max": "1.321691809E9", "missing": "38"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Dsg_Unts_2021"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Clms_2021", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "475", "mean": "224282.77", "stddev": "1073339.49", "min": "11", "max": "12787802", "missing": "25"}, "updatedLabel": "Tot_Clms_2021"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Benes_2021", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "441", "mean": "71320.02", "stddev": "355567.33", "min": "12", "max": "4867268", "missing": "36"}, "updatedLabel": "Tot_Benes_2021"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Dsg_Unt_Wghtd_2021", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "475", "mean": "188.13", "stddev": "1087.75", "min": "0.007185246", "max": "10355.826219", "missing": "38"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Dsg_Unt_Wght..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Clm_2021", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "475", "mean": 
"1731.92", "stddev": "6979.23", "min": "1.6894267516", "max": "64661.40901", "missing": "25"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Clm_2021"}, {"transformationsData": [{"feature_label": "Avg_Spnd_Per_Bene_2021", "threshold": 12806.038, "transformation_label": "Binarizer"}], "feature": "Avg_Spnd_Per_Bene_2021", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "464", "mean": "12202.59", "stddev": "64402.04", "min": "2.1564227642", "max": "704565.9", "missing": "36"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Bene_2021"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Outlier_Flag_2021", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "475", "mean": "0.08", "stddev": "0.27", "min": "0", "max": "1", "missing": "39"}, "updatedLabel": "Outlier_Flag_2021"}, {"transformationsData": [{"feature_label": "Tot_Spndng_2022", "threshold": 45679210.582, "transformation_label": "Binarizer"}], "feature": "Tot_Spndng_2022", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "25178948.99", "stddev": "100729563.62", "min": "33.06", "max": "9.4742593983E8", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Spndng_2022"}, {"transformationsData": [{"feature_label": "Tot_Dsg_Unts_2022", "threshold": 10188193.582, "transformation_label": "Binarizer"}], "feature": "Tot_Dsg_Unts_2022", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "10188193.71", "stddev": "52195799.71", "min": "224.0", "max": "7.3983309035E8", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Tot_Dsg_Unts_2022"}, 
{"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Clms_2022", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "222537.77", "stddev": "1111526.68", "min": "12", "max": "15048710", "missing": "0"}, "updatedLabel": "Tot_Clms_2022"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Tot_Benes_2022", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "487", "mean": "73178.65", "stddev": "354659.0", "min": "11", "max": "5656896", "missing": "13"}, "updatedLabel": "Tot_Benes_2022"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Dsg_Unt_Wghtd_2022", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "191.26", "stddev": "1109.42", "min": "0.0033761421", "max": "10449.922858", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Dsg_Unt_Wght..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Avg_Spnd_Per_Clm_2022", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "4528.93", "stddev": "26181.41", "min": "2.9569831582", "max": "337266.85958", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Clm_2022"}, {"transformationsData": [{"feature_label": "Avg_Spnd_Per_Bene_2022", "threshold": 13774.69, "transformation_label": "Binarizer"}], "feature": "Avg_Spnd_Per_Bene_2022", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "475", "mean": "13775.18", "stddev": "46062.99", "min": "10.154306476", "max": "488333.77705", "missing": "13"}, "transformation": [{"transformation": "Binarizer", 
"selectedAsDefault": 1}], "updatedLabel": "Avg_Spnd_Per_Bene_2022"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Outlier_Flag_2022", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.1", "stddev": "0.29", "min": "0", "max": "1", "missing": "2"}, "updatedLabel": "Outlier_Flag_2022"}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Chg_Avg_Spnd_Per_Dsg_Unt_21_22", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "475", "mean": "0.14", "stddev": "1.9", "min": "-0.722505065", "max": "39.375895774", "missing": "25"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Chg_Avg_Spnd_Per_Dsg_Unt_..."}, {"transformationsData": [{"transformation_label": "novalue"}], "feature": "CAGR_Avg_Spnd_Per_Dsg_Unt_18_22", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "464", "mean": "0.0", "stddev": "0.15", "min": "-0.754733323", "max": "1.4809627173", "missing": "23"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "CAGR_Avg_Spnd_Per_Dsg_Unt..."}]}))

	#transformationPostExecutionHook(regressionmedicareautofe)

except Exception as ex: 
	# A failure is only logged; `regressionmedicareautofe` is then left
	# unassigned and execution continues.
	logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run regressionmedicareHooks.ipynb
# Run the AutoML search on the transformed DataFrame.
try:
	#mlPreExecutionHook()

	# Arguments: the transformed DataFrame, the list of feature columns, and
	# the label column. The label "Tot_Spndng_2018_binarizer" is deliberately
	# absent from the feature list. The *_stringindexer / *_binarizer names
	# refer to derived columns, so they must exist in
	# `regressionmedicareautofe` for the column selection to succeed.
	dataAutoML=functionClassification(regressionmedicareautofe, ["Tot_Mftr", "Tot_Clms_2018", "Tot_Benes_2018", "Avg_Spnd_Per_Dsg_Unt_Wghtd_2018", "Avg_Spnd_Per_Clm_2018", "Avg_Spnd_Per_Bene_2018", "Outlier_Flag_2018", "Tot_Clms_2019", "Tot_Benes_2019", "Avg_Spnd_Per_Dsg_Unt_Wghtd_2019", "Avg_Spnd_Per_Clm_2019", "Avg_Spnd_Per_Bene_2019", "Outlier_Flag_2019", "Tot_Clms_2020", "Tot_Benes_2020", "Avg_Spnd_Per_Dsg_Unt_Wghtd_2020", "Avg_Spnd_Per_Clm_2020", "Outlier_Flag_2020", "Tot_Clms_2021", "Tot_Benes_2021", "Avg_Spnd_Per_Dsg_Unt_Wghtd_2021", "Avg_Spnd_Per_Clm_2021", "Outlier_Flag_2021", "Tot_Clms_2022", "Tot_Benes_2022", "Avg_Spnd_Per_Dsg_Unt_Wghtd_2022", "Avg_Spnd_Per_Clm_2022", "Outlier_Flag_2022", "Chg_Avg_Spnd_Per_Dsg_Unt_21_22", "CAGR_Avg_Spnd_Per_Dsg_Unt_18_22", "Brnd_Name_stringindexer", "Gnrc_Name_stringindexer", "Mftr_Name_stringindexer", "Tot_Dsg_Unts_2018_binarizer", "Tot_Spndng_2019_binarizer", "Tot_Dsg_Unts_2019_binarizer", "Tot_Spndng_2020_binarizer", "Tot_Dsg_Unts_2020_binarizer", "Avg_Spnd_Per_Bene_2020_binarizer", "Tot_Spndng_2021_binarizer", "Tot_Dsg_Unts_2021_binarizer", "Avg_Spnd_Per_Bene_2021_binarizer", "Tot_Spndng_2022_binarizer", "Tot_Dsg_Unts_2022_binarizer", "Avg_Spnd_Per_Bene_2022_binarizer"], "Tot_Spndng_2018_binarizer")

	#mlPostExecutionHook(dataAutoML)

except Exception as ex: 
	# A failure is only logged; `dataAutoML` is then left unassigned and
	# execution continues.
	logging.error(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

# Score the trained model on the hold-out split returned by the training
# cell, show the weighted classification metrics, and preview the test rows
# next to their true and predicted labels.
try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']

    # Work on a copy: the inserts below used to mutate the list stored in
    # dataAutoML, so re-running this cell left "<label>_predicted" in the
    # feature names and the DataFrame construction failed with a column-count
    # mismatch.
    columnNames = list(dataAutoML['columnNames'])
    if label in columnNames:
        columnNames.remove(label)

    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    # At this point columnNames is expected to name the columns of X_test.
    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted

    # Keep the notebook-level `columnNames` in the same final state as before
    # (label first, then prediction, then features) for any later cell that
    # reads it.
    columnNames.insert(0, predicted)
    columnNames.insert(0, label)

    # All scores are percentages rounded to one decimal; F1, precision and
    # recall use the support-weighted average across classes.
    Accuracy = np.round(
        100 * sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_predicted), 1)
    F1 = np.round(
        100 * sklearn.metrics.f1_score(y_true=y_test, y_pred=y_predicted, average="weighted"), 1)
    Precision = np.round(
        100 * sklearn.metrics.precision_score(y_true=y_test, y_pred=y_predicted, average="weighted"), 1)
    Recall = np.round(
        100 * sklearn.metrics.recall_score(y_true=y_test, y_pred=y_predicted, average="weighted"), 1)

    display(" Accuracy of Prediction on test data    : %s"%Accuracy)
    display(" F1 score of Prediction on test data    : %s"%F1)
    display(" Precision of Prediction on test data   : %s"%Precision)
    display(" Recall of Prediction on test data      : %s"%Recall)
    display(df.head())
except Exception as ex:
    # Failures (including a missing dataAutoML from a failed training cell)
    # are logged rather than raised, matching the other cells.
    logging.error(ex)

# Stop the Spark session now that the pipeline has finished.
spark.stop()

