***GENERATED CODE FOR reg PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
# Suppress every Python warning emitted from here on (applies process-wide).
warnings.filterwarnings('ignore')
# Configure the root logger: records at INFO and above, printed as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read CSV data from HDFS into Spark and write DataFrames back out as CSV.

    The methods take no ``self`` and are invoked on the class itself
    (``HDFSConnector.fetch(spark, config)``), so they are declared static.
    """

    @staticmethod
    def _parse_config(config):
        """Return the connector settings as a dict.

        Args:
            config: Either a dict, or the text of a Python dict literal such as
                ``"{'url': '/path/file.csv', 'delimiter': ','}"``.

        Returns:
            The settings dict.

        Raises:
            ValueError, SyntaxError: If the text is not a plain literal.
        """
        if isinstance(config, dict):
            return config
        # literal_eval only accepts literals, so a config string can no longer
        # execute arbitrary code the way eval() allowed.
        import ast
        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Load the CSV named by ``config['url']`` from HDFS.

        The HDFS host and port are read from the ``HDFS_SERVER`` and
        ``HDFS_PORT`` environment variables. The first line is used as the
        header and column types are inferred. A two-row preview is shown with
        ``display`` (expected to be supplied by the notebook environment).

        Args:
            spark: Active SparkSession.
            config: Connector settings (see ``_parse_config``); must contain ``url``.

        Returns:
            The loaded Spark DataFrame.

        Raises:
            KeyError: If an environment variable or ``url`` is missing.
        """
        settings = HDFSConnector._parse_config(config)
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}")
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed path.

        Args:
            df: Spark DataFrame to write.
            spark: Unused; kept for interface compatibility.
            config: Connector settings with ``is_header``, ``delimiter`` and ``url``.

        Returns:
            Whatever ``DataFrameWriter.save`` returns.
        """
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        # NOTE(review): the target is "<timestamp>_ <url>" with a literal space
        # between prefix and url; kept unchanged so existing output locations
        # do not move - confirm whether the space is intended.
        target = ("%s %s") % (
            datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_", settings['url'])
        writer = df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
from pyspark.sql.functions import dayofmonth, month, year, col
from pyspark.ml.feature import StringIndexer
import json
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Replace null and blank (" ") cells in Spark DataFrame columns.

    Every ``replaceBy*`` method takes a schema field (an object exposing
    ``.name``) and a DataFrame, and returns a new DataFrame; the input
    DataFrame is never modified.
    """

    def cleanValueForFE(self, value):
        """Normalise an aggregate result so it can be used as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` when ``str(value)``
        is ``'nan'``, and ``value`` itself otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _columnStatistic(self, feature, df, aggregate, alias):
        """Apply ``aggregate`` to the feature column and return the cleaned result.

        Only rows where this column is null are excluded. Dropping rows with a
        null in any column would empty the frame as soon as one unrelated
        column is entirely null, turning every statistic into ``None``.
        """
        populated = df.dropna(subset=[feature.name])
        row = populated.select(
            aggregate(col(feature.name)).alias(alias)).collect()[0]
        return self.cleanValueForFE(row[alias])

    def _fillMissing(self, feature, df, fillValue):
        """Fill nulls in the feature column, then replace cells equal to " "."""
        name = feature.name
        df = df.fillna(fillValue, subset=[name])
        return df.withColumn(
            name, when(col(name) == " ", fillValue).otherwise(col(name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Replace null/blank cells of the feature column with the column mean.

        The replacement is assigned back (its result used to be discarded) and
        follows the same null-then-blank logic as the max/min/stddev variants.
        ``mean_`` is unused and kept for interface compatibility.
        """
        meanValue = self._columnStatistic(feature, df, mean, 'mean')
        return self._fillMissing(feature, df, meanValue)

    def replaceByMax(self, feature, df, max_=-1):
        """Replace null/blank cells of the feature column with the column maximum.

        ``max_`` is unused and kept for interface compatibility.
        """
        # `max` is the Spark aggregate imported at file level, not the builtin.
        maxValue = self._columnStatistic(feature, df, max, 'max')
        return self._fillMissing(feature, df, maxValue)

    def replaceByMin(self, feature, df, min_=-1):
        """Replace null/blank cells of the feature column with the column minimum.

        ``min_`` is unused and kept for interface compatibility.
        """
        # `min` is the Spark aggregate imported at file level, not the builtin.
        minValue = self._columnStatistic(feature, df, min, 'min')
        return self._fillMissing(feature, df, minValue)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Replace null/blank cells of the feature column with its standard deviation.

        ``stddev_`` is unused and kept for interface compatibility.
        """
        stddevValue = self._columnStatistic(feature, df, stddev, 'stddev')
        return self._fillMissing(feature, df, stddevValue)

    def replaceDateRandomly(self, feature, df):
        """Replace null/blank cells with the first non-null value of the column.

        Despite the name, no random choice is made: the value comes from the
        first row returned by ``head(1)`` after filtering out nulls. When the
        column holds no non-null value, ``df`` is returned unchanged.
        """
        name = feature.name
        firstRows = df.where(col(name).isNotNull()).head(1)
        if not firstRows:
            # Nothing to borrow a value from; indexing [0] would raise IndexError.
            return df
        fillValue = self.cleanValueForFE(firstRows[0][name])
        # Nulls are filled with the string form; blanks get the raw value.
        df = df.fillna(str(fillValue), subset=[name])
        df = df.withColumn(
            name, when(col(name) == " ", fillValue).otherwise(col(name)))
        return df

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to each listed feature.

        Args:
            fList: Iterable of dicts with a ``"feature"`` column name and a
                ``"replaceby"`` strategy containing one of ``mean``, ``max``,
                ``min``, ``stddev`` or ``random``.
            df: Spark DataFrame to clean.

        Returns:
            The cleaned DataFrame. Features absent from the schema, or with an
            unrecognised strategy, are left untouched.
        """
        # Exact name lookup: substring matching applied the rule for "AREA"
        # to "AREA NAME", "Crm Cd" to "Crm Cd Desc", and so on.
        fieldsByName = {field.name: field for field in df.schema.fields}
        # Checked in this order; the first keyword found in "replaceby" wins.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        for featureObj in fList:
            field = fieldsByName.get(featureObj["feature"])
            if field is None:
                continue
            for keyword, handler in strategies:
                if keyword in featureObj["replaceby"]:
                    df = handler(field, df)
                    break
        return df


def ExtractDateTransform(df, params, transformationData=None):
    """Add day-of-month, month and year columns derived from a date feature.

    The new columns are named ``<feature>_dayofmonth``, ``<feature>_month``
    and ``<feature>_year``. The underscore matches the ``_stringindexer``
    suffix used by ``StringIndexerTransform`` and the column names the
    training step requests (e.g. ``"Date Rptd_dayofmonth"``).

    Args:
        df: Input Spark DataFrame.
        params: Mapping whose ``'feature'`` entry names the source column.
        transformationData: Unused; accepted for interface compatibility.

    Returns:
        A new DataFrame with the three extra columns; the source column is kept.
    """
    feature = params['feature']
    # Applies only when the column is string-typed; fillna skips columns
    # whose type does not match the replacement value.
    dfReturn = df.fillna({feature: ''})
    source = col(feature)
    dfReturn = dfReturn.withColumn(feature + '_dayofmonth', dayofmonth(source))
    dfReturn = dfReturn.withColumn(feature + '_month', month(source))
    dfReturn = dfReturn.withColumn(feature + '_year', year(source))
    return dfReturn


def StringIndexerTransform(df, params, transformationData=None):
    """Encode a categorical column as numeric indices with ``StringIndexer``.

    The indices are written to ``<feature>_stringindexer``. When the indexed
    column has at most four distinct values it is cast to ``IntegerType``;
    otherwise it keeps the type produced by the indexer.

    Args:
        df: Input Spark DataFrame.
        params: Mapping whose ``"feature"`` entry names the column to index.
        transformationData: Unused; accepted for interface compatibility.

    Returns:
        A new DataFrame with the index column added; the source column is kept.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls become empty strings so they are indexed as a category of their own.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count on the cluster rather than collecting every distinct value to the
    # driver through an RDD just to take its length.
    distinct_count = indexed.select(outcol).distinct().count()
    if distinct_count <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    """Entry point that runs the generated feature-engineering steps in order."""

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Clean the data, apply every generated transformation, and preview the result.

        Steps:
          1. ``CleanseData.replaceNullValues`` using the ``"FE"`` list from ``config``.
          2. For each entry in the step table below: apply the transform, then
             drop the source column so only the derived columns remain.
          3. Show a two-row preview with ``display`` (expected to be supplied
             by the notebook environment).

        Args:
            transformationDF: Input Spark DataFrame.
            config: JSON text with an ``"FE"`` key holding the per-feature settings.

        Returns:
            The transformed Spark DataFrame.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)

        # (transform, params) pairs, applied in this order. The transforms in
        # this file read only params['feature']; the rest is generator metadata.
        steps = [
            (ExtractDateTransform,
             {'transformationsData': [{'feature_label': 'Date Rptd', 'transformation_label': 'Extract Date'}],
              'feature': 'Date Rptd', 'type': 'date', 'selected': 'True', 'replaceby': 'random',
              'stats': {'count': '', 'mean': '', 'stddev': '', 'min': '', 'max': '', 'missing': '0'},
              'transformation': [{'transformation': 'Extract Date', 'selectedAsDefault': 1}],
              'generated': 'False', 'updatedLabel': 'Date Rptd'}),
            (ExtractDateTransform,
             {'transformationsData': [{'feature_label': 'DATE OCC', 'transformation_label': 'Extract Date'}],
              'feature': 'DATE OCC', 'type': 'date', 'selected': 'True', 'replaceby': 'random',
              'stats': {'count': '', 'mean': '', 'stddev': '', 'min': '', 'max': '', 'missing': '0'},
              'transformation': [{'transformation': 'Extract Date', 'selectedAsDefault': 1}],
              'generated': 'False', 'updatedLabel': 'DATE OCC'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'AREA NAME', 'transformation_label': 'String Indexer'}],
              'feature': 'AREA NAME', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': '77th Street', 'max': 'Wilshire', 'missing': '0', 'distinct': '21'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'AREA NAME'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Crm Cd Desc', 'transformation_label': 'String Indexer'}],
              'feature': 'Crm Cd Desc', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'ARSON', 'max': 'VIOLATION OF RESTRAINING ORDER', 'missing': '0', 'distinct': '53'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Crm Cd Desc'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Mocodes', 'transformation_label': 'String Indexer'}],
              'feature': 'Mocodes', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '452', 'mean': '881.67', 'stddev': '673.45', 'min': '0100 0344 0922 0923 0924 0925 0929 0930 1822', 'max': '935', 'missing': '48', 'distinct': '352'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Mocodes'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Vict Sex', 'transformation_label': 'String Indexer'}],
              'feature': 'Vict Sex', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '466', 'mean': '', 'stddev': '', 'min': 'F', 'max': 'X', 'missing': '34', 'distinct': '3'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Vict Sex'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Vict Descent', 'transformation_label': 'String Indexer'}],
              'feature': 'Vict Descent', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '466', 'mean': '', 'stddev': '', 'min': 'A', 'max': 'X', 'missing': '34', 'distinct': '13'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Vict Descent'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Premis Desc', 'transformation_label': 'String Indexer'}],
              'feature': 'Premis Desc', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'ABANDONED BUILDING ABANDONED HOUSE', 'max': 'WEBSITE', 'missing': '0', 'distinct': '60'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Premis Desc'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Weapon Desc', 'transformation_label': 'String Indexer'}],
              'feature': 'Weapon Desc', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '173', 'mean': '', 'stddev': '', 'min': 'AIR PISTOL/REVOLVER/RIFLE/BB GUN', 'max': 'VERBAL THREAT', 'missing': '327', 'distinct': '25'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Weapon Desc'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Status', 'transformation_label': 'String Indexer'}],
              'feature': 'Status', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'AA', 'max': 'JO', 'missing': '0', 'distinct': '5'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Status'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Status Desc', 'transformation_label': 'String Indexer'}],
              'feature': 'Status Desc', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': 'Adult Arrest', 'max': 'Juv Other', 'missing': '0', 'distinct': '5'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Status Desc'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'LOCATION', 'transformation_label': 'String Indexer'}],
              'feature': 'LOCATION', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '500', 'mean': '', 'stddev': '', 'min': '100 E  3RD                          ST', 'max': 'YUCCA                        ST', 'missing': '0', 'distinct': '479'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'LOCATION'}),
            (StringIndexerTransform,
             {'transformationsData': [{'feature_label': 'Cross Street', 'transformation_label': 'String Indexer'}],
              'feature': 'Cross Street', 'type': 'string', 'selected': 'True', 'replaceby': 'max',
              'stats': {'count': '79', 'mean': '', 'stddev': '', 'min': '103RD                        ST', 'max': 'ZELZAH', 'missing': '468', 'distinct': '74'},
              'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
              'updatedLabel': 'Cross Street'}),
        ]

        for transform, params in steps:
            # The third argument carries the same labels as the first
            # 'transformationsData' entry of the step.
            transformationDF = transform(
                transformationDF, params, params['transformationsData'][0])
            transformationDF = transformationDF.drop(params['feature'])

        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
import pyspark


def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on the given features and return it with its hold-out split.

    The Spark DataFrame is converted to pandas, split 90/10 into train/test
    (``random_state=1``) and passed to ``TPOTRegressor.fit``. The model's
    score on the test split is shown with ``display`` (expected to be
    supplied by the notebook environment).

    Args:
        sparkDF: Spark DataFrame containing the feature columns and ``label``.
        listOfFeatures: Names of the columns used as predictors. An entry
            equal to ``label`` is ignored when building the feature matrix.
        label: Name of the target column.

    Returns:
        dict with keys ``'model'``, ``'X_test'``, ``'y_test'``, ``'label'``
        and ``'columnNames'`` (the ``listOfFeatures`` object as passed in).

    Raises:
        KeyError: If a feature column or ``label`` is missing from the data.
    """
    # Never feed the target in as a predictor; selecting it after the label
    # column had been dropped used to raise KeyError.
    feature_columns = [name for name in listOfFeatures if name != label]

    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    try:
        df = sparkDF.toPandas()
    finally:
        # The pandas copy is all that is used from here on; release the cache
        # instead of holding it for the whole training run.
        sparkDF.unpersist()

    X = df[feature_columns].values
    y = df[label].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=0.1)
    tpotModel = TPOTRegressor(verbosity=3, generations=10, max_time_mins=5,
                              n_jobs=-1, random_state=25, population_size=15, use_dask=True)
    tpotModel.fit(X_train, y_train)
    # NOTE(review): the value shown is whatever tpotModel.score returns; the
    # "Error rate" wording is kept as-is - confirm it matches the scoring used.
    display(" Error rate of Model : %s" % tpotModel.score(X_test, y_test))
    data = {'model': tpotModel,
            'X_test': X_test,
            'y_test': y_test,
            'label': label,
            'columnNames': listOfFeatures}
    return data


***READING DATAFRAME***

In [None]:
# Create the Spark session. Replace 'local[1]' with your Spark master
# address and port to run against a cluster.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[1]').getOrCreate()
# Optional hooks live in regHooks.ipynb (%run regHooks.ipynb):
# sourcePreExecutionHook() before the read and
# sourcePostExecutionHook(crimedatafromtopresent) after it.
try:
    # Load the crime CSV from HDFS; the settings are passed as the text of a
    # Python dict literal.
    crimedatafromtopresent = HDFSConnector.fetch(
        spark,
        "{'url': '/FileStore/platform/uploadedSourceFiles/Crime_Data_from_2020_to_Present.csv', "
        "'filename': 'Crime_Data_from_2020_to_Present.csv', "
        "'delimiter': ',', "
        "'file_type': 'Delimeted', "
        "'is_header': 'Use Header Line', "
        "'domain': 'http://172.31.59.158', "
        "'port': '40070', "
        "'dirPath': '/FileStore/platform', "
        "'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}",
    )
except Exception as ex:
    # Failures are logged, not raised; later cells then find the variable undefined.
    logging.error(ex)
# spark.stop() is left to the final cell.


***TRANSFORMING DATAFRAME***

In [None]:
#%run regHooks.ipynb
try:
    #transformationPreExecutionHook()

    # Per-feature settings, one entry per source column. The string literals
    # "Status Desc" and "Cross Street" were previously split across physical
    # lines, which is a syntax error; every literal now sits on one line.
    regautofe = TransformationMain.run(crimedatafromtopresent, json.dumps({"FE": [
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "DR_NO", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "223522546.88", "stddev": "7626606.93", "min": "190326475", "max": "241904532", "missing": "0"}, "updatedLabel": "DR_NO"},
        {"transformationsData": [{"feature_label": "Date Rptd", "transformation_label": "Extract Date"}], "feature": "Date Rptd", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "Date Rptd"},
        {"transformationsData": [{"feature_label": "DATE OCC", "transformation_label": "Extract Date"}], "feature": "DATE OCC", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "DATE OCC"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "TIME OCC", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1316.66", "stddev": "672.21", "min": "1", "max": "2355", "missing": "0"}, "updatedLabel": "TIME OCC"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "AREA", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "10.8", "stddev": "5.88", "min": "1", "max": "21", "missing": "0"}, "updatedLabel": "AREA"},
        {"transformationsData": [{"feature_label": "AREA NAME", "transformation_label": "String Indexer"}], "feature": "AREA NAME", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "77th Street", "max": "Wilshire", "missing": "0", "distinct": "21"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "AREA NAME"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Rpt Dist No", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1158.48", "stddev": "583.29", "min": "111", "max": "2198", "missing": "0"}, "updatedLabel": "Rpt Dist No"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Part 1-2", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "1.42", "stddev": "0.49", "min": "1", "max": "2", "missing": "0"}, "updatedLabel": "Part 1-2"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Crm Cd", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "523.95", "stddev": "212.26", "min": "110", "max": "956", "missing": "0"}, "updatedLabel": "Crm Cd"},
        {"transformationsData": [{"feature_label": "Crm Cd Desc", "transformation_label": "String Indexer"}], "feature": "Crm Cd Desc", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "ARSON", "max": "VIOLATION OF RESTRAINING ORDER", "missing": "0", "distinct": "53"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Crm Cd Desc"},
        {"transformationsData": [{"feature_label": "Mocodes", "transformation_label": "String Indexer"}], "feature": "Mocodes", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "452", "mean": "881.67", "stddev": "673.45", "min": "0100 0344 0922 0923 0924 0925 0929 0930 1822", "max": "935", "missing": "48", "distinct": "352"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Mocodes"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Vict Age", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "28.54", "stddev": "20.84", "min": "0", "max": "91", "missing": "0"}, "updatedLabel": "Vict Age"},
        {"transformationsData": [{"feature_label": "Vict Sex", "transformation_label": "String Indexer"}], "feature": "Vict Sex", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "466", "mean": "", "stddev": "", "min": "F", "max": "X", "missing": "34", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Vict Sex"},
        {"transformationsData": [{"feature_label": "Vict Descent", "transformation_label": "String Indexer"}], "feature": "Vict Descent", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "466", "mean": "", "stddev": "", "min": "A", "max": "X", "missing": "34", "distinct": "13"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Vict Descent"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Premis Cd", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "443.11", "stddev": "182.26", "min": "101", "max": "801", "missing": "0"}, "updatedLabel": "Premis Cd"},
        {"transformationsData": [{"feature_label": "Premis Desc", "transformation_label": "String Indexer"}], "feature": "Premis Desc", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "ABANDONED BUILDING ABANDONED HOUSE", "max": "WEBSITE", "missing": "0", "distinct": "60"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Premis Desc"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Weapon Used Cd", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "173", "mean": "369.25", "stddev": "117.3", "min": "102", "max": "515", "missing": "327"}, "updatedLabel": "Weapon Used Cd"},
        {"transformationsData": [{"feature_label": "Weapon Desc", "transformation_label": "String Indexer"}], "feature": "Weapon Desc", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "173", "mean": "", "stddev": "", "min": "AIR PISTOL/REVOLVER/RIFLE/BB GUN", "max": "VERBAL THREAT", "missing": "327", "distinct": "25"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Weapon Desc"},
        {"transformationsData": [{"feature_label": "Status", "transformation_label": "String Indexer"}], "feature": "Status", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AA", "max": "JO", "missing": "0", "distinct": "5"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Status"},
        {"transformationsData": [{"feature_label": "Status Desc", "transformation_label": "String Indexer"}], "feature": "Status Desc", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Adult Arrest", "max": "Juv Other", "missing": "0", "distinct": "5"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Status Desc"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Crm Cd 1", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "483.98", "stddev": "215.23", "min": "121", "max": "956", "missing": "0"}, "updatedLabel": "Crm Cd 1"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Crm Cd 2", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "33", "mean": "862.73", "stddev": "61.46", "min": "812", "max": "998", "missing": "467"}, "updatedLabel": "Crm Cd 2"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Crm Cd 3", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "0", "mean": "", "stddev": "", "min": "", "max": "", "missing": "500"}, "updatedLabel": "Crm Cd 3"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Crm Cd 4", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "0", "mean": "", "stddev": "", "min": "", "max": "", "missing": "500"}, "updatedLabel": "Crm Cd 4"},
        {"transformationsData": [{"feature_label": "LOCATION", "transformation_label": "String Indexer"}], "feature": "LOCATION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "100 E  3RD                          ST", "max": "YUCCA                        ST", "missing": "0", "distinct": "479"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "LOCATION"},
        {"transformationsData": [{"feature_label": "Cross Street", "transformation_label": "String Indexer"}], "feature": "Cross Street", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "79", "mean": "", "stddev": "", "min": "103RD                        ST", "max": "ZELZAH", "missing": "468", "distinct": "74"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Cross Street"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "LAT", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "34.08", "stddev": "0.11", "min": "33.7234", "max": "34.321", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "LAT"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "LON", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "-117.65", "stddev": "9.15", "min": "-118.6442", "max": "0.0", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "LON"},
    ]}))

    #transformationPostExecutionHook(regautofe)

except Exception as ex:
    # Failures are logged, not raised; later cells then find regautofe undefined.
    logging.error(ex)


***TRAIN MODEL***

In [None]:
# Optional hooks live in regHooks.ipynb (%run regHooks.ipynb):
# mlPreExecutionHook() before training and mlPostExecutionHook(dataAutoML) after it.
try:
    # Train on the transformed frame: predictors first, then the target column.
    dataAutoML = functionRegression(
        sparkDF=regautofe,
        listOfFeatures=[
            "DR_NO",
            "TIME OCC",
            "AREA",
            "Rpt Dist No",
            "Part 1-2",
            "Crm Cd",
            "Vict Age",
            "Premis Cd",
            "Weapon Used Cd",
            "Crm Cd 1",
            "Crm Cd 2",
            "Crm Cd 3",
            "Crm Cd 4",
            "LAT",
            "LON",
            "Date Rptd_dayofmonth",
            "Date Rptd_month",
            "Date Rptd_year",
            "DATE OCC_dayofmonth",
            "DATE OCC_month",
            "DATE OCC_year",
            "Crm Cd Desc_stringindexer",
            "Mocodes_stringindexer",
            "Vict Sex_stringindexer",
            "Vict Descent_stringindexer",
            "Premis Desc_stringindexer",
            "Weapon Desc_stringindexer",
            "Status_stringindexer",
            "Status Desc_stringindexer",
            "LOCATION_stringindexer",
            "Cross Street_stringindexer",
        ],
        label="AREA NAME_stringindexer",
    )
except Exception as ex:
    # Failures are logged, not raised.
    logging.error(ex)
# spark.stop() is left to the final cell.


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

# Score the trained model on the hold-out split and show predictions next to
# the true target values.
try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']
    # Build a fresh list rather than editing dataAutoML['columnNames'] in
    # place: remove()/insert() on the shared list changed the training
    # metadata and made a second run of this cell fail on a column-count
    # mismatch.
    columnNames = [name for name in dataAutoML['columnNames'] if name != label]
    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)
    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted
    # Target and prediction first, then the predictors.
    df = df[[label, predicted] + columnNames]
    # Metrics are rounded to one decimal place for display.
    R2 = np.round(sklearn.metrics.r2_score(y_test, y_predicted), 1)
    Mean_Squared_Error = np.round(sklearn.metrics.mean_squared_error(y_test, y_predicted), 1)
    Mean_Absolute_Error = np.round(sklearn.metrics.mean_absolute_error(y_test, y_predicted), 1)
    display(" R2 score of Prediction on test data    : %s" % R2)
    display(" Mean Squared Error of Prediction on test data    : %s" % Mean_Squared_Error)
    display(" Mean Absolute Error of Prediction on test data   : %s" % Mean_Absolute_Error)
    display(df.head())
except Exception as ex:
    # Failures are logged, not raised.
    logging.error(ex)

# Shut the Spark session down whether or not the prediction step succeeded.
spark.stop()

