***GENERATED CODE FOR test PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Configure the root logger: INFO and above, rendered as "LEVEL:message".
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read CSV data from HDFS and write CSV data out through Spark.

    The methods are called on the class itself
    (``HDFSConnector.fetch(spark, config)``) and use no instance state, so
    they are declared as static methods.
    """

    @staticmethod
    def _parse_config(config):
        """Return the connector settings as a dict.

        ``config`` is normally a string holding a Python dict literal.  It is
        parsed with ``ast.literal_eval`` instead of ``eval`` so that only
        literals are accepted and no expression embedded in the string can be
        executed.  A value that is already a dict is returned unchanged.

        Raises:
            ValueError / SyntaxError: if the string is not a valid literal.
        """
        import ast  # local import keeps this block self-contained

        if isinstance(config, dict):
            return config
        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Load the CSV file named by ``config['url']`` from HDFS.

        Args:
            spark: object exposing ``read`` (used as a Spark session).
            config: dict literal (string) with at least a ``'url'`` key that
                is appended to ``hdfs://<host>:<port>``.

        Returns:
            The DataFrame returned by ``spark.read...csv(...)``.

        Raises:
            KeyError: if ``HDFS_SERVER`` or ``HDFS_PORT`` is not set in the
                environment, or ``'url'`` is missing from the config.
            ValueError: if ``HDFS_PORT`` is not an integer.
        """
        # Host and port of the Hadoop name node come from the environment.
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        # NOTE(review): `display` is not defined in this file; presumably the
        # notebook built-in -- confirm before running outside a notebook.
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed path.

        Args:
            df: DataFrame to write.
            spark: unused; kept for signature compatibility.
            config: dict literal (string) with ``'is_header'``,
                ``'delimiter'`` and ``'url'`` keys.

        Returns:
            Whatever ``DataFrameWriter.save`` returns.
        """
        # Parse once instead of re-evaluating the string for every key.
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
        # The "<timestamp>_ <url>" layout (including the space) is kept as-is
        # so existing output locations do not change.
        target = ("%s %s") % (stamp + "_", settings['url'])
        writer = df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
from pyspark.sql.functions import dayofmonth, month, year, col
from pyspark.ml.feature import StringIndexer
import json
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Replace missing values in DataFrame columns.

    Each ``replaceBy*`` method takes a schema field (an object with a
    ``name`` attribute) and a DataFrame, fills the nulls of that column with
    a value derived from the column, replaces cells equal to a single space
    with the same value, and returns the new DataFrame.
    """

    def cleanValueForFE(self, value):
        """Make an aggregate result usable as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` when
        ``str(value) == 'nan'``, and ``value`` itself otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _replaceByAggregate(self, feature, df, aggregate, alias):
        """Fill ``feature`` using ``aggregate`` computed over its known values.

        Args:
            feature: schema field whose ``name`` selects the column.
            df: DataFrame to clean.
            aggregate: column aggregate function (mean, max, min, stddev).
            alias: name given to the aggregate column in the one-row result.

        Returns:
            A new DataFrame; ``df`` itself is not modified.
        """
        name = feature.name
        # Only rows where THIS column is null are excluded.  Dropping rows
        # that are null in any column lets sparse, unrelated columns shrink
        # (or empty) the sample the statistic is computed from.
        known = df.dropna(subset=[name])
        stat = self.cleanValueForFE(
            known.select(aggregate(col(name)).alias(alias)).collect()[0][alias])
        df = df.fillna(stat, subset=[name])
        # DataFrames are immutable: the result of withColumn must be kept.
        return df.withColumn(
            name, when(col(name) == " ", stat).otherwise(col(name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill nulls / single-space cells of ``feature`` with the column mean.

        ``mean_`` is unused and kept for signature compatibility.  The column
        keeps its own type; it is not cast to Integer, so real-valued columns
        are not truncated.
        """
        return self._replaceByAggregate(feature, df, mean, 'mean')

    def replaceByMax(self, feature, df, max_=-1):
        """Fill nulls / single-space cells of ``feature`` with the column max.

        ``max_`` is unused and kept for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, max, 'max')

    def replaceByMin(self, feature, df, min_=-1):
        """Fill nulls / single-space cells of ``feature`` with the column min.

        ``min_`` is unused and kept for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, min, 'min')

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill nulls / single-space cells of ``feature`` with its stddev.

        ``stddev_`` is unused and kept for signature compatibility.
        """
        return self._replaceByAggregate(feature, df, stddev, 'stddev')

    def replaceDateRandomly(self, feature, df):
        """Fill nulls / single-space cells with an existing value of the column.

        Despite the name, the value is the first non-null row returned by
        ``head(1)``, not a random draw.  When the column has no non-null
        value there is nothing to fill with and ``df`` is returned unchanged.
        """
        name = feature.name
        firstKnown = df.where(col(name).isNotNull()).head(1)
        if not firstKnown:
            return df
        fillValue = self.cleanValueForFE(firstKnown[0][name])
        df = df.fillna(str(fillValue), subset=[name])
        return df.withColumn(
            name, when(col(name) == " ", fillValue).otherwise(col(name)))

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to each listed feature.

        Args:
            fList: iterable of dicts with a ``"feature"`` (column name) and a
                ``"replaceby"`` entry containing one of ``mean``, ``max``,
                ``min``, ``stddev`` or ``random``.
            df: DataFrame to clean.

        Returns:
            The DataFrame after all matching replacements.
        """
        # Checked in this order; the first keyword found in "replaceby" wins.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        schemaFields = df.schema.fields
        for featureObj in fList:
            for field in schemaFields:
                # Exact match: a substring test would also apply the rule for
                # "CRASH DATE" to "CRASH DATE - Copy".
                if field.name != featureObj["feature"]:
                    continue
                replaceBy = featureObj["replaceby"]
                for keyword, handler in strategies:
                    if keyword in replaceBy:
                        df = handler(field, df)
                        break
        return df


def ExtractDateTransform(df, params, transformationData=None):
    """Add day-of-month, month and year columns derived from a date column.

    For ``feature = params['feature']`` the columns
    ``<feature>_dayofmonth``, ``<feature>_month`` and ``<feature>_year`` are
    appended.  The underscore separator matches the ``_stringindexer``
    suffix used by the String Indexer transform and the column names the
    training step selects (``"CRASH DATE_dayofmonth"`` and so on).

    Args:
        df: input DataFrame.
        params: dict; only ``params['feature']`` is read.
        transformationData: unused; kept for signature compatibility.

    Returns:
        A new DataFrame with the three extra columns.
    """
    feature = params['feature']
    # Nulls in the source column are replaced with '' before extraction.
    dfReturn = df.fillna({feature: ''})
    extractors = (
        ('_dayofmonth', dayofmonth),
        ('_month', month),
        ('_year', year),
    )
    for suffix, extract in extractors:
        dfReturn = dfReturn.withColumn(feature + suffix, extract(col(feature)))
    return dfReturn


def StringIndexerTransform(df, params, transformationData=None):
    """Index a string column into ``<feature>_stringindexer``.

    Nulls in the source column are first replaced with ``''``; the indexer
    is fitted and applied with ``handleInvalid="skip"``.  When the indexed
    column has at most four distinct values it is cast to ``IntegerType``.

    Args:
        df: input DataFrame.
        params: dict; only ``params["feature"]`` is read.
        transformationData: unused; kept for signature compatibility.

    Returns:
        A new DataFrame with the indexed column appended.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count on the cluster instead of collecting every distinct value to the
    # driver just to take the length of the list.
    distinctCount = indexed.select(outcol).distinct().count()
    if distinctCount <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    """Run the generated feature-engineering steps for this pipeline.

    ``run`` is called on the class itself
    (``TransformationMain.run(df, config)``) and uses no instance state.
    """

    # One row per String Indexer column, in execution order:
    # (feature, count, min, max, missing, distinct, updatedLabel)
    _STRING_INDEXER_COLUMNS = (
        ('CRASH TIME', '500', '10:00:00 AM', '9:58:00 PM', '0', '335', 'CRASH TIME'),
        ('BOROUGH', '500', 'BRONX', 'STATEN ISLAND', '0', '6', 'BOROUGH'),
        ('LOCATION', '451', '(40.532425, -74.192024)', '(40.901836, -73.84548)', '37', '457', 'LOCATION'),
        ('ON STREET NAME', '370', '1 AVENUE', 'skyline dr', '161', '259', 'ON STREET NAME'),
        ('CROSS STREET NAME', '227', '1 AVENUE', 'WINTHROP STREET', '273', '195', 'CROSS STREET NAME'),
        ('OFF STREET NAME', '130', '100       COLUMBIA STREET', 'MYRTLE AVENUE', '339', '161', 'OFF STREET NAME'),
        ('CONTRIBUTING FACTOR VEHICLE 1', '500', 'Aggressive Driving/Road Rage', 'View Obstructed/Limited', '0', '33', 'CONTRIBUTING FACTOR VEHIC...'),
        ('CONTRIBUTING FACTOR VEHICLE 2', '358', 'Driver Inattention/Distraction', 'View Obstructed/Limited', '142', '17', 'CONTRIBUTING FACTOR VEHIC...'),
        ('CONTRIBUTING FACTOR VEHICLE 3', '67', 'Driver Inattention/Distraction', 'Unspecified', '454', '3', 'CONTRIBUTING FACTOR VEHIC...'),
        ('CONTRIBUTING FACTOR VEHICLE 4', '14', 'Other Vehicular', 'Unspecified', '486', '2', 'CONTRIBUTING FACTOR VEHIC...'),
        ('CONTRIBUTING FACTOR VEHICLE 5', '4', 'Unspecified', 'Unspecified', '496', '2', 'CONTRIBUTING FACTOR VEHIC...'),
        ('VEHICLE TYPE CODE 1', '500', '3-Door', 'Van', '0', '27', 'VEHICLE TYPE CODE 1'),
        ('VEHICLE TYPE CODE 2', '305', 'Bike', 'Van', '195', '11', 'VEHICLE TYPE CODE 2'),
        ('VEHICLE TYPE CODE 3', '42', 'Dollar Van', 'Station Wagon/Sport Utility Vehicle', '458', '4', 'VEHICLE TYPE CODE 3'),
        ('VEHICLE TYPE CODE 4', '14', 'Motorcycle', 'Taxi', '486', '5', 'VEHICLE TYPE CODE 4'),
        ('VEHICLE TYPE CODE 5', '4', 'Pick-up Truck', 'Sedan', '497', '2', 'VEHICLE TYPE CODE 5'),
    )

    @staticmethod
    def _stringIndexerParams(feature, count, minimum, maximum, missing, distinct, updatedLabel):
        """Build the parameter dict passed to StringIndexerTransform for one column."""
        return {
            'transformationsData': [{'feature_label': feature, 'transformation_label': 'String Indexer'}],
            'feature': feature,
            'type': 'string',
            'selected': 'True',
            'replaceby': 'max',
            'stats': {'count': count, 'mean': '', 'stddev': '', 'min': minimum,
                      'max': maximum, 'missing': missing, 'distinct': distinct},
            'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}],
            'updatedLabel': updatedLabel,
        }

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Clean nulls, then apply the date and string-indexer transforms.

        Steps, in order:
          1. Replace missing values as described by ``config["FE"]``.
          2. Extract date parts from ``CRASH DATE`` and drop the source column.
          3. For every row of ``_STRING_INDEXER_COLUMNS``: index the column
             and drop the source column.

        Args:
            transformationDF: input DataFrame.
            config: JSON string with an ``"FE"`` list of feature settings.

        Returns:
            The transformed DataFrame.

        Raises:
            KeyError: if the decoded config has no ``"FE"`` entry.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)

        transformationDF = ExtractDateTransform(
            transformationDF,
            {
                'transformationsData': [{'feature_label': 'CRASH DATE', 'transformation_label': 'Extract Date'}],
                'feature': 'CRASH DATE',
                'type': 'date',
                'selected': 'True',
                'replaceby': 'random',
                'stats': {'count': '', 'mean': '', 'stddev': '', 'min': '', 'max': '', 'missing': '0'},
                'transformation': [{'transformation': 'Extract Date', 'selectedAsDefault': 1}],
                'generated': 'False',
                'updatedLabel': 'CRASH DATE',
            },
            {'feature_label': 'CRASH DATE', 'transformation_label': 'Extract Date'})
        transformationDF = transformationDF.drop('CRASH DATE')

        for row in TransformationMain._STRING_INDEXER_COLUMNS:
            feature = row[0]
            transformationDF = StringIndexerTransform(
                transformationDF,
                TransformationMain._stringIndexerParams(*row),
                {'feature_label': feature, 'transformation_label': 'String Indexer'})
            # The raw string column is no longer needed once it is indexed.
            transformationDF = transformationDF.drop(feature)

        # NOTE(review): `display` is not defined in this file; presumably the
        # notebook built-in -- confirm before running outside a notebook.
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pyspark


def functionClassification(sparkDF, listOfFeatures, label):
    """Fit a TPOT classifier on the given features and report its accuracy.

    The Spark DataFrame is persisted (memory and disk), converted to pandas,
    split 90/10 with ``random_state=1``, and a ``TPOTClassifier`` is fitted
    on the training part.  The score on the held-out part is shown with
    ``display``.

    Args:
        sparkDF: Spark DataFrame containing the feature and label columns.
        listOfFeatures: column names used as model inputs.
        label: name of the target column.

    Returns:
        dict with keys ``model``, ``X_test``, ``y_test``, ``label`` and
        ``columnNames`` (the ``listOfFeatures`` object itself, not a copy).
    """
    # NOTE(review): the DataFrame is never unpersisted in this function.
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandasFrame = sparkDF.toPandas()

    withoutLabel = pandasFrame.drop(label, axis=1)
    featureMatrix = withoutLabel[listOfFeatures].values
    targetVector = pandasFrame[label].values

    X_train, X_test, y_train, y_test = train_test_split(
        featureMatrix, targetVector, random_state=1, test_size=0.1)

    tpotModel = TPOTClassifier(
        verbosity=3,
        n_jobs=-1,
        generations=10,
        max_time_mins=5,
        population_size=15,
        use_dask=True,
    )
    tpotModel.fit(X_train, y_train)

    heldOutScore = tpotModel.score(X_test, y_test)
    display(" Accuracy of Model : %s" % heldOutScore)

    return dict(
        model=tpotModel,
        X_test=X_test,
        y_test=y_test,
        label=label,
        columnNames=listOfFeatures,
    )


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession

# Local Spark session with a single worker thread.
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run testHooks.ipynb
try:
    # sourcePreExecutionHook()

    # Source settings are passed as a dict literal in a string; adjacent
    # string literals are concatenated into that single config string.
    motorvehiclecollisionsnyc = HDFSConnector.fetch(
        spark,
        "{'url': '/FileStore/platform/uploadedSourceFiles/Motor Vehicle Collisions  NYC.csv', "
        "'filename': 'Motor Vehicle Collisions  NYC.csv', "
        "'delimiter': ',', "
        "'file_type': 'Delimeted', "
        "'is_header': 'Use Header Line', "
        "'domain': 'http://172.31.59.158', "
        "'port': '40070', "
        "'dirPath': '/FileStore/platform', "
        "'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}",
    )
    # sourcePostExecutionHook(motorvehiclecollisionsnyc)

except Exception as ex:
    # Failures are logged, not raised; later cells then run without the frame.
    logging.error(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run testHooks.ipynb
try:
    # transformationPreExecutionHook()

    # Feature-engineering settings, one entry per source column.  The list is
    # serialised to JSON because TransformationMain.run decodes its config
    # with json.loads.  Each entry is kept on its own line so that no string
    # literal is split across physical lines.
    testautofe = TransformationMain.run(motorvehiclecollisionsnyc, json.dumps({"FE": [
        {"transformationsData": [{"feature_label": "CRASH DATE", "transformation_label": "Extract Date"}], "feature": "CRASH DATE", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "CRASH DATE"},
        {"transformationsData": [{"feature_label": "CRASH TIME", "transformation_label": "String Indexer"}], "feature": "CRASH TIME", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "10:00:00 AM", "max": "9:58:00 PM", "missing": "0", "distinct": "335"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CRASH TIME"},
        {"transformationsData": [{"feature_label": "BOROUGH", "transformation_label": "String Indexer"}], "feature": "BOROUGH", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "BRONX", "max": "STATEN ISLAND", "missing": "0", "distinct": "6"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BOROUGH"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "ZIP CODE", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "347", "mean": "10873.6", "stddev": "531.07", "min": "10001", "max": "11694", "missing": "189"}, "updatedLabel": "ZIP CODE"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "LATITUDE", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "463", "mean": "40.72", "stddev": "0.08", "min": "40.52238", "max": "40.9076", "missing": "37"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "LATITUDE"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "LONGITUDE", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "463", "mean": "-73.92", "stddev": "0.08", "min": "-74.216", "max": "-73.72108", "missing": "37"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "LONGITUDE"},
        {"transformationsData": [{"feature_label": "LOCATION", "transformation_label": "String Indexer"}], "feature": "LOCATION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "451", "mean": "", "stddev": "", "min": "(40.532425, -74.192024)", "max": "(40.901836, -73.84548)", "missing": "37", "distinct": "457"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "LOCATION"},
        {"transformationsData": [{"feature_label": "ON STREET NAME", "transformation_label": "String Indexer"}], "feature": "ON STREET NAME", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "370", "mean": "", "stddev": "", "min": "1 AVENUE", "max": "skyline dr", "missing": "161", "distinct": "259"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "ON STREET NAME"},
        {"transformationsData": [{"feature_label": "CROSS STREET NAME", "transformation_label": "String Indexer"}], "feature": "CROSS STREET NAME", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "227", "mean": "", "stddev": "", "min": "1 AVENUE", "max": "WINTHROP STREET", "missing": "273", "distinct": "195"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CROSS STREET NAME"},
        {"transformationsData": [{"feature_label": "OFF STREET NAME", "transformation_label": "String Indexer"}], "feature": "OFF STREET NAME", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "130", "mean": "", "stddev": "", "min": "100       COLUMBIA STREET", "max": "MYRTLE AVENUE", "missing": "339", "distinct": "161"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "OFF STREET NAME"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF PERSONS INJURED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.46", "stddev": "0.76", "min": "0", "max": "5", "missing": "0"}, "updatedLabel": "NUMBER OF PERSONS INJURED"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF PERSONS KILLED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.0", "stddev": "0.06", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "NUMBER OF PERSONS KILLED"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF PEDESTRIANS INJURED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.07", "stddev": "0.28", "min": "0", "max": "2", "missing": "0"}, "updatedLabel": "NUMBER OF PEDESTRIANS INJ..."},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF PEDESTRIANS KILLED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.0", "stddev": "0.04", "min": "0", "max": "1", "missing": "0"}, "updatedLabel": "NUMBER OF PEDESTRIANS KIL..."},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF CYCLIST INJURED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.04", "stddev": "0.2", "min": "0", "max": "2", "missing": "0"}, "updatedLabel": "NUMBER OF CYCLIST INJURED"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF CYCLIST KILLED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.0", "stddev": "0.0", "min": "0", "max": "0", "missing": "0"}, "updatedLabel": "NUMBER OF CYCLIST KILLED"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF MOTORIST INJURED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.42", "stddev": "0.96", "min": "0", "max": "7", "missing": "0"}, "updatedLabel": "NUMBER OF MOTORIST INJURE..."},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "NUMBER OF MOTORIST KILLED", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "0.0", "stddev": "0.0", "min": "0", "max": "0", "missing": "0"}, "updatedLabel": "NUMBER OF MOTORIST KILLED"},
        {"transformationsData": [{"feature_label": "CONTRIBUTING FACTOR VEHICLE 1", "transformation_label": "String Indexer"}], "feature": "CONTRIBUTING FACTOR VEHICLE 1", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Aggressive Driving/Road Rage", "max": "View Obstructed/Limited", "missing": "0", "distinct": "33"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CONTRIBUTING FACTOR VEHIC..."},
        {"transformationsData": [{"feature_label": "CONTRIBUTING FACTOR VEHICLE 2", "transformation_label": "String Indexer"}], "feature": "CONTRIBUTING FACTOR VEHICLE 2", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "358", "mean": "", "stddev": "", "min": "Driver Inattention/Distraction", "max": "View Obstructed/Limited", "missing": "142", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CONTRIBUTING FACTOR VEHIC..."},
        {"transformationsData": [{"feature_label": "CONTRIBUTING FACTOR VEHICLE 3", "transformation_label": "String Indexer"}], "feature": "CONTRIBUTING FACTOR VEHICLE 3", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "67", "mean": "", "stddev": "", "min": "Driver Inattention/Distraction", "max": "Unspecified", "missing": "454", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CONTRIBUTING FACTOR VEHIC..."},
        {"transformationsData": [{"feature_label": "CONTRIBUTING FACTOR VEHICLE 4", "transformation_label": "String Indexer"}], "feature": "CONTRIBUTING FACTOR VEHICLE 4", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "14", "mean": "", "stddev": "", "min": "Other Vehicular", "max": "Unspecified", "missing": "486", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CONTRIBUTING FACTOR VEHIC..."},
        {"transformationsData": [{"feature_label": "CONTRIBUTING FACTOR VEHICLE 5", "transformation_label": "String Indexer"}], "feature": "CONTRIBUTING FACTOR VEHICLE 5", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "4", "mean": "", "stddev": "", "min": "Unspecified", "max": "Unspecified", "missing": "496", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "CONTRIBUTING FACTOR VEHIC..."},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "COLLISION_ID", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "4492118.15", "stddev": "2457.99", "min": "4491064", "max": "4531603", "missing": "0"}, "updatedLabel": "COLLISION_ID"},
        {"transformationsData": [{"feature_label": "VEHICLE TYPE CODE 1", "transformation_label": "String Indexer"}], "feature": "VEHICLE TYPE CODE 1", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "3-Door", "max": "Van", "missing": "0", "distinct": "27"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "VEHICLE TYPE CODE 1"},
        {"transformationsData": [{"feature_label": "VEHICLE TYPE CODE 2", "transformation_label": "String Indexer"}], "feature": "VEHICLE TYPE CODE 2", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "305", "mean": "", "stddev": "", "min": "Bike", "max": "Van", "missing": "195", "distinct": "11"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "VEHICLE TYPE CODE 2"},
        {"transformationsData": [{"feature_label": "VEHICLE TYPE CODE 3", "transformation_label": "String Indexer"}], "feature": "VEHICLE TYPE CODE 3", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "42", "mean": "", "stddev": "", "min": "Dollar Van", "max": "Station Wagon/Sport Utility Vehicle", "missing": "458", "distinct": "4"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "VEHICLE TYPE CODE 3"},
        {"transformationsData": [{"feature_label": "VEHICLE TYPE CODE 4", "transformation_label": "String Indexer"}], "feature": "VEHICLE TYPE CODE 4", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "14", "mean": "", "stddev": "", "min": "Motorcycle", "max": "Taxi", "missing": "486", "distinct": "5"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "VEHICLE TYPE CODE 4"},
        {"transformationsData": [{"feature_label": "VEHICLE TYPE CODE 5", "transformation_label": "String Indexer"}], "feature": "VEHICLE TYPE CODE 5", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "4", "mean": "", "stddev": "", "min": "Pick-up Truck", "max": "Sedan", "missing": "497", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "VEHICLE TYPE CODE 5"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "CRASH DATE - Copy", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "2022.0", "stddev": "0.0", "min": "2022", "max": "2022", "missing": "0"}, "updatedLabel": "CRASH DATE - Copy"},
    ]}))

    # transformationPostExecutionHook(testautofe)

except Exception as ex:
    # Failures are logged, not raised.
    logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run testHooks.ipynb
try:
    # mlPreExecutionHook()

    # Train on every engineered column; the target is the fatality count.
    dataAutoML = functionClassification(
        testautofe,
        [
            "ZIP CODE",
            "LATITUDE",
            "LONGITUDE",
            "NUMBER OF PERSONS INJURED",
            "NUMBER OF PEDESTRIANS INJURED",
            "NUMBER OF PEDESTRIANS KILLED",
            "NUMBER OF CYCLIST INJURED",
            "NUMBER OF CYCLIST KILLED",
            "NUMBER OF MOTORIST INJURED",
            "NUMBER OF MOTORIST KILLED",
            "COLLISION_ID",
            "CRASH DATE - Copy",
            "CRASH DATE_dayofmonth",
            "CRASH DATE_month",
            "CRASH DATE_year",
            "CRASH TIME_stringindexer",
            "BOROUGH_stringindexer",
            "LOCATION_stringindexer",
            "ON STREET NAME_stringindexer",
            "CROSS STREET NAME_stringindexer",
            "OFF STREET NAME_stringindexer",
            "CONTRIBUTING FACTOR VEHICLE 1_stringindexer",
            "CONTRIBUTING FACTOR VEHICLE 2_stringindexer",
            "CONTRIBUTING FACTOR VEHICLE 3_stringindexer",
            "CONTRIBUTING FACTOR VEHICLE 4_stringindexer",
            "CONTRIBUTING FACTOR VEHICLE 5_stringindexer",
            "VEHICLE TYPE CODE 1_stringindexer",
            "VEHICLE TYPE CODE 2_stringindexer",
            "VEHICLE TYPE CODE 3_stringindexer",
            "VEHICLE TYPE CODE 4_stringindexer",
            "VEHICLE TYPE CODE 5_stringindexer",
        ],
        "NUMBER OF PERSONS KILLED",
    )

    # mlPostExecutionHook(dataAutoML)

except Exception as ex:
    # Failures are logged, not raised.
    logging.error(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']
    # Work on a copy: the remove/insert calls below would otherwise modify
    # the list stored in dataAutoML, and running this cell a second time
    # would then build the frame with two column names too many.
    columnNames = list(dataAutoML['columnNames'])
    if label in columnNames:
        columnNames.remove(label)
    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    # Held-out features alongside the true and predicted label.
    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted
    columnNames.insert(0, predicted)
    columnNames.insert(0, label)

    # All scores are percentages rounded to one decimal; F1, precision and
    # recall are weighted averages over the classes.
    Accuracy = np.round(
        (100 * sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)), 1)
    F1 = np.round(
        (100 * sklearn.metrics.f1_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    Precision = np.round(
        (100 * sklearn.metrics.precision_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    Recall = np.round(
        (100 * sklearn.metrics.recall_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    display(" Accuracy of Prediction on test data    : %s" % Accuracy)
    display(" F1 score of Prediction on test data    : %s" % F1)
    display(" Precision of Prediction on test data   : %s" % Precision)
    display(" Recall of Prediction on test data      : %s" % Recall)
    display(df.head())
except Exception as ex:
    # Failures are logged, not raised.
    logging.error(ex)

# The session is stopped whether or not the prediction step succeeded.
spark.stop()

