***GENERATED CODE FOR h1bclassification PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """CSV connector: read a file from HDFS and write a DataFrame out as CSV.

    The methods take no ``self`` and are meant to be called on the class,
    e.g. ``HDFSConnector.fetch(spark, config)``.
    """

    @staticmethod
    def _parse_config(config):
        """Turn the connector ``config`` into a dict without executing it.

        ``config`` is normally a string holding a dict literal.  It used to be
        passed to ``eval``, which would run any expression embedded in it;
        ``ast.literal_eval`` yields the same value for plain literals but
        refuses anything else.  Strict JSON (``true``/``false``/``null``) is
        accepted as a fallback, and an already-parsed dict is returned as is.

        Raises:
            ValueError: if the string is neither a Python literal nor JSON.
        """
        if isinstance(config, dict):
            return config
        # Local imports keep this block independent of the file-level imports.
        import ast
        import json
        try:
            return ast.literal_eval(config)
        except (ValueError, SyntaxError):
            return json.loads(config)

    @staticmethod
    def fetch(spark, config):
        """Read the CSV at ``config['url']`` from HDFS into a DataFrame.

        Args:
            spark: object exposing ``read`` (a SparkSession in practice).
            config: dict literal / JSON string (or dict) with a ``url`` key
                that is appended directly after ``hdfs://host:port``.

        Returns:
            The DataFrame read with ``header='true'`` and ``inferschema='true'``.

        Raises:
            KeyError: if ``HDFS_SERVER`` or ``HDFS_PORT`` is not set in the
                environment, or ``url`` is missing from the config.
        """
        # HDFS host and port come from the environment.
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        # `display` is not defined in this file; presumably it is supplied by
        # the notebook environment -- TODO confirm before running as a script.
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV to a timestamp-prefixed path.

        Args:
            df: DataFrame to write.
            spark: unused; kept so the call signature stays the same.
            config: dict literal / JSON string (or dict) with ``is_header``,
                ``delimiter`` and ``url`` keys.

        Returns:
            Whatever the writer's ``save`` returns.
        """
        settings = HDFSConnector._parse_config(config)
        # A header row is written only for the exact UI label below.
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        # NOTE(review): the target is "<timestamp>_ <url>" (note the space) and
        # carries no hdfs:// prefix -- kept as is, confirm this is intended.
        target = ("%s %s") % (
            datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_",
            settings['url'])
        writer = df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Fill missing and blank (" ") values in Spark DataFrame columns.

    Every ``replaceBy*`` method takes a schema field (anything with a
    ``.name``) and a DataFrame, and returns a new DataFrame in which nulls of
    that column (via ``fillna``) and single-space cells are replaced by a
    value derived from the column itself.
    """

    def cleanValueForFE(self, value):
        """Normalise a collected statistic so it can be used as a fill value.

        Returns ``""`` for ``None``, the string ``"nan"`` for anything whose
        ``str()`` is ``'nan'``, and the value unchanged otherwise.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _replaceByStatistic(self, feature, df, aggregate):
        """Shared body of the mean/max/min/stddev replacements.

        ``aggregate`` is the column aggregate function to apply (``mean``,
        ``max``, ``min`` or ``stddev`` as imported at file level).
        """
        column = feature.name
        # Only this column's missing values are dropped before aggregating.
        # Dropping rows with a null in *any* column would let one sparse
        # column elsewhere empty the frame and turn every statistic into None.
        known = df.dropna(subset=[column])
        statValue = self.cleanValueForFE(
            known.select(aggregate(col(column)).alias('stat')).collect()[0]['stat'])
        df = df.fillna(statValue, subset=[column])
        # fillna does not touch blank cells, so " " is replaced explicitly.
        return df.withColumn(
            column,
            when(col(column) == " ", statValue).otherwise(col(column)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Replace nulls and " " cells of ``feature`` by the column mean.

        ``mean_`` is unused and kept only for call compatibility.  The blank
        replacement is applied the same way as in the max/min/stddev variants
        (no integer cast, so fractional values are not truncated).
        """
        return self._replaceByStatistic(feature, df, mean)

    def replaceByMax(self, feature, df, max_=-1):
        """Replace nulls and " " cells of ``feature`` by the column maximum.

        ``max_`` is unused and kept only for call compatibility.
        """
        return self._replaceByStatistic(feature, df, max)

    def replaceByMin(self, feature, df, min_=-1):
        """Replace nulls and " " cells of ``feature`` by the column minimum.

        ``min_`` is unused and kept only for call compatibility.
        """
        return self._replaceByStatistic(feature, df, min)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Replace nulls and " " cells of ``feature`` by the column stddev.

        ``stddev_`` is unused and kept only for call compatibility.
        """
        return self._replaceByStatistic(feature, df, stddev)

    def replaceDateRandomly(self, feature, df):
        """Replace nulls and " " cells of ``feature`` by an existing value.

        Despite the name, the value used is the first non-null one returned
        by ``head(1)``, not a random pick.  If the column holds no non-null
        value there is nothing to fill with and ``df`` is returned unchanged.
        """
        column = feature.name
        sample = df.where(col(column).isNotNull()).head(1)
        if not sample:
            return df
        fillValue = self.cleanValueForFE(sample[0][column])
        df = df.fillna(str(fillValue), subset=[column])
        return df.withColumn(
            column,
            when(col(column) == " ", fillValue).otherwise(col(column)))

    def replaceNullValues(self, fList, df):
        """Apply each feature's configured replacement strategy to ``df``.

        Args:
            fList: iterable of dicts with a ``feature`` name and a
                ``replaceby`` string; the strategy is the first of
                mean / max / min / stddev / random contained in ``replaceby``.
            df: DataFrame to clean; its schema is read once, up front.

        Returns:
            The DataFrame after all matching replacements.  Entries whose
            feature matches no column, or whose ``replaceby`` names no known
            strategy, are skipped.
        """
        # Checked in this order; the first keyword found in `replaceby` wins.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        schemaFields = df.schema.fields
        for featureObj in fList:
            wanted = featureObj["feature"]
            # An exact column-name match takes precedence, so that e.g.
            # "employer_name" does not also rewrite "i129_employer_name".
            # Substring matching remains the fallback when no column has
            # exactly that name.
            targets = [f for f in schemaFields if f.name == wanted]
            if not targets:
                targets = [f for f in schemaFields if wanted in f.name]
            if not targets:
                continue
            replaceBy = featureObj["replaceby"]
            strategy = next(
                (method for keyword, method in strategies if keyword in replaceBy),
                None)
            if strategy is None:
                continue
            for feat in targets:
                df = strategy(feat, df)
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column produced by ``StringIndexer``.

    Args:
        df: input DataFrame.
        params: dict; only ``params["feature"]`` (the column to index) is read.
        transformationData: unused; kept so existing three-argument calls
            keep working.

    Returns:
        ``df`` with nulls of the feature column filled with ``''`` and the
        index column appended.  When the index column has at most four
        distinct values it is cast to ``IntegerType``; otherwise it is left
        as produced by the indexer.  The source column is not dropped here.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls in the feature column are filled with '' before fitting.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count the distinct index values on the cluster; only the number is
    # needed, so the values themselves are not collected to the driver.
    if indexed.select(outcol).distinct().count() <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    # TODO: change df argument in run with following
    def run(transformationDF, config):
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'bcn', 'transformation_label': 'String Indexer'}], 'feature': 'bcn', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
                                                  'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '(b)(6)', 'missing': '0', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'bcn'}, {'feature_label': 'bcn', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('bcn')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'country_of_birth', 'transformation_label': 'String Indexer'}], 'feature': 'country_of_birth', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'AFG', 'max': 'VNM', 'missing': '0', 'distinct': '39'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'country_of_birth'}, {'feature_label': 'country_of_birth', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('country_of_birth')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'country_of_nationality', 'transformation_label': 'String Indexer'}], 'feature': 'country_of_nationality', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'VNM', 'missing': '0', 'distinct': '36'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'country_of_nationality'}, {'feature_label': 'country_of_nationality', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('country_of_nationality')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'ben_date_of_birth', 'transformation_label': 'String Indexer'}], 'feature': 'ben_date_of_birth', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(6)', 'max': '(b)(6)', 'missing': '0', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'ben_date_of_birth'}, {'feature_label': 'ben_date_of_birth', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('ben_date_of_birth')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'ben_year_of_birth', 'transformation_label': 'String Indexer'}], 'feature': 'ben_year_of_birth', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '1989.51', 'stddev': '5.97', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '1999', 'missing': '0', 'distinct': '32'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'ben_year_of_birth'}, {'feature_label': 'ben_year_of_birth', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('ben_year_of_birth')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'gender', 'transformation_label': 'String Indexer'}], 'feature': 'gender', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'female', 'max': 'male', 'missing': '0', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'gender'}, {'feature_label': 'gender', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('gender')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'employer_name', 'transformation_label': 'String Indexer'}], 'feature': 'employer_name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '3S Business Corporation', 'max': 'my choice wireless', 'missing': '0', 'distinct': '373'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'employer_name'}, {'feature_label': 'employer_name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('employer_name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'FEIN', 'transformation_label': 'String Indexer'}], 'feature': 'FEIN', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '561151235.02', 'stddev': '293876371.39', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '980429806', 'missing': '0', 'distinct': '373'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'FEIN'}, {'feature_label': 'FEIN', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('FEIN')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'mail_addr', 'transformation_label': 'String Indexer'}], 'feature': 'mail_addr', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'Worldwide Plaza 309 West 49th Street', 'missing': '0', 'distinct': '373'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'mail_addr'}, {'feature_label': 'mail_addr', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('mail_addr')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'city', 'transformation_label': 'String Indexer'}], 'feature': 'city', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'Addison', 'max': 'Wixom', 'missing': '0', 'distinct': '211'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'city'}, {'feature_label': 'city', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('city')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'state', 'transformation_label': 'String Indexer'}], 'feature': 'state', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': 'AL', 'max': 'WV', 'missing': '0', 'distinct': '36'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'state'}, {'feature_label': 'state', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('state')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'zip', 'transformation_label': 'String Indexer'}], 'feature': 'zip', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '50915.86', 'stddev': '32472.89', 'min': '01752-1291', 'max': '98121-1902', 'missing': '0', 'distinct': '365'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'zip'}, {'feature_label': 'zip', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('zip')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'agent_first_name', 'transformation_label': 'String Indexer'}], 'feature': 'agent_first_name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '499', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'kim', 'missing': '1', 'distinct': '344'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'agent_first_name'}, {'feature_label': 'agent_first_name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('agent_first_name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'agent_last_name', 'transformation_label': 'String Indexer'}], 'feature': 'agent_last_name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'van Burk', 'missing': '0', 'distinct': '373'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'agent_last_name'}, {'feature_label': 'agent_last_name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('agent_last_name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'lottery_year', 'transformation_label': 'String Indexer'}], 'feature': 'lottery_year', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '2022.0', 'stddev': '0.0', 'min': '2022', 'max': '2022', 'missing': '0', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'lottery_year'}, {'feature_label': 'lottery_year', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('lottery_year')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'status_type', 'transformation_label': 'String Indexer'}], 'feature': 'status_type', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'SELECTED', 'missing': '0', 'distinct': '3'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'status_type'}, {'feature_label': 'status_type', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('status_type')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'ben_multi_reg_ind', 'transformation_label': 'String Indexer'}], 'feature': 'ben_multi_reg_ind', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '500', 'mean': '0.32', 'stddev': '0.47', 'min': '0', 'max': '1', 'missing': '0', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'ben_multi_reg_ind'}, {'feature_label': 'ben_multi_reg_ind', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('ben_multi_reg_ind')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'RECEIPT_NUMBER', 'transformation_label': 'String Indexer'}], 'feature': 'RECEIPT_NUMBER', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '(b)(6)', 'missing': '351', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'RECEIPT_NUMBER'}, {'feature_label': 'RECEIPT_NUMBER', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('RECEIPT_NUMBER')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'rec_date', 'transformation_label': 'String Indexer'}], 'feature': 'rec_date', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': '1/18/2022', 'max': '9/8/2021', 'missing': '351', 'distinct': '74'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'rec_date'}, {'feature_label': 'rec_date', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('rec_date')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'FIRST_DECISION', 'transformation_label': 'String Indexer'}], 'feature': 'FIRST_DECISION', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': 'Approved', 'max': 'Denied', 'missing': '351', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'FIRST_DECISION'}, {'feature_label': 'FIRST_DECISION', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('FIRST_DECISION')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'first_decision_date', 'transformation_label': 'String Indexer'}], 'feature': 'first_decision_date', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': '1/12/2022', 'max': '9/9/2021', 'missing': '355', 'distinct': '106'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'first_decision_date'}, {'feature_label': 'first_decision_date', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('first_decision_date')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'i129_employer_name', 'transformation_label': 'String Indexer'}], 'feature': 'i129_employer_name', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'ZENSOFT LLC', 'missing': '355', 'distinct': '122'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'i129_employer_name'}, {'feature_label': 'i129_employer_name', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('i129_employer_name')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'PET_STREET', 'transformation_label': 'String Indexer'}], 'feature': 'PET_STREET', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': '1 CRANBERRY HILL', 'max': 'ONE APPLE PARK WAY M/S 1041GM', 'missing': '351', 'distinct': '128'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'PET_STREET'}, {'feature_label': 'PET_STREET', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('PET_STREET')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'PET_CITY', 'transformation_label': 'String Indexer'}], 'feature': 'PET_CITY', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': 'ARLINGTON HEIGHTS', 'max': 'WALTHAM', 'missing': '355', 'distinct': '90'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'PET_CITY'}, {'feature_label': 'PET_CITY', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('PET_CITY')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'PET_STATE', 'transformation_label': 'String Indexer'}], 'feature': 'PET_STATE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'WI', 'missing': '351', 'distinct': '26'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'PET_STATE'}, {'feature_label': 'PET_STATE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('PET_STATE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'PET_ZIP', 'transformation_label': 'String Indexer'}], 'feature': 'PET_ZIP', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '86937300.11', 'stddev': '248605944.83', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '98121', 'missing': '355', 'distinct': '115'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'PET_ZIP'}, {'feature_label': 'PET_ZIP', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('PET_ZIP')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'REQUESTED_CLASS', 'transformation_label': 'String Indexer'}], 'feature': 'REQUESTED_CLASS', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': '1B1', 'max': '1B1', 'missing': '355', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'REQUESTED_CLASS'}, {'feature_label': 'REQUESTED_CLASS', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('REQUESTED_CLASS')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BASIS_FOR_CLASSIFICATION', 'transformation_label': 'String Indexer'}], 'feature': 'BASIS_FOR_CLASSIFICATION', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'A', 'missing': '355', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BASIS_FOR_CLASSIFICATION'}, {'feature_label': 'BASIS_FOR_CLASSIFICATION', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop(
            'BASIS_FOR_CLASSIFICATION')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'REQUESTED_ACTION', 'transformation_label': 'String Indexer'}], 'feature': 'REQUESTED_ACTION', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': 'A', 'max': 'B', 'missing': '351', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'REQUESTED_ACTION'}, {'feature_label': 'REQUESTED_ACTION', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('REQUESTED_ACTION')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'NUMBER_OF_BENEFICIARIES', 'transformation_label': 'String Indexer'}], 'feature': 'NUMBER_OF_BENEFICIARIES', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '1.0', 'stddev': '0.0', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '1', 'missing': '351', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'NUMBER_OF_BENEFICIARIES'}, {'feature_label': 'NUMBER_OF_BENEFICIARIES', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('NUMBER_OF_BENEFICIARIES')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_SEX', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_SEX', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': 'F', 'max': 'M', 'missing': '351', 'distinct': '3'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_SEX'}, {'feature_label': 'BEN_SEX', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_SEX')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_COUNTRY_OF_BIRTH', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_COUNTRY_OF_BIRTH', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'VIETN', 'missing': '355', 'distinct': '21'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_COUNTRY_OF_BIRTH'}, {'feature_label': 'BEN_COUNTRY_OF_BIRTH', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_COUNTRY_OF_BIRTH')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_CURRENT_CLASS', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_CURRENT_CLASS', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '115', 'mean': '', 'stddev': '', 'min': 'E2', 'max': 'UU', 'missing': '377', 'distinct': '11'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_CURRENT_CLASS'}, {'feature_label': 'BEN_CURRENT_CLASS', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_CURRENT_CLASS')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'JOB_TITLE', 'transformation_label': 'String Indexer'}], 'feature': 'JOB_TITLE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '88', 'mean': '', 'stddev': '', 'min': 'ADVISORY CONSULTANT', 'max': 'VALIDATION ENGINEER', 'missing': '412', 'distinct': '67'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'JOB_TITLE'}, {'feature_label': 'JOB_TITLE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('JOB_TITLE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'DOL_ETA_CASE_NUMBER', 'transformation_label': 'String Indexer'}], 'feature': 'DOL_ETA_CASE_NUMBER', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '144', 'mean': '', 'stddev': '', 'min': '(b)(6)', 'max': '(b)(6)', 'missing': '356', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'DOL_ETA_CASE_NUMBER'}, {'feature_label': 'DOL_ETA_CASE_NUMBER', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('DOL_ETA_CASE_NUMBER')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WORKSITE_STREET', 'transformation_label': 'String Indexer'}], 'feature': 'WORKSITE_STREET', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': '1 ALLEN BRADLEY DRIVE', 'max': 'SUITE 200 5800 UPLANDER WAY', 'missing': '355', 'distinct': '147'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WORKSITE_STREET'}, {'feature_label': 'WORKSITE_STREET', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WORKSITE_STREET')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WORKSITE_CITY', 'transformation_label': 'String Indexer'}], 'feature': 'WORKSITE_CITY', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'WICHITA', 'missing': '355', 'distinct': '99'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WORKSITE_CITY'}, {'feature_label': 'WORKSITE_CITY', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WORKSITE_CITY')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WORKSITE_STATE', 'transformation_label': 'String Indexer'}], 'feature': 'WORKSITE_STATE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '', 'stddev': '', 'min': 'AZ', 'max': 'WV', 'missing': '355', 'distinct': '31'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WORKSITE_STATE'}, {'feature_label': 'WORKSITE_STATE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WORKSITE_STATE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WORKSITE_ZIP', 'transformation_label': 'String Indexer'}], 'feature': 'WORKSITE_ZIP', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '34604301.81', 'stddev': '166017480.01', 'min': '10018', 'max': '98121', 'missing': '355', 'distinct': '125'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WORKSITE_ZIP'}, {'feature_label': 'WORKSITE_ZIP', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WORKSITE_ZIP')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'FULL_TIME_IND', 'transformation_label': 'String Indexer'}], 'feature': 'FULL_TIME_IND', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '1', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '(b)(3) (b)(6) (b)(7)(c)', 'missing': '499', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'FULL_TIME_IND'}, {'feature_label': 'FULL_TIME_IND', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('FULL_TIME_IND')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WAGE_AMT', 'transformation_label': 'String Indexer'}], 'feature': 'WAGE_AMT', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '0', 'mean': '', 'stddev': '', 'min': '', 'max': '', 'missing': '500', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WAGE_AMT'}, {'feature_label': 'WAGE_AMT', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WAGE_AMT')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'WAGE_UNIT', 'transformation_label': 'String Indexer'}], 'feature': 'WAGE_UNIT', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '1', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '(b)(3) (b)(6) (b)(7)(c)', 'missing': '500', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'WAGE_UNIT'}, {'feature_label': 'WAGE_UNIT', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('WAGE_UNIT')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'valid_from', 'transformation_label': 'String Indexer'}], 'feature': 'valid_from', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '143', 'mean': '', 'stddev': '', 'min': '1/24/2022', 'max': '7/7/2022', 'missing': '357', 'distinct': '34'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'valid_from'}, {'feature_label': 'valid_from', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('valid_from')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'valid_to', 'transformation_label': 'String Indexer'}], 'feature': 'valid_to', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '148', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '9/8/2024', 'missing': '357', 'distinct': '43'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'valid_to'}, {'feature_label': 'valid_to', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('valid_to')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'NUM_OF_EMP_IN_US', 'transformation_label': 'String Indexer'}], 'feature': 'NUM_OF_EMP_IN_US', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '11260.41', 'stddev': '81383.44', 'min': '0', 'max': '8600', 'missing': '355', 'distinct': '26'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'NUM_OF_EMP_IN_US'}, {'feature_label': 'NUM_OF_EMP_IN_US', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('NUM_OF_EMP_IN_US')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'S1Q1A', 'transformation_label': 'String Indexer'}], 'feature': 'S1Q1A', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'Y', 'missing': '351', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'S1Q1A'}, {'feature_label': 'S1Q1A', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('S1Q1A')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'S1Q1B', 'transformation_label': 'String Indexer'}], 'feature': 'S1Q1B', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '0', 'mean': '', 'stddev': '', 'min': '', 'max': '', 'missing': '499', 'distinct': '1'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'S1Q1B'}, {'feature_label': 'S1Q1B', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('S1Q1B')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_EDUCATION_CODE', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_EDUCATION_CODE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'I', 'missing': '355', 'distinct': '4'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_EDUCATION_CODE'}, {'feature_label': 'BEN_EDUCATION_CODE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_EDUCATION_CODE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'ED_LEVEL_DEFINITION', 'transformation_label': 'String Indexer'}], 'feature': 'ED_LEVEL_DEFINITION', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': "MASTER'S DEGREE", 'missing': '355', 'distinct': '4'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'ED_LEVEL_DEFINITION'}, {'feature_label': 'ED_LEVEL_DEFINITION', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('ED_LEVEL_DEFINITION')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_PFIELD_OF_STUDY', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_PFIELD_OF_STUDY', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'VISUAL EFFECTS', 'missing': '355', 'distinct': '91'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_PFIELD_OF_STUDY'}, {'feature_label': 'BEN_PFIELD_OF_STUDY', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_PFIELD_OF_STUDY')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'BEN_COMP_PAID', 'transformation_label': 'String Indexer'}], 'feature': 'BEN_COMP_PAID', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '145', 'mean': '103396.15', 'stddev': '49218.62', 'min': '100000', 'max': '99688', 'missing': '351', 'distinct': '131'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'BEN_COMP_PAID'}, {'feature_label': 'BEN_COMP_PAID', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('BEN_COMP_PAID')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'DOT_CODE', 'transformation_label': 'String Indexer'}], 'feature': 'DOT_CODE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '148', 'mean': '44.82', 'stddev': '45.95', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '91', 'missing': '352', 'distinct': '23'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'DOT_CODE'}, {'feature_label': 'DOT_CODE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('DOT_CODE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'NAICS_CODE', 'transformation_label': 'String Indexer'}], 'feature': 'NAICS_CODE', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '454638.02', 'stddev': '250441.61', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '999999', 'missing': '351', 'distinct': '51'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'NAICS_CODE'}, {'feature_label': 'NAICS_CODE', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('NAICS_CODE')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'S3Q1', 'transformation_label': 'String Indexer'}], 'feature': 'S3Q1', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'M', 'missing': '351', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'S3Q1'}, {'feature_label': 'S3Q1', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('S3Q1')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'S4Q1', 'transformation_label': 'String Indexer'}], 'feature': 'S4Q1', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '149', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': 'Y', 'missing': '356', 'distinct': '2'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'S4Q1'}, {'feature_label': 'S4Q1', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('S4Q1')
        transformationDF = StringIndexerTransform(transformationDF, {'transformationsData': [{'feature_label': 'T_U_VAWA_FLAG', 'transformation_label': 'String Indexer'}], 'feature': 'T_U_VAWA_FLAG', 'type': 'string', 'selected': 'True', 'replaceby': 'max', 'stats': {
            'count': '1', 'mean': '', 'stddev': '', 'min': '(b)(3) (b)(6) (b)(7)(c)', 'max': '(b)(3) (b)(6) (b)(7)(c)', 'missing': '500', 'distinct': '0'}, 'transformation': [{'transformation': 'String Indexer', 'selectedAsDefault': 1}], 'updatedLabel': 'T_U_VAWA_FLAG'}, {'feature_label': 'T_U_VAWA_FLAG', 'transformation_label': 'String Indexer'})
        transformationDF = transformationDF.drop('T_U_VAWA_FLAG')
        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pyspark


def functionClassification(sparkDF, listOfFeatures, label):
    """Run a TPOT AutoML search for a classifier on a Spark DataFrame.

    The Spark DataFrame is persisted, converted to pandas, and split 90/10
    into training and hold-out partitions (``random_state=1``).  A
    ``TPOTClassifier`` is fitted on the training partition and its score on
    the hold-out partition is shown with ``display``.

    Args:
        sparkDF: Spark DataFrame containing the feature columns and the label.
        listOfFeatures: Names of the columns used as model inputs.
        label: Name of the target column.

    Returns:
        dict with the keys ``model`` (the fitted TPOTClassifier), ``X_test``,
        ``y_test``, ``label`` and ``columnNames`` (the ``listOfFeatures``
        argument, unchanged).
    """
    # Mark the DataFrame for caching (memory first, spilling to disk) before
    # it is converted to pandas.  It is left persisted on return.
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandasFrame = sparkDF.toPandas()

    # The label column is dropped before the feature selection, so a label
    # that is also listed in listOfFeatures fails the column lookup instead
    # of silently ending up in the feature matrix.
    withoutLabel = pandasFrame.drop(label, axis=1)
    featureMatrix = withoutLabel[listOfFeatures].values
    targetVector = pandasFrame[label].values

    trainX, holdoutX, trainY, holdoutY = train_test_split(
        featureMatrix, targetVector, test_size=0.1, random_state=1)

    tpotModel = TPOTClassifier(generations=10,
                               population_size=15,
                               max_time_mins=5,
                               n_jobs=-1,
                               use_dask=True,
                               verbosity=3)
    tpotModel.fit(trainX, trainY)

    holdoutScore = tpotModel.score(holdoutX, holdoutY)
    display(" Accuracy of Model : %s" % holdoutScore)

    return {
        'model': tpotModel,
        'X_test': holdoutX,
        'y_test': holdoutY,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# Get (or reuse) a Spark session whose master is 'local[1]'.
spark = (
    SparkSession.builder
    .master('local[1]')
    .getOrCreate()
)
#%run h1bclassificationHooks.ipynb
try:
    #sourcePreExecutionHook()

    # The connector receives its configuration as the string form of a dict;
    # the adjacent literals below concatenate into that single string.
    # `hb` is read by the transformation cell that follows.
    hb = HDFSConnector.fetch(
        spark,
        "{'url': '/FileStore/platform/uploadedSourceFiles/H-1B.csv', "
        "'filename': 'H-1B.csv', "
        "'delimiter': ',', "
        "'file_type': 'Delimeted', "
        "'is_header': 'Use Header Line', "
        "'domain': 'http://172.31.59.158', "
        "'port': '40070', "
        "'dirPath': '/FileStore/platform', "
        "'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}",
    )
    #sourcePostExecutionHook(hb)

except Exception as fetch_error:
    # Failures are logged, not raised; `hb` stays undefined in that case.
    logging.error(fetch_error)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run h1bclassificationHooks.ipynb
try:
	#transformationPreExecutionHook()

	# Feature-engineering configuration: one entry per source column, each
	# requesting the "String Indexer" transformation. The payload is kept one
	# feature per line; the previous hard-wrapped layout split three string
	# literals across physical lines, which Python rejects as a SyntaxError.
	autofe = TransformationMain.run(hb, json.dumps({"FE": [
		{"transformationsData": [{"feature_label": "bcn", "transformation_label": "String Indexer"}], "feature": "bcn", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "(b)(6)", "missing": "0", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "bcn"},
		{"transformationsData": [{"feature_label": "country_of_birth", "transformation_label": "String Indexer"}], "feature": "country_of_birth", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AFG", "max": "VNM", "missing": "0", "distinct": "39"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "country_of_birth"},
		{"transformationsData": [{"feature_label": "country_of_nationality", "transformation_label": "String Indexer"}], "feature": "country_of_nationality", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "VNM", "missing": "0", "distinct": "36"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "country_of_nationality"},
		{"transformationsData": [{"feature_label": "ben_date_of_birth", "transformation_label": "String Indexer"}], "feature": "ben_date_of_birth", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(6)", "max": "(b)(6)", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "ben_date_of_birth"},
		{"transformationsData": [{"feature_label": "ben_year_of_birth", "transformation_label": "String Indexer"}], "feature": "ben_year_of_birth", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "1989.51", "stddev": "5.97", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "1999", "missing": "0", "distinct": "32"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "ben_year_of_birth"},
		{"transformationsData": [{"feature_label": "gender", "transformation_label": "String Indexer"}], "feature": "gender", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "female", "max": "male", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "gender"},
		{"transformationsData": [{"feature_label": "employer_name", "transformation_label": "String Indexer"}], "feature": "employer_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "3S Business Corporation", "max": "my choice wireless", "missing": "0", "distinct": "373"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "employer_name"},
		{"transformationsData": [{"feature_label": "FEIN", "transformation_label": "String Indexer"}], "feature": "FEIN", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "561151235.02", "stddev": "293876371.39", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "980429806", "missing": "0", "distinct": "373"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "FEIN"},
		{"transformationsData": [{"feature_label": "mail_addr", "transformation_label": "String Indexer"}], "feature": "mail_addr", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "Worldwide Plaza 309 West 49th Street", "missing": "0", "distinct": "373"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "mail_addr"},
		{"transformationsData": [{"feature_label": "city", "transformation_label": "String Indexer"}], "feature": "city", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Addison", "max": "Wixom", "missing": "0", "distinct": "211"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "city"},
		{"transformationsData": [{"feature_label": "state", "transformation_label": "String Indexer"}], "feature": "state", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AL", "max": "WV", "missing": "0", "distinct": "36"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "state"},
		{"transformationsData": [{"feature_label": "zip", "transformation_label": "String Indexer"}], "feature": "zip", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "50915.86", "stddev": "32472.89", "min": "01752-1291", "max": "98121-1902", "missing": "0", "distinct": "365"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "zip"},
		{"transformationsData": [{"feature_label": "agent_first_name", "transformation_label": "String Indexer"}], "feature": "agent_first_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "499", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "kim", "missing": "1", "distinct": "344"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "agent_first_name"},
		{"transformationsData": [{"feature_label": "agent_last_name", "transformation_label": "String Indexer"}], "feature": "agent_last_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "van Burk", "missing": "0", "distinct": "373"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "agent_last_name"},
		{"transformationsData": [{"feature_label": "lottery_year", "transformation_label": "String Indexer"}], "feature": "lottery_year", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "2022.0", "stddev": "0.0", "min": "2022", "max": "2022", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "lottery_year"},
		{"transformationsData": [{"feature_label": "status_type", "transformation_label": "String Indexer"}], "feature": "status_type", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "SELECTED", "missing": "0", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "status_type"},
		{"transformationsData": [{"feature_label": "ben_multi_reg_ind", "transformation_label": "String Indexer"}], "feature": "ben_multi_reg_ind", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "0.32", "stddev": "0.47", "min": "0", "max": "1", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "ben_multi_reg_ind"},
		{"transformationsData": [{"feature_label": "RECEIPT_NUMBER", "transformation_label": "String Indexer"}], "feature": "RECEIPT_NUMBER", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "(b)(6)", "missing": "351", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "RECEIPT_NUMBER"},
		{"transformationsData": [{"feature_label": "rec_date", "transformation_label": "String Indexer"}], "feature": "rec_date", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "1/18/2022", "max": "9/8/2021", "missing": "351", "distinct": "74"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "rec_date"},
		{"transformationsData": [{"feature_label": "FIRST_DECISION", "transformation_label": "String Indexer"}], "feature": "FIRST_DECISION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "Approved", "max": "Denied", "missing": "351", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "FIRST_DECISION"},
		{"transformationsData": [{"feature_label": "first_decision_date", "transformation_label": "String Indexer"}], "feature": "first_decision_date", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "1/12/2022", "max": "9/9/2021", "missing": "355", "distinct": "106"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "first_decision_date"},
		{"transformationsData": [{"feature_label": "i129_employer_name", "transformation_label": "String Indexer"}], "feature": "i129_employer_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "ZENSOFT LLC", "missing": "355", "distinct": "122"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "i129_employer_name"},
		{"transformationsData": [{"feature_label": "PET_STREET", "transformation_label": "String Indexer"}], "feature": "PET_STREET", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "1 CRANBERRY HILL", "max": "ONE APPLE PARK WAY M/S 1041GM", "missing": "351", "distinct": "128"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "PET_STREET"},
		{"transformationsData": [{"feature_label": "PET_CITY", "transformation_label": "String Indexer"}], "feature": "PET_CITY", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "ARLINGTON HEIGHTS", "max": "WALTHAM", "missing": "355", "distinct": "90"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "PET_CITY"},
		{"transformationsData": [{"feature_label": "PET_STATE", "transformation_label": "String Indexer"}], "feature": "PET_STATE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "WI", "missing": "351", "distinct": "26"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "PET_STATE"},
		{"transformationsData": [{"feature_label": "PET_ZIP", "transformation_label": "String Indexer"}], "feature": "PET_ZIP", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "86937300.11", "stddev": "248605944.83", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "98121", "missing": "355", "distinct": "115"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "PET_ZIP"},
		{"transformationsData": [{"feature_label": "REQUESTED_CLASS", "transformation_label": "String Indexer"}], "feature": "REQUESTED_CLASS", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "1B1", "max": "1B1", "missing": "355", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "REQUESTED_CLASS"},
		{"transformationsData": [{"feature_label": "BASIS_FOR_CLASSIFICATION", "transformation_label": "String Indexer"}], "feature": "BASIS_FOR_CLASSIFICATION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "A", "missing": "355", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BASIS_FOR_CLASSIFICATION"},
		{"transformationsData": [{"feature_label": "REQUESTED_ACTION", "transformation_label": "String Indexer"}], "feature": "REQUESTED_ACTION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "A", "max": "B", "missing": "351", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "REQUESTED_ACTION"},
		{"transformationsData": [{"feature_label": "NUMBER_OF_BENEFICIARIES", "transformation_label": "String Indexer"}], "feature": "NUMBER_OF_BENEFICIARIES", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "1.0", "stddev": "0.0", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "1", "missing": "351", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "NUMBER_OF_BENEFICIARIES"},
		{"transformationsData": [{"feature_label": "BEN_SEX", "transformation_label": "String Indexer"}], "feature": "BEN_SEX", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "F", "max": "M", "missing": "351", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_SEX"},
		{"transformationsData": [{"feature_label": "BEN_COUNTRY_OF_BIRTH", "transformation_label": "String Indexer"}], "feature": "BEN_COUNTRY_OF_BIRTH", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "VIETN", "missing": "355", "distinct": "21"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_COUNTRY_OF_BIRTH"},
		{"transformationsData": [{"feature_label": "BEN_CURRENT_CLASS", "transformation_label": "String Indexer"}], "feature": "BEN_CURRENT_CLASS", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "115", "mean": "", "stddev": "", "min": "E2", "max": "UU", "missing": "377", "distinct": "11"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_CURRENT_CLASS"},
		{"transformationsData": [{"feature_label": "JOB_TITLE", "transformation_label": "String Indexer"}], "feature": "JOB_TITLE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "88", "mean": "", "stddev": "", "min": "ADVISORY CONSULTANT", "max": "VALIDATION ENGINEER", "missing": "412", "distinct": "67"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "JOB_TITLE"},
		{"transformationsData": [{"feature_label": "DOL_ETA_CASE_NUMBER", "transformation_label": "String Indexer"}], "feature": "DOL_ETA_CASE_NUMBER", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "144", "mean": "", "stddev": "", "min": "(b)(6)", "max": "(b)(6)", "missing": "356", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "DOL_ETA_CASE_NUMBER"},
		{"transformationsData": [{"feature_label": "WORKSITE_STREET", "transformation_label": "String Indexer"}], "feature": "WORKSITE_STREET", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "1 ALLEN BRADLEY DRIVE", "max": "SUITE 200 5800 UPLANDER WAY", "missing": "355", "distinct": "147"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WORKSITE_STREET"},
		{"transformationsData": [{"feature_label": "WORKSITE_CITY", "transformation_label": "String Indexer"}], "feature": "WORKSITE_CITY", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "WICHITA", "missing": "355", "distinct": "99"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WORKSITE_CITY"},
		{"transformationsData": [{"feature_label": "WORKSITE_STATE", "transformation_label": "String Indexer"}], "feature": "WORKSITE_STATE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "", "stddev": "", "min": "AZ", "max": "WV", "missing": "355", "distinct": "31"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WORKSITE_STATE"},
		{"transformationsData": [{"feature_label": "WORKSITE_ZIP", "transformation_label": "String Indexer"}], "feature": "WORKSITE_ZIP", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "34604301.81", "stddev": "166017480.01", "min": "10018", "max": "98121", "missing": "355", "distinct": "125"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WORKSITE_ZIP"},
		{"transformationsData": [{"feature_label": "FULL_TIME_IND", "transformation_label": "String Indexer"}], "feature": "FULL_TIME_IND", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "1", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "(b)(3) (b)(6) (b)(7)(c)", "missing": "499", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "FULL_TIME_IND"},
		{"transformationsData": [{"feature_label": "WAGE_AMT", "transformation_label": "String Indexer"}], "feature": "WAGE_AMT", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "0", "mean": "", "stddev": "", "min": "", "max": "", "missing": "500", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WAGE_AMT"},
		{"transformationsData": [{"feature_label": "WAGE_UNIT", "transformation_label": "String Indexer"}], "feature": "WAGE_UNIT", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "1", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "(b)(3) (b)(6) (b)(7)(c)", "missing": "500", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "WAGE_UNIT"},
		{"transformationsData": [{"feature_label": "valid_from", "transformation_label": "String Indexer"}], "feature": "valid_from", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "143", "mean": "", "stddev": "", "min": "1/24/2022", "max": "7/7/2022", "missing": "357", "distinct": "34"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "valid_from"},
		{"transformationsData": [{"feature_label": "valid_to", "transformation_label": "String Indexer"}], "feature": "valid_to", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "148", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "9/8/2024", "missing": "357", "distinct": "43"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "valid_to"},
		{"transformationsData": [{"feature_label": "NUM_OF_EMP_IN_US", "transformation_label": "String Indexer"}], "feature": "NUM_OF_EMP_IN_US", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "11260.41", "stddev": "81383.44", "min": "0", "max": "8600", "missing": "355", "distinct": "26"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "NUM_OF_EMP_IN_US"},
		{"transformationsData": [{"feature_label": "S1Q1A", "transformation_label": "String Indexer"}], "feature": "S1Q1A", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "Y", "missing": "351", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "S1Q1A"},
		{"transformationsData": [{"feature_label": "S1Q1B", "transformation_label": "String Indexer"}], "feature": "S1Q1B", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "0", "mean": "", "stddev": "", "min": "", "max": "", "missing": "499", "distinct": "1"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "S1Q1B"},
		{"transformationsData": [{"feature_label": "BEN_EDUCATION_CODE", "transformation_label": "String Indexer"}], "feature": "BEN_EDUCATION_CODE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "I", "missing": "355", "distinct": "4"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_EDUCATION_CODE"},
		{"transformationsData": [{"feature_label": "ED_LEVEL_DEFINITION", "transformation_label": "String Indexer"}], "feature": "ED_LEVEL_DEFINITION", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "MASTER'S DEGREE", "missing": "355", "distinct": "4"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "ED_LEVEL_DEFINITION"},
		{"transformationsData": [{"feature_label": "BEN_PFIELD_OF_STUDY", "transformation_label": "String Indexer"}], "feature": "BEN_PFIELD_OF_STUDY", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "VISUAL EFFECTS", "missing": "355", "distinct": "91"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_PFIELD_OF_STUDY"},
		{"transformationsData": [{"feature_label": "BEN_COMP_PAID", "transformation_label": "String Indexer"}], "feature": "BEN_COMP_PAID", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "145", "mean": "103396.15", "stddev": "49218.62", "min": "100000", "max": "99688", "missing": "351", "distinct": "131"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "BEN_COMP_PAID"},
		{"transformationsData": [{"feature_label": "DOT_CODE", "transformation_label": "String Indexer"}], "feature": "DOT_CODE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "148", "mean": "44.82", "stddev": "45.95", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "91", "missing": "352", "distinct": "23"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "DOT_CODE"},
		{"transformationsData": [{"feature_label": "NAICS_CODE", "transformation_label": "String Indexer"}], "feature": "NAICS_CODE", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "454638.02", "stddev": "250441.61", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "999999", "missing": "351", "distinct": "51"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "NAICS_CODE"},
		{"transformationsData": [{"feature_label": "S3Q1", "transformation_label": "String Indexer"}], "feature": "S3Q1", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "M", "missing": "351", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "S3Q1"},
		{"transformationsData": [{"feature_label": "S4Q1", "transformation_label": "String Indexer"}], "feature": "S4Q1", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "149", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "Y", "missing": "356", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "S4Q1"},
		{"transformationsData": [{"feature_label": "T_U_VAWA_FLAG", "transformation_label": "String Indexer"}], "feature": "T_U_VAWA_FLAG", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "1", "mean": "", "stddev": "", "min": "(b)(3) (b)(6) (b)(7)(c)", "max": "(b)(3) (b)(6) (b)(7)(c)", "missing": "500", "distinct": "0"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "T_U_VAWA_FLAG"}
	]}))

	#transformationPostExecutionHook(autofe)

except Exception as ex:
	# Failures are logged rather than raised, so later cells still run.
	logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run h1bclassificationHooks.ipynb
try:
    #mlPreExecutionHook()

    # Every indexed column except the target is used as a model input;
    # "status_type_stringindexer" is the label to predict.
    dataAutoML = functionClassification(
        autofe,
        [
            "bcn_stringindexer",
            "country_of_birth_stringindexer",
            "country_of_nationality_stringindexer",
            "ben_date_of_birth_stringindexer",
            "ben_year_of_birth_stringindexer",
            "gender_stringindexer",
            "employer_name_stringindexer",
            "FEIN_stringindexer",
            "mail_addr_stringindexer",
            "city_stringindexer",
            "state_stringindexer",
            "zip_stringindexer",
            "agent_first_name_stringindexer",
            "agent_last_name_stringindexer",
            "lottery_year_stringindexer",
            "ben_multi_reg_ind_stringindexer",
            "RECEIPT_NUMBER_stringindexer",
            "rec_date_stringindexer",
            "FIRST_DECISION_stringindexer",
            "first_decision_date_stringindexer",
            "i129_employer_name_stringindexer",
            "PET_STREET_stringindexer",
            "PET_CITY_stringindexer",
            "PET_STATE_stringindexer",
            "PET_ZIP_stringindexer",
            "REQUESTED_CLASS_stringindexer",
            "BASIS_FOR_CLASSIFICATION_stringindexer",
            "REQUESTED_ACTION_stringindexer",
            "NUMBER_OF_BENEFICIARIES_stringindexer",
            "BEN_SEX_stringindexer",
            "BEN_COUNTRY_OF_BIRTH_stringindexer",
            "BEN_CURRENT_CLASS_stringindexer",
            "JOB_TITLE_stringindexer",
            "DOL_ETA_CASE_NUMBER_stringindexer",
            "WORKSITE_STREET_stringindexer",
            "WORKSITE_CITY_stringindexer",
            "WORKSITE_STATE_stringindexer",
            "WORKSITE_ZIP_stringindexer",
            "FULL_TIME_IND_stringindexer",
            "WAGE_AMT_stringindexer",
            "WAGE_UNIT_stringindexer",
            "valid_from_stringindexer",
            "valid_to_stringindexer",
            "NUM_OF_EMP_IN_US_stringindexer",
            "S1Q1A_stringindexer",
            "S1Q1B_stringindexer",
            "BEN_EDUCATION_CODE_stringindexer",
            "ED_LEVEL_DEFINITION_stringindexer",
            "BEN_PFIELD_OF_STUDY_stringindexer",
            "BEN_COMP_PAID_stringindexer",
            "DOT_CODE_stringindexer",
            "NAICS_CODE_stringindexer",
            "S3Q1_stringindexer",
            "S4Q1_stringindexer",
            "T_U_VAWA_FLAG_stringindexer",
        ],
        "status_type_stringindexer",
    )

    #mlPostExecutionHook(dataAutoML)

except Exception as ex:
    # Failures are logged rather than raised, so later cells still run.
    logging.error(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

try:
    # Unpack what functionClassification() returned for the hold-out split.
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']

    # Build the feature-name list as a copy. The previous code edited
    # dataAutoML['columnNames'] in place (remove + two inserts), so running
    # this cell a second time left "<label>_predicted" in the list and
    # pd.DataFrame() failed because the names no longer matched X_test's width.
    featureNames = [name for name in dataAutoML['columnNames'] if name != label]

    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    # Result frame: feature columns, then the true label and the prediction.
    df = pd.DataFrame(X_test, columns=featureNames)
    df[label] = y_test
    df[predicted] = y_predicted

    # Keep exposing `columnNames` as label, prediction, then features.
    columnNames = [label, predicted] + featureNames

    # All scores are percentages rounded to one decimal; the multi-class
    # scores use support-weighted averaging.
    Accuracy = np.round((100 * sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_predicted)), 1)
    F1 = np.round(
        (100 * sklearn.metrics.f1_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    Precision = np.round((
        100 * sklearn.metrics.precision_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    Recall = np.round((
        100 * sklearn.metrics.recall_score(y_true=y_test, y_pred=y_predicted, average="weighted")), 1)
    display(" Accuracy of Prediction on test data    : %s"%Accuracy)
    display(" F1 score of Prediction on test data    : %s"%F1)
    display(" Precision of Prediction on test data   : %s"%Precision)
    display(" Recall of Prediction on test data      : %s"%Recall)
    display(df.head())
except Exception as ex:
    # Failures are logged rather than raised, so the session is still stopped below.
    logging.error(ex)

# End of the pipeline: release the Spark session.
spark.stop()

