***GENERATED CODE FOR sp500 PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Configure the root logger: "LEVEL:message" lines, INFO and above.
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """CSV read/write helpers used by the pipeline.

    The functions are invoked on the class itself
    (``HDFSConnector.fetch(spark, config)``), so they are static methods.
    ``config`` is the *string* form of a Python dict literal.
    """

    @staticmethod
    def _parse_config(config):
        """Turn the dict-literal ``config`` string into a dict.

        ``ast.literal_eval`` accepts only literals, so a malformed or
        hostile config cannot execute code the way ``eval`` would.
        Raises ValueError/SyntaxError when ``config`` is not a literal.
        """
        import ast  # local import keeps this block self-contained

        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Read the CSV named by ``config['url']`` from HDFS.

        The HDFS host and port come from the ``HDFS_SERVER`` and
        ``HDFS_PORT`` environment variables (KeyError if unset). The file
        is read with a header row and schema inference, the first two rows
        are displayed, and the DataFrame is returned.
        """
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV and return the result of ``save``.

        ``config['is_header']`` equal to "Use Header Line" enables the
        header row; ``config['delimiter']`` is the field separator.
        ``spark`` is accepted for signature compatibility and not used.
        """
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_"
        # NOTE(review): the target keeps the generator's "<timestamp>_ <url>"
        # shape, including the space between the two parts - confirm intended.
        target = ("%s %s") % (stamp, settings['url'])
        return df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"]).save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
from pyspark.sql.functions import dayofmonth, month, year, col
import json
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import round
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Replace missing values in selected DataFrame columns.

    Each ``replaceBy*`` method takes a schema field (``feature.name`` is the
    column name) and a DataFrame, and returns a new DataFrame in which nulls
    of that column are filled with a statistic of the column.
    """

    def cleanValueForFE(self, value):
        """Normalise a collected statistic: None -> "", NaN -> "nan"."""
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _aggregateOverCompleteRows(self, feature, df, aggregate, alias):
        """Return ``aggregate`` of the column over rows with no null in ANY column."""
        # dropna() without a subset discards a row if any column is null,
        # so the statistic is computed on fully populated rows only.
        completeRows = df.dropna()
        row = completeRows.select(
            aggregate(col(feature.name)).alias(alias)).collect()[0]
        return self.cleanValueForFE(row[alias])

    def _fillNullsAndBlanks(self, feature, df, nullValue, blankValue):
        """Fill nulls with ``nullValue`` and single-space cells with ``blankValue``."""
        df = df.fillna(nullValue, subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", blankValue).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill nulls of the column with its mean. ``mean_`` is unused."""
        meanValue = self._aggregateOverCompleteRows(feature, df, mean, 'mean')
        # The generated code also built a "blank -> mean, else cast to Integer"
        # column here but discarded the result, so it never took effect. It is
        # left out rather than activated: applying it would truncate
        # real-valued columns to integers.
        return df.fillna(meanValue, subset=[feature.name])

    def replaceByMax(self, feature, df, max_=-1):
        """Fill nulls and " " cells of the column with its maximum. ``max_`` is unused."""
        maxValue = self._aggregateOverCompleteRows(feature, df, max, 'max')
        return self._fillNullsAndBlanks(feature, df, maxValue, maxValue)

    def replaceByMin(self, feature, df, min_=-1):
        """Fill nulls and " " cells of the column with its minimum. ``min_`` is unused."""
        minValue = self._aggregateOverCompleteRows(feature, df, min, 'min')
        return self._fillNullsAndBlanks(feature, df, minValue, minValue)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill nulls and " " cells with the column's stddev. ``stddev_`` is unused."""
        stddevValue = self._aggregateOverCompleteRows(
            feature, df, stddev, 'stddev')
        return self._fillNullsAndBlanks(feature, df, stddevValue, stddevValue)

    def replaceDateRandomly(self, feature, df):
        """Fill nulls and " " cells with the first non-null value of the column.

        Despite the name, the replacement is not random: it is the value of
        the first row returned for which the column is not null. When the
        column has no non-null value there is nothing to fill with, and
        ``df`` is returned unchanged.
        """
        firstRows = df.where(col(feature.name).isNotNull()).head(1)
        if not firstRows:
            return df
        fillValue = self.cleanValueForFE(firstRows[0][feature.name])
        # Nulls are filled with the string form, blanks with the raw value,
        # exactly as the generated code did.
        return self._fillNullsAndBlanks(feature, df, str(fillValue), fillValue)

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to every matching column.

        ``fList`` is a list of dicts with at least ``"feature"`` and
        ``"replaceby"`` keys. The first strategy keyword found in
        ``"replaceby"`` (checked in the order below) is applied.
        """
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        featuresList = df.schema.fields
        for featureObj in fList:
            for feat in featuresList:
                # NOTE(review): substring match - a feature called "name"
                # would also hit "Longname"/"Shortname". Kept as generated;
                # confirm whether an exact match is intended.
                if featureObj["feature"] not in feat.name:
                    continue
                for keyword, handler in strategies:
                    if keyword in featureObj["replaceby"]:
                        df = handler(feat, df)
                        break
        return df


def ExtractDateTransform(df, params, transformationData={}):
    """Add day-of-month, month and year columns derived from a date column.

    ``params['feature']`` names the source column. The new columns are
    ``<feature>_dayofmonth``, ``<feature>_month`` and ``<feature>_year``:
    the underscore form matches the ``<feature>_stringindexer`` /
    ``<feature>_binarizer`` naming of the other transforms and the
    ``Date_dayofmonth`` / ``Date_month`` / ``Date_year`` feature names the
    training cell selects. ``transformationData`` is accepted for a uniform
    transform signature and is not read.

    Returns the DataFrame with the three columns appended; the source
    column is kept.
    """
    feature = params['feature']
    dfReturn = df.fillna({feature: ''})
    dateColumn = col(feature)
    extractors = (
        ('_dayofmonth', dayofmonth),
        ('_month', month),
        ('_year', year),
    )
    for suffix, extractor in extractors:
        dfReturn = dfReturn.withColumn(feature + suffix, extractor(dateColumn))
    return dfReturn


def StringIndexerTransform(df, params, transformationData={}):
    """Index a string column into ``<feature>_stringindexer``.

    ``params["feature"]`` names the source column. Nulls are first replaced
    by ``''``, then a ``StringIndexer`` (``handleInvalid="skip"``) is fitted
    on and applied to the same DataFrame. When the indexed column has at
    most 4 distinct values it is cast to ``IntegerType``; otherwise it keeps
    the type produced by the indexer. ``transformationData`` is not read.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Only the number of distinct indices matters, so count them in Spark
    # instead of collecting every value to the driver.
    distinctCount = indexed.select(outcol).distinct().count()
    if distinctCount <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


def BinarizerTransform(df, params, transformationData={}):
    """Binarize a numeric column into ``<feature>_binarizer``.

    ``params['feature']`` names the source column and
    ``transformationData['threshold']`` supplies the Binarizer threshold
    (KeyError if absent). The source column is re-created as a double,
    nulls become 0.0, the Binarizer output column is added, and the source
    column is finally rounded to 2 decimals.
    """
    feature = params['feature']
    outcol = feature + "_binarizer"

    # Cast through a temporary column, then swap it in under the old name.
    asDouble = df[feature].cast("double")
    working = df.withColumn("feature_cast", asDouble)
    working = working.drop(feature)
    working = working.withColumnRenamed("feature_cast", feature)

    working = working.fillna({feature: 0.0})
    threshold = float(transformationData['threshold'])
    binarizer = Binarizer(
        threshold=threshold, inputCol=feature, outputCol=outcol)
    binarized = binarizer.transform(working)

    return binarized.withColumn(feature, round(binarized[feature], 2))


class TransformationMain:
    """Feature-engineering stage of the sp500 pipeline."""

    # Ordered plan: (transformation label, source feature, Binarizer
    # threshold or None). Each step's source column is dropped afterwards.
    _STEPS = (
        ('Extract Date', 'Date', None),
        ('String Indexer', 'Exchange', None),
        ('String Indexer', 'Symbol', None),
        ('String Indexer', 'Shortname', None),
        ('String Indexer', 'Longname', None),
        ('String Indexer', 'Sector', None),
        ('String Indexer', 'Industry', None),
        ('Binarizer', 'Ebitda', 2106018441.28),
        ('String Indexer', 'City', None),
        ('String Indexer', 'State', None),
        ('String Indexer', 'Country', None),
        ('Binarizer', 'Fulltimeemployees', 210215.18),
        ('String Indexer', 'Longbusinesssummary', None),
    )

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Cleanse nulls, then apply every planned transformation in order.

        ``config`` is a JSON string whose ``"FE"`` list drives the null
        replacement. Afterwards each step in ``_STEPS`` is applied and its
        source column dropped. The first two rows of the result are
        displayed and the transformed DataFrame is returned.
        """
        transforms = {
            'Extract Date': ExtractDateTransform,
            'String Indexer': StringIndexerTransform,
            'Binarizer': BinarizerTransform,
        }
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)

        # The transforms read only params['feature'] and
        # transformationData['threshold'], so the per-feature metadata the
        # generator inlined here (type, stats, ...) is not repeated.
        for label, feature, threshold in TransformationMain._STEPS:
            transformationData = {
                'feature_label': feature, 'transformation_label': label}
            if threshold is not None:
                transformationData['threshold'] = threshold
            params = {
                'transformationsData': [transformationData], 'feature': feature}
            transformationDF = transforms[label](
                transformationDF, params, transformationData)
            transformationDF = transformationDF.drop(feature)

        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
import pyspark


def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on the given features and return it with the hold-out set.

    The Spark DataFrame is persisted (memory and disk) and converted to
    pandas. ``label`` is the target column and ``listOfFeatures`` the
    predictor columns; 10% of the rows are held out. The value of
    ``score`` on the hold-out set is displayed.

    Returns a dict with keys ``model``, ``X_test``, ``y_test``, ``label``
    and ``columnNames`` (the ``listOfFeatures`` object that was passed in).
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandasFrame = sparkDF.toPandas()

    predictors = pandasFrame.drop(label, axis=1)
    X = predictors[listOfFeatures].values
    y = pandasFrame[label].values

    split = train_test_split(X, y, random_state=1, test_size=0.1)
    X_train, X_test, y_train, y_test = split

    tpotModel = TPOTRegressor(
        verbosity=3,
        generations=10,
        max_time_mins=5,
        n_jobs=-1,
        random_state=25,
        population_size=15,
        use_dask=True,
    )
    tpotModel.fit(X_train, y_train)

    holdoutScore = tpotModel.score(X_test, y_test)
    display(" Error rate of Model : %s" % holdoutScore)

    return {
        'model': tpotModel,
        'X_test': X_test,
        'y_test': y_test,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
# Create the Spark session. Replace 'local[1]' with your Spark master
# IP and port to connect to a server.
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run sp500Hooks.ipynb
try:
    #sourcePreExecutionHook()

    # The config is the string form of a dict literal; the adjacent literals
    # below concatenate to a single string.
    reducedsptimeseries = HDFSConnector.fetch(
        spark,
        "{'url': '/FileStore/platform/uploadedSourceFiles/reduced_sp500_timeseries (1).csv', "
        "'filename': 'reduced_sp500_timeseries (1).csv', "
        "'delimiter': ',', "
        "'file_type': 'Delimeted', "
        "'is_header': 'Use Header Line', "
        "'domain': 'http://172.31.59.158', "
        "'port': '40070', "
        "'dirPath': '/FileStore/platform', "
        "'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}",
    )
    #sourcePostExecutionHook(reducedsptimeseries)

except Exception as ex:
    # Failures are logged, not raised; later cells then lack
    # `reducedsptimeseries`.
    logging.error(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run sp500Hooks.ipynb
try:
    #transformationPreExecutionHook()

    # Feature-engineering config, one "FE" entry per line. String literals
    # that had been split across physical lines are joined again so the
    # dict literal is valid Python; the content is unchanged.
    spautofe = TransformationMain.run(reducedsptimeseries, json.dumps({"FE": [
        {"transformationsData": [{"feature_label": "Date", "transformation_label": "Extract Date"}], "feature": "Date", "type": "date", "selected": "True", "replaceby": "random", "stats": {"count": "", "mean": "", "stddev": "", "min": "", "max": "", "missing": "0"}, "transformation": [{"transformation": "Extract Date", "selectedAsDefault": 1}], "generated": "False", "updatedLabel": "Date"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "S&P500", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "2064.26", "stddev": "28.6", "min": "2002.61", "max": "2090.57", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "S&P500"},
        {"transformationsData": [{"feature_label": "Exchange", "transformation_label": "String Indexer"}], "feature": "Exchange", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "NMS", "max": "NYQ", "missing": "0", "distinct": "2"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Exchange"},
        {"transformationsData": [{"feature_label": "Symbol", "transformation_label": "String Indexer"}], "feature": "Symbol", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AAPL", "max": "XOM", "missing": "0", "distinct": "50"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Symbol"},
        {"transformationsData": [{"feature_label": "Shortname", "transformation_label": "String Indexer"}], "feature": "Shortname", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AbbVie Inc.", "max": "Wells Fargo & Company", "missing": "0", "distinct": "49"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Shortname"},
        {"transformationsData": [{"feature_label": "Longname", "transformation_label": "String Indexer"}], "feature": "Longname", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AbbVie Inc.", "max": "Wells Fargo & Company", "missing": "0", "distinct": "49"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Longname"},
        {"transformationsData": [{"feature_label": "Sector", "transformation_label": "String Indexer"}], "feature": "Sector", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Basic Materials", "max": "Technology", "missing": "0", "distinct": "9"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Sector"},
        {"transformationsData": [{"feature_label": "Industry", "transformation_label": "String Indexer"}], "feature": "Industry", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Aerospace & Defense", "max": "Tobacco", "missing": "0", "distinct": "30"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Industry"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Currentprice", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "299.53", "stddev": "238.11", "min": "44.17", "max": "1091.25", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Currentprice"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Marketcap", "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "type": "numeric", "replaceby": "mean", "selected": "True", "stats": {"count": "500", "mean": "699087720284.16", "stddev": "888538072925.68", "min": "182225174528", "max": "3846819807232", "missing": "0"}, "updatedLabel": "Marketcap"},
        {"transformationsData": [{"feature_label": "Ebitda", "threshold": 2106018441.28, "transformation_label": "Binarizer"}], "feature": "Ebitda", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "32126601984.0", "stddev": "39856989180.06", "min": "3.97707008E8", "max": "1.49547008E11", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Ebitda"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Revenuegrowth", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "0.11", "stddev": "0.19", "min": "-0.064", "max": "1.224", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Revenuegrowth"},
        {"transformationsData": [{"feature_label": "City", "transformation_label": "String Indexer"}], "feature": "City", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Armonk", "max": "Woking", "missing": "0", "distinct": "37"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "City"},
        {"transformationsData": [{"feature_label": "State", "transformation_label": "String Indexer"}], "feature": "State", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AR", "max": "WA", "missing": "0", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "State"},
        {"transformationsData": [{"feature_label": "Country", "transformation_label": "String Indexer"}], "feature": "Country", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "Ireland", "max": "United States", "missing": "0", "distinct": "3"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Country"},
        {"transformationsData": [{"feature_label": "Fulltimeemployees", "threshold": 210215.18, "transformation_label": "Binarizer"}], "feature": "Fulltimeemployees", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "210215.18", "stddev": "364585.21", "min": "3892.0", "max": "2100000.0", "missing": "0"}, "transformation": [{"transformation": "Binarizer", "selectedAsDefault": 1}], "updatedLabel": "Fulltimeemployees"},
        {"transformationsData": [{"feature_label": "Longbusinesssummary", "transformation_label": "String Indexer"}], "feature": "Longbusinesssummary", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "", "stddev": "", "min": "AbbVie Inc. discovers, develops, manufactures, and sells pharmaceuticals worldwide. The company offers Humira, an injection for autoimmune and intestinal Beh\u00e7et's diseases, and pyoderma gangrenosum; Skyrizi to treat moderate to severe plaque psoriasis, psoriatic disease, and Crohn's disease; Rinvoq to treat rheumatoid and psoriatic arthritis, ankylosing spondylitis, atopic dermatitis, axial spondyloarthropathy, ulcerative colitis, and Crohn's disease; Imbruvica for the treatment of adult patients with blood cancers; Epkinly to treat lymphoma; Elahere to treat cancer; and Venclexta/Venclyxto to treat blood cancers. It also provides facial injectables, plastics and regenerative medicine, body contouring, and skincare products; botox therapeutic; Vraylar for depressive disorder; Duopa and Duodopa to treat advanced Parkinson's disease; Ubrelvy for the acute treatment of migraine in adults; and Qulipta for episodic and chronic migraine. In addition, the company offers Ozurdex for eye diseases; Lumigan/Ganfort and Alphagan/Combigan for the reduction of elevated intraocular pressure in patients with open angle glaucoma or ocular hypertension; Restasis to increase tear production; and other eye care products. Further, it provides Mavyret/Maviret to treat chronic hepatitis C virus genotype 1-6 infection; Creon, a pancreatic enzyme therapy; Lupron to treat advanced prostate cancer, endometriosis and central precocious puberty, and patients with anemia caused by uterine fibroids; Linzess/Constella to treat irritable bowel syndrome with constipation and chronic idiopathic constipation; and Synthroid for hypothyroidism. It has collaborations with Calico Life Sciences LLC; REGENXBIO Inc.; Janssen Biotech, Inc.; Evolveimmune Therapeutics, Inc.; Genentech, Inc.; and Tentarix Biotherapeutics, LP. The company was incorporated in 2012 and is headquartered in North Chicago, Illinois.", "max": "Wells Fargo & Company, a financial services company, provides diversified banking, investment, mortgage, and consumer and commercial finance products and services in the United States and internationally. The company operates through four segments: Consumer Banking and Lending; Commercial Banking; Corporate and Investment Banking; and Wealth and Investment Management. The Consumer Banking and Lending segment offers diversified financial products and services for consumers and small businesses. Its financial products and services include checking and savings accounts, and credit and debit cards, as well as home, auto, personal, and small business lending services. The Commercial Banking segment provides financial solutions to private, family owned, and certain public companies. Its products and services include banking and credit products across various industry sectors and municipalities, secured lending and lease products, and treasury management services. The Corporate and Investment Banking segment offers a suite of capital markets, banking, and financial products and services, such as corporate banking, investment banking, treasury management, commercial real estate lending and servicing, equity, and fixed income solutions, as well as sales, trading, and research capabilities services to corporate, commercial real estate, government, and institutional clients. The Wealth and Investment Management segment provides personalized wealth management, brokerage, financial planning, lending, private banking, and trust and fiduciary products and services to affluent, high-net worth, and ultra-high-net worth clients. It also operates through financial advisors in brokerage and wealth offices, consumer bank branches, independent offices, and digitally through WellsTrade and Intuitive Investor. The company was founded in 1852 and is headquartered in San Francisco, California.", "missing": "0", "distinct": "49"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "Longbusinesssummary"},
        {"transformationsData": [{"transformation_label": "novalue"}], "feature": "Weight", "type": "real", "selected": "True", "replaceby": "mean", "stats": {"count": "500", "mean": "0.01", "stddev": "0.02", "min": "0.0032784613042062", "max": "0.0692091524397274", "missing": "0"}, "transformation": [{"transformation": "novalue", "selectedAsDefault": 1}], "updatedLabel": "Weight"},
    ]}))

    #transformationPostExecutionHook(spautofe)

except Exception as ex:
    # Failures are logged, not raised; later cells then lack `spautofe`.
    logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run sp500Hooks.ipynb
try:
    #mlPreExecutionHook()

    # Train on the engineered features; "Currentprice" is the target column.
    dataAutoML = functionRegression(
        spautofe,
        [
            "S&P500",
            "Marketcap",
            "Revenuegrowth",
            "Weight",
            "Date_dayofmonth",
            "Date_month",
            "Date_year",
            "Exchange_stringindexer",
            "Symbol_stringindexer",
            "Shortname_stringindexer",
            "Longname_stringindexer",
            "Sector_stringindexer",
            "Industry_stringindexer",
            "Ebitda_binarizer",
            "City_stringindexer",
            "State_stringindexer",
            "Country_stringindexer",
            "Fulltimeemployees_binarizer",
            "Longbusinesssummary_stringindexer",
        ],
        "Currentprice",
    )

    #mlPostExecutionHook(dataAutoML)

except Exception as ex:
    # Failures are logged, not raised; the next cell then lacks `dataAutoML`.
    logging.error(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']
    # Work on a copy: editing the list stored in dataAutoML in place would
    # leave the label/prediction names in it, and re-running this cell would
    # then fail with a column-count mismatch against X_test.
    columnNames = list(dataAutoML['columnNames'])
    if label in columnNames:
        columnNames.remove(label)
    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    # Rebuild the hold-out set as a frame: actual label and prediction
    # first, then the feature columns.
    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted
    columnNames = [label, predicted] + columnNames
    df = df[columnNames]

    # Metrics are rounded to one decimal for display.
    R2 = np.round(sklearn.metrics.r2_score(y_test, y_predicted), 1)
    Mean_Squared_Error = np.round(sklearn.metrics.mean_squared_error(y_test, y_predicted), 1)
    Mean_Absolute_Error = np.round(sklearn.metrics.mean_absolute_error(y_test, y_predicted), 1)
    display(" R2 score of Prediction on test data    : %s"%R2)
    display(" Mean Squared Error of Prediction on test data    : %s"%Mean_Squared_Error)
    display(" Mean Absolute Error of Prediction on test data   : %s"%Mean_Absolute_Error)
    display(df.head())
except Exception as ex:
    logging.error(ex)

# Runs whether or not prediction succeeded (the handler above swallows errors).
spark.stop()

