***GENERATED CODE FOR recommendedreviewtarget PIPELINE.***

***DON'T EDIT THIS CODE.***

***CONNECTOR FUNCTIONS TO READ DATA.***

In [None]:
import os
import datetime
import logging
import warnings
warnings.filterwarnings('ignore')
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)


class HDFSConnector:
    """Read and write CSV data through a Spark session.

    The methods take no ``self`` and are invoked on the class itself
    (``HDFSConnector.fetch(spark, config)``), so they are declared as
    static methods; calling them on the class keeps working unchanged.
    """

    @staticmethod
    def _parse_config(config):
        """Return the connector settings as a dict.

        ``config`` is normally the text of a Python dict literal (single
        quoted keys and values). It is parsed with ``ast.literal_eval`` so
        that only literals are accepted; the previous ``eval`` call would
        have executed any expression found in the string. A ``dict`` is
        passed through untouched.

        Raises:
            ValueError / SyntaxError: if the text is not a plain literal.
        """
        import ast  # local import keeps this block self-contained

        if isinstance(config, dict):
            return config
        return ast.literal_eval(config)

    @staticmethod
    def fetch(spark, config):
        """Load the CSV named by ``config['url']`` from HDFS.

        The HDFS host and port come from the ``HDFS_SERVER`` and
        ``HDFS_PORT`` environment variables (``KeyError`` if unset). The
        file is read with a header row and schema inference, a two-row
        preview is displayed, and the full DataFrame is returned.
        """
        ################### INPUT HADOOP HOST PORT TO CONNECT WITH ###############################
        hdfs_server = str(os.environ['HDFS_SERVER'])
        hdfs_port = int(os.environ['HDFS_PORT'])
        settings = HDFSConnector._parse_config(config)
        df = spark.read.options(header='true', inferschema='true').csv(
            f"hdfs://{hdfs_server}:{hdfs_port}{settings['url']}", header='true')
        # `display` is not defined in this file; presumably provided by the
        # notebook runtime -- confirm before running as a plain script.
        display(df.limit(2).toPandas())
        return df

    @staticmethod
    def put(df, spark, config):
        """Write ``df`` as CSV and return whatever the writer's ``save`` returns.

        ``config`` supplies ``is_header`` (a header line is written only when
        it equals ``"Use Header Line"``), ``delimiter`` and ``url``. The
        target path is ``"<timestamp>_ <url>"``: the current local time
        followed by an underscore, a space and the configured url.
        ``spark`` is accepted for signature parity with ``fetch`` and is not
        used.
        """
        settings = HDFSConnector._parse_config(config)
        header = 'true' if settings["is_header"] == "Use Header Line" else 'false'
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S") + "_"
        # NOTE(review): the format below puts a space between the timestamp
        # and the url; kept as-is so existing output locations do not move.
        target = ("%s %s") % (stamp, settings['url'])
        writer = df.write.format('csv').options(
            header=header, delimiter=settings["delimiter"])
        return writer.save(target)


***TRANSFORMATION FUNCTIONS THAT WILL BE APPLIED TO THE DATA***

In [None]:
import json
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import mean, stddev, min, max, col


class CleanseData:
    """Fill missing values in DataFrame columns with a per-column statistic.

    ``mean``, ``stddev``, ``min``, ``max``, ``col`` and ``when`` used below
    are the names imported from ``pyspark.sql.functions`` at file level
    (``min``/``max`` therefore are not the Python builtins here).
    """

    def cleanValueForFE(self, value):
        """Normalise an aggregate result before it is used as a fill value.

        ``None`` becomes ``""``, anything whose ``str()`` is ``'nan'``
        becomes the string ``"nan"``, and every other value is returned
        unchanged.
        """
        if value is None:
            return ""
        if str(value) == 'nan':
            return "nan"
        return value

    def _aggregateOfCompleteRows(self, feature, df, aggregate):
        """Return ``aggregate(feature)`` computed over rows without any null.

        NOTE(review): ``dropna()`` is called with no ``subset``, so a row is
        excluded when *any* column is null, not only ``feature`` -- kept as
        in the original code; confirm this is intended.
        """
        row = df.dropna().select(
            aggregate(col(feature.name)).alias('value')).collect()[0]
        return self.cleanValueForFE(row['value'])

    def _fillNullsAndBlanks(self, feature, df, value):
        """Replace nulls and single-space cells of ``feature`` with ``value``."""
        df = df.fillna(value, subset=[feature.name])
        return df.withColumn(
            feature.name,
            when(col(feature.name) == " ", value).otherwise(col(feature.name)))

    def replaceByMean(self, feature, df, mean_=-1):
        """Fill nulls/blanks of ``feature`` with the column mean.

        The original code built a ``withColumn`` expression here but threw
        the resulting DataFrame away, so blanks were never replaced. The
        blank replacement is now applied exactly as in the sibling methods;
        the Integer cast from the discarded expression is deliberately not
        applied because it would truncate non-integer columns.
        ``mean_`` is unused and kept for interface compatibility.
        """
        meanValue = self._aggregateOfCompleteRows(feature, df, mean)
        return self._fillNullsAndBlanks(feature, df, meanValue)

    def replaceByMax(self, feature, df, max_=-1):
        """Fill nulls/blanks of ``feature`` with the column maximum.

        ``max_`` is unused and kept for interface compatibility.
        """
        maxValue = self._aggregateOfCompleteRows(feature, df, max)
        return self._fillNullsAndBlanks(feature, df, maxValue)

    def replaceByMin(self, feature, df, min_=-1):
        """Fill nulls/blanks of ``feature`` with the column minimum.

        ``min_`` is unused and kept for interface compatibility.
        """
        minValue = self._aggregateOfCompleteRows(feature, df, min)
        return self._fillNullsAndBlanks(feature, df, minValue)

    def replaceByStandardDeviation(self, feature, df, stddev_=-1):
        """Fill nulls/blanks of ``feature`` with the column standard deviation.

        ``stddev_`` is unused and kept for interface compatibility.
        """
        stddevValue = self._aggregateOfCompleteRows(feature, df, stddev)
        return self._fillNullsAndBlanks(feature, df, stddevValue)

    def replaceDateRandomly(self, feature, df):
        """Fill nulls/blanks of ``feature`` with its first non-null value.

        Despite the name, no randomness is involved: the value comes from
        the first row returned by ``head(1)`` after filtering out nulls.
        If the column holds no non-null value the DataFrame is returned
        unchanged (the original raised ``IndexError`` in that case).
        """
        candidates = df.where(col(feature.name).isNotNull()).head(1)
        if not candidates:
            return df
        fillValue = self.cleanValueForFE(candidates[0][feature.name])
        # Nulls are filled with the string form, blanks with the raw value,
        # as in the original code.
        df = df.fillna(str(fillValue), subset=[feature.name])
        df = df.withColumn(feature.name,
                           when(col(feature.name) == " ", fillValue).otherwise(col(feature.name)))
        return df

    def replaceNullValues(self, fList, df):
        """Apply the configured replacement strategy to every matching column.

        Args:
            fList: iterable of dicts with a ``"feature"`` name and a
                ``"replaceby"`` strategy string.
            df: DataFrame to clean.

        Returns:
            The cleaned DataFrame (``df`` itself when nothing matched).

        For each entry, the first keyword among mean, max, min, stddev and
        random that occurs in ``"replaceby"`` selects the strategy.

        NOTE(review): columns are matched by substring
        (``"name"`` also matches ``"reviewer_name"``); kept for backward
        compatibility -- confirm whether an exact match is wanted.
        """
        # Order matters: it reproduces the original if/elif precedence.
        strategies = (
            ("mean", self.replaceByMean),
            ("max", self.replaceByMax),
            ("min", self.replaceByMin),
            ("stddev", self.replaceByStandardDeviation),
            ("random", self.replaceDateRandomly),
        )
        # Schema is read once, before any replacement is applied.
        schemaFields = df.schema.fields
        for featureObj in fList:
            for field in schemaFields:
                if featureObj["feature"] not in field.name:
                    continue
                for keyword, replace in strategies:
                    if keyword in featureObj["replaceby"]:
                        df = replace(field, df)
                        break
        return df


def StringIndexerTransform(df, params, transformationData=None):
    """Add a ``<feature>_stringindexer`` column produced by a StringIndexer.

    Args:
        df: input DataFrame.
        params: dict; only ``params["feature"]`` (the column to index) is
            read.
        transformationData: unused; accepted for interface compatibility.
            The default is ``None`` rather than a shared mutable ``{}``.

    Returns:
        The DataFrame with the extra index column. When that column holds
        at most four distinct values it is cast to ``IntegerType``.

    Raises:
        KeyError: if ``params`` has no ``"feature"`` entry.
    """
    feature = params["feature"]
    outcol = feature + "_stringindexer"

    # Nulls in the feature column are replaced by '' before fitting.
    filled = df.fillna({feature: ''})
    indexer = StringIndexer(
        inputCol=feature, outputCol=outcol, handleInvalid="skip")
    indexed = indexer.fit(filled).transform(filled)

    # Count the distinct index values in Spark instead of collecting them
    # all to the driver just to take the length of the list.
    distinctIndexCount = indexed.select(outcol).distinct().count()
    if distinctIndexCount <= 4:
        return indexed.withColumn(outcol, indexed[outcol].cast(IntegerType()))
    return indexed


class TransformationMain:
    """Entry point of the generated feature-engineering step."""

    # Columns that get string-indexed, in the order the generated code
    # processed them. 'verifed_purchaser' is spelled exactly as in the
    # original calls and must stay that way to match the column name.
    STRING_INDEXED_FEATURES = (
        'index',
        'url',
        'name',
        'brand',
        'sku',
        'reviewer_name',
        'review_title',
        'review_description',
        'recommended_review',
        'verifed_purchaser',
        'helpful_count',
        'not_helpful_count',
        'reviewed_at',
        'images',
        'rating',
        'average_rating',
        'reviews_count',
        'reviews_link',
        'comment_id',
        'uniq_id',
        'scraped_at',
    )

    # TODO: change df argument in run with following
    @staticmethod
    def run(transformationDF, config):
        """Clean the DataFrame, then string-index every configured column.

        Args:
            transformationDF: input DataFrame.
            config: JSON text whose ``"FE"`` entry is the feature list
                handed to ``CleanseData.replaceNullValues``.

        Returns:
            The DataFrame in which each column of
            ``STRING_INDEXED_FEATURES`` has been replaced by its
            ``<column>_stringindexer`` counterpart (the source column is
            dropped after indexing). A two-row preview is displayed first.

        The generated code repeated the same call once per column with a
        literal parameter dict that also embedded a ``'stats'`` profiling
        snapshot. ``StringIndexerTransform`` reads only ``params['feature']``,
        so the snapshot is omitted and the remaining keys are rebuilt per
        column.
        """
        configObj = json.loads(config)
        featureData = configObj["FE"]
        transformationDF = CleanseData().replaceNullValues(featureData, transformationDF)

        for feature in TransformationMain.STRING_INDEXED_FEATURES:
            stepLabel = {'feature_label': feature,
                         'transformation_label': 'String Indexer'}
            params = {
                'transformationsData': [dict(stepLabel)],
                'feature': feature,
                'type': 'string',
                'selected': 'True',
                'replaceby': 'max',
                'transformation': [{'transformation': 'String Indexer',
                                    'selectedAsDefault': 1}],
                'updatedLabel': feature,
            }
            transformationDF = StringIndexerTransform(
                transformationDF, params, stepLabel)
            # Only the indexed column is kept downstream.
            transformationDF = transformationDF.drop(feature)

        display(transformationDF.limit(2).toPandas())
        return transformationDF


***AUTOML FUNCTIONS***

In [None]:
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
import pyspark


def functionRegression(sparkDF, listOfFeatures, label):
    """Fit a TPOT regressor on a Spark DataFrame and return it with test data.

    The DataFrame is persisted (memory and disk), converted to pandas and
    split 90/10 into train and test sets with a fixed random state. The
    fitted model's score on the test set is displayed.

    Args:
        sparkDF: Spark DataFrame holding the features and the label.
        listOfFeatures: column names used as model inputs.
        label: name of the target column.

    Returns:
        dict with keys ``'model'``, ``'X_test'``, ``'y_test'``, ``'label'``
        and ``'columnNames'``.
    """
    sparkDF.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
    pandasFrame = sparkDF.toPandas()

    # The label column is removed before the feature columns are selected.
    withoutLabel = pandasFrame.drop(label, axis=1)
    featureMatrix = withoutLabel[listOfFeatures].values
    targetVector = pandasFrame[label].values

    X_train, X_test, y_train, y_test = train_test_split(
        featureMatrix, targetVector, random_state=1, test_size=0.1)

    regressor = TPOTRegressor(
        verbosity=3,
        generations=10,
        max_time_mins=5,
        n_jobs=-1,
        random_state=25,
        population_size=15,
        use_dask=True,
    )
    regressor.fit(X_train, y_train)

    # NOTE(review): the message says "Error rate" but the value is whatever
    # TPOTRegressor.score returns -- confirm the wording matches the metric.
    testScore = regressor.score(X_test, y_test)
    display(" Error rate of Model : %s" % testScore)

    return {
        'model': regressor,
        'X_test': X_test,
        'y_test': y_test,
        'label': label,
        'columnNames': listOfFeatures,
    }


***READING DATAFRAME***

In [None]:
############## CREATE SPARK SESSION ############################ ENTER YOUR SPARK MASTER IP AND PORT TO CONNECT TO SERVER ################
from pyspark.sql import SparkSession
# Local single-threaded Spark session; reuses an existing session if any.
spark = SparkSession.builder.master('local[1]').getOrCreate()
#%run recommendedreviewtargetHooks.ipynb
try:
    #sourcePreExecutionHook()

    # The connector config is passed as the text of a Python dict literal.
    gamestopproductreviews = HDFSConnector.fetch(spark, "{'url': '/FileStore/platform/uploadedSourceFiles/gamestop_product_reviews.csv', 'filename': 'gamestop_product_reviews.csv', 'delimiter': ',', 'file_type': 'Delimeted', 'is_header': 'Use Header Line', 'domain': 'http://172.31.59.158', 'port': '40070', 'dirPath': '/FileStore/platform', 'server_url': '/nexusMax/NexusMaxPlatform/uploads/platform/'}")
    #sourcePostExecutionHook(gamestopproductreviews)

except Exception as ex:
    # Still best-effort (the cell does not raise), but logging.exception
    # records the traceback as well as the message. On failure
    # `gamestopproductreviews` stays undefined for the cells that follow.
    logging.exception(ex)
#spark.stop()


***TRANSFORMING DATAFRAME***

In [None]:
#%run recommendedreviewtargetHooks.ipynb
# Apply the generated feature-engineering configuration to the DataFrame read
# in the previous cell and keep the result in `recommendedreviewtargetautofe`.
# The configuration is serialised with json.dumps and handed to
# TransformationMain.run; every feature listed under "FE" is configured with a
# "String Indexer" transformation.
# NOTE(review): in this export several string values inside the literal below
# are split across physical lines (for example the "min" statistic of the
# "name" feature). A raw line break inside a quoted string is a syntax error
# in Python — confirm that the cell still parses in the original notebook.
# Any exception raised here is logged and not re-raised.
try:
	#transformationPreExecutionHook()

	recommendedreviewtargetautofe = TransformationMain.run(gamestopproductreviews,json.dumps( {"FE": [{"transformationsData": [{"feature_label": "index", "transformation_label": "String Indexer"}], "feature": "index", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "500", "mean": "211.5", "stddev": "122.54", "min": "*I like the fact it uses double A batteries compared to an internal rechargeable battery.", "max": "in between each secret is so satisfying.\"", "missing": "0", "distinct": "499"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "index"}, {"transformationsData": [{"feature_label": "url", "transformation_label": "String Indexer"}], "feature": "url", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "469", "mean": "", "stddev": "", "min": " and they kind of fight over it.  They really like the retro Mario on the controller", "max": "yes: I recommend this product", "missing": "31", "distinct": "38"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "url"}, {"transformationsData": [{"feature_label": "name", "transformation_label": "String Indexer"}], "feature": "name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "451", "mean": "", "stddev": "", "min": " I just wouldn\u2019t recommend now .... 
I\u2019m fairly certain those issues will be ironed out and I\u2019ll do a later review most likely recommending this TV.\"", "max": "yes: I recommend this product", "missing": "49", "distinct": "22"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "name"}, {"transformationsData": [{"feature_label": "brand", "transformation_label": "String Indexer"}], "feature": "brand", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "451", "mean": "0.0", "stddev": "0.0", "min": " #1 is eARC (awesome)", "max": "yes: I recommend this product", "missing": "49", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "brand"}, {"transformationsData": [{"feature_label": "sku", "transformation_label": "String Indexer"}], "feature": "sku", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "453", "mean": "261287.22", "stddev": "105089.96", "min": " #4 is a regular 2.0 (works for most devices you want to plug in to get 1080p or 4k)", "max": "yes: I recommend this product", "missing": "47", "distinct": "13"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "sku"}, {"transformationsData": [{"feature_label": "reviewer_name", "transformation_label": "String Indexer"}], "feature": "reviewer_name", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "453", "mean": "43.0", "stddev": "148.96", "min": " BUT port #2 and #4 are the new HDMI2.1-120hz ports", "max": "zachR", "missing": "47", "distinct": "418"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewer_name"}, {"transformationsData": [{"feature_label": "review_title", "transformation_label": "String Indexer"}], "feature": "review_title", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "430", "mean": "0.0", "stddev": 
"0.0", "min": " and they are currently practically USELESS if you want to get 4K and Dolby 5.1 on them. They are currently not compatible with the PlayStation 5", "max": "look out gamerz, here comez awsome", "missing": "70", "distinct": "393"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "review_title"}, {"transformationsData": [{"feature_label": "review_description", "transformation_label": "String Indexer"}], "feature": "review_description", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "445", "mean": "4.29", "stddev": "1.31", "min": " and it is super buggy with the new Google Chromecast with Google TV. It will not get 5.1 audio with the new Chromecast", "max": "yes: I recommend this product", "missing": "55", "distinct": "432"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "review_description"}, {"transformationsData": [{"feature_label": "recommended_review", "transformation_label": "String Indexer"}], "feature": "recommended_review", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "422", "mean": "4.25", "stddev": "0.9", "min": " and to get 4K", "max": "yes: I recommend this product", "missing": "78", "distinct": "18"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "recommended_review"}, {"transformationsData": [{"feature_label": "verifed_purchaser", "transformation_label": "String Indexer"}], "feature": "verifed_purchaser", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "259", "mean": "631.67", "stddev": "948.44", "min": " I have to manually switch the input from Auto to 1.4", "max": "yes: I recommend this product", "missing": "241", "distinct": "18"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "verifed_purchaser"}, {"transformationsData": [{"feature_label": 
"helpful_count", "transformation_label": "String Indexer"}], "feature": "helpful_count", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "424", "mean": "18.48", "stddev": "174.23", "min": " 2", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=260", "missing": "76", "distinct": "33"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "helpful_count"}, {"transformationsData": [{"feature_label": "not_helpful_count", "transformation_label": "String Indexer"}], "feature": "not_helpful_count", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "423", "mean": "7154157.59", "stddev": "36919434.23", "min": " 3", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=250", "missing": "77", "distinct": "46"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "not_helpful_count"}, {"transformationsData": [{"feature_label": "reviewed_at", "transformation_label": "String Indexer"}], "feature": "reviewed_at", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "422", "mean": "151353366.75", "stddev": "98647732.52", "min": " and Experimental", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=180", "missing": "78", "distinct": "55"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviewed_at"}, {"transformationsData": [{"feature_label": "images", "transformation_label": "String Indexer"}], "feature": 
"images", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "54", "mean": "152036096.0", "stddev": "131667363.86", "min": " and no larger maps to choose from. In the first version you had a large selection of maps", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11111433%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=0", "missing": "446", "distinct": "45"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "images"}, {"transformationsData": [{"feature_label": "rating", "transformation_label": "String Indexer"}], "feature": "rating", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "405", "mean": "312430.68", "stddev": "6177828.04", "min": " and experimental units are really not that great", "max": "c58d87e2-558d-5eaa-96b1-2264f00e7e67", "missing": "95", "distinct": "19"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "rating"}, {"transformationsData": [{"feature_label": "average_rating", "transformation_label": "String Indexer"}], "feature": "average_rating", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "398", "mean": "5.35", "stddev": "18.12", "min": " and all of them are the same size pretty much. I want to be clear one more time about the gameplay here: This is not a merely \"\"different\"\" approach. I'm pretty open-minded and I'll try just about anything. This is a GUTTED approach to the first version's gameplay. 
It takes many of the great features out", "max": "ef72459e-8f5c-57dc-af42-0127d2903498", "missing": "102", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "average_rating"}, {"transformationsData": [{"feature_label": "reviews_count", "transformation_label": "String Indexer"}], "feature": "reviews_count", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "396", "mean": "982.99", "stddev": "1007.7", "min": " and doesn't replace them - it does away with them altogether. Fewer units with no tiers or variety. Smaller selection of maps with almost no size difference in any. Worse graphics. Weak experimental units. Gameplay consists primarily of unit churning and zerg tactics. I gave this game the benefit of the doubt because the first one was so great. Please", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=210", "missing": "104", "distinct": "14"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviews_count"}, {"transformationsData": [{"feature_label": "reviews_link", "transformation_label": "String Indexer"}], "feature": "reviews_link", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "394", "mean": "75770394.0", "stddev": "131235787.95", "min": " don't make the same mistake I did. 
Do not buy this game.\"", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=260", "missing": "106", "distinct": "52"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "reviews_link"}, {"transformationsData": [{"feature_label": "comment_id", "transformation_label": "String Indexer"}], "feature": "comment_id", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "395", "mean": "166241965.38", "stddev": "55963828.42", "min": " but even hordes of units can crumble in no time it seems.  Regular units are just knock offs of the original ones and are only really good when you upgrade them via research.  However i do think the research is a cleaver aspect of the game and the brand new units/structures that were added are a nice touch", "max": "https://www.gamestop.com/on/demandware.store/Sites-gamestop-us-Site/default/Bazaarvoice-RatingAndReviews?Filter=Productid%3a11180293%7cIsRatingsOnly%3aeq%3afalse&Sort=submissiontime%3adesc&Offset=230", "missing": "105", "distinct": "395"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "comment_id"}, {"transformationsData": [{"feature_label": "uniq_id", "transformation_label": "String Indexer"}], "feature": "uniq_id", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "394", "mean": "117776414.8", "stddev": "113745915.72", "min": " but it does usually take awhile to get to them in the research tree even with the research facilities which u can build to increase research income.  completely  Also the variety of units is gone there use to be at least 2 sets of everything... 
a lower cost", "max": "ff3fad3f-62ee-54c5-bab0-6009af0cf671", "missing": "106", "distinct": "394"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "uniq_id"}, {"transformationsData": [{"feature_label": "scraped_at", "transformation_label": "String Indexer"}], "feature": "scraped_at", "type": "string", "selected": "True", "replaceby": "max", "stats": {"count": "394", "mean": "13.5", "stddev": "12.02", "min": " but on the other side it makes the game harder because now you can no longer make/build units/structures without first having the resources available", "max": "b5141b5e-d8d2-578f-a02e-1e47f25018ab", "missing": "106", "distinct": "17"}, "transformation": [{"transformation": "String Indexer", "selectedAsDefault": 1}], "updatedLabel": "scraped_at"}]}))

	#transformationPostExecutionHook(recommendedreviewtargetautofe)

except Exception as ex: 
	logging.error(ex)


***TRAIN MODEL***

In [None]:
#%run recommendedreviewtargetHooks.ipynb
try:
	#mlPreExecutionHook()

	# Train on every indexed feature column; the indexed
	# "recommended_review" column is the regression target.
	dataAutoML = functionRegression(
		recommendedreviewtargetautofe,
		[
			"index_stringindexer",
			"url_stringindexer",
			"name_stringindexer",
			"brand_stringindexer",
			"sku_stringindexer",
			"reviewer_name_stringindexer",
			"review_title_stringindexer",
			"review_description_stringindexer",
			"verifed_purchaser_stringindexer",
			"helpful_count_stringindexer",
			"not_helpful_count_stringindexer",
			"reviewed_at_stringindexer",
			"images_stringindexer",
			"rating_stringindexer",
			"average_rating_stringindexer",
			"reviews_count_stringindexer",
			"reviews_link_stringindexer",
			"comment_id_stringindexer",
			"uniq_id_stringindexer",
			"scraped_at_stringindexer",
		],
		"recommended_review_stringindexer",
	)

	#mlPostExecutionHook(dataAutoML)

except Exception as ex:
	# Best-effort cell: the failure is reported, not re-raised. Log the full
	# traceback (not just the message) so a training failure can be diagnosed
	# even though the prediction cell will then find `dataAutoML` undefined.
	logging.exception(ex)
#spark.stop()


***PREDICT ON TRAINED MODEL***

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics

# Score the trained model on the held-out split and show a preview table with
# the actual and predicted label in the first two columns.
try:
    model = dataAutoML['model']
    X_test = dataAutoML['X_test']
    y_test = dataAutoML['y_test']
    label = dataAutoML['label']
    # Work on a copy: the previous in-place remove()/insert() calls modified
    # the list stored in dataAutoML, so running this cell a second time
    # failed with a column-count mismatch when building the DataFrame.
    columnNames = [name for name in dataAutoML['columnNames'] if name != label]
    predicted = label + "_predicted"
    y_predicted = model.predict(X_test)

    df = pd.DataFrame(X_test, columns=columnNames)
    df[label] = y_test
    df[predicted] = y_predicted
    # Final column order: label, prediction, then the features.
    columnNames = [label, predicted] + columnNames
    df = df[columnNames]

    # All metrics are rounded to one decimal place for display.
    R2 = np.round(sklearn.metrics.r2_score(y_test, y_predicted), 1)
    Mean_Squared_Error = np.round(sklearn.metrics.mean_squared_error(y_test, y_predicted), 1)
    Mean_Absolute_Error = np.round(sklearn.metrics.mean_absolute_error(y_test, y_predicted), 1)
    display(" R2 score of Prediction on test data    : %s"%R2)
    display(" Mean Squared Error of Prediction on test data    : %s"%Mean_Squared_Error)
    display(" Mean Absolute Error of Prediction on test data   : %s"%Mean_Absolute_Error)
    display(df.head())
except Exception as ex:
    # Best-effort cell: report the failure with its traceback and still fall
    # through to spark.stop() below.
    logging.exception(ex)

spark.stop()

