In [1]:
from pyspark import SparkContext, SparkConf, RDD
from pyspark.statcounter import StatCounter
from datetime import datetime

In [2]:
config = SparkConf()\
            .set("spark.driver.memory", "4g")\
            .set("spark.executor.memory", "8g")

In [3]:
sc = SparkContext(appName='taxi_duration_lowlevel', conf=config)

your 131072x1 screen size is bogus. expect trouble
25/04/03 00:34:28 WARN Utils: Your hostname, DESKTOP-0H87CFM resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/03 00:34:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 00:34:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
#Convert data into rdd
raw_rdd = sc.textFile('train.csv')

#Remove the csv header row
header = raw_rdd.first()

#Strip the header row
rdd_no_header = raw_rdd.filter(lambda row: row != header)

                                                                                

In [5]:
def vectorize(row: str):
    """Vectorize the features and labels."""

    #Split the string for elements, excluding the first feature (`id`)
    values = [item for item in row.split(',')]

    #Split the timestamps features into time elements
    pickup_time = datetime.strptime(values[2], '%Y-%m-%d %H:%M:%S').strftime('%Y:%m:%d:%H:%M:%S').split(':')
    dropoff_time = datetime.strptime(values[3], '%Y-%m-%d %H:%M:%S').strftime('%Y:%m:%d:%H:%M:%S').split(':')
    
    #Encode the `store_and_fwd_flag` into binary values
    values[9] = 1 if values[9] == 'Y' else 0

    #Cast strings into number
    raw_vector = [float(value) for value in [values[1]] + pickup_time + dropoff_time + values[4:]]
    
    #Build and return the LabeledPoint-like row
    return {'label': raw_vector[-1], 'features' : tuple(raw_vector[:-1])}

rdd = rdd_no_header.map(vectorize).repartition(8).cache()

In [6]:
def partition_feature_grouping(partition, usable_features: list):
    feature_stats = {}

    for row in partition:
        for feature in usable_features:
            # Store feature-wise values in a dictionary
            if feature not in feature_stats:
                feature_stats[feature] = []

            feature_stats[feature].append((row['features'][feature], row['label']))

    results = []
    
    for feature, values in feature_stats.items():
        results.append((feature, values))

    return iter(results)


def find_feature_best_split(values: list[list[float ] ], parent_info: StatCounter):
            sorted_values = sorted(values)

            parent_count = parent_info.count()
            parent_pow_sum = parent_info.variance() + parent_info.mean() ** 2
            
            best_split = (-float("inf"), None) 


            left_sum, left_pow_sum, left_count = 0, 0, 0
            
            for i in range(0, parent_count - 1):
                feature_value, label = sorted_values[i]

                left_sum += label
                left_pow_sum += label ** 2
                left_count += 1

                if feature_value == sorted_values[i+1][0]:
                    continue

                right_count = parent_count - left_count

                right_sum = parent_info.sum() - left_sum
                right_pow_sum = parent_pow_sum - left_pow_sum

                
                left_var = left_pow_sum / left_count - (left_sum / left_count)**2 if left_count > 0 else 0
                right_var = right_pow_sum / right_count - (right_sum / right_count)**2 if right_count > 0 else 0

                var_reduction = parent_info.variance() - left_var * (left_count / parent_count) - right_var * (right_count / parent_count)     

                
                if var_reduction > best_split[0]:
                    best_split = ( var_reduction, (feature_value, sorted_values[i+1][0]) )

            return best_split


def splitter(iterator, split_feature: int, split_point: float, operand):
    ret = []
    for row in iterator:
        if operand(row['features'][split_feature], split_point):
            ret.append(row)

    return iter(ret)

In [9]:
class DecisionTreeRegressor:
    def __init__(self, max_depth = 1):
        self.max_depth = max_depth
        self.rules = None
        pass

    def set_maxDepth(self, depth):
        self.max_depth = depth

    def fit(self, train_rdd: RDD):
        self.num_features = len(train_rdd.first()['features'])
        self.usable_features = [i for i in range(self.num_features)]
        sample_size = train_rdd.count()
        
        return self.__build_rule_tree_recursive(train_rdd, self.usable_features, sample_size)

    def transform(self, rdd: RDD):
        
        pass

    def display_rule_tree(self, model: dict):
        self.__display_tree_recusive(model)





    def __display_tree_recusive(self, rules: dict, indent = 0):
        if not rules:
            return

        print(f"{indent * ' '}If  features[{rules['split_feature']}] <= {rules['split_point']}")

        if len(rules['left']) == 1:
            print(f"{indent  * ' ' + '   '}Predict: {rules['left']['prediction']}")
        else:
            self.__display_tree_recusive(rules['left'], indent + 5)
        
        if rules['right']:
            print(f"{indent * ' '}Else  features[{rules['split_feature']}] > {rules['split_point']}")

            if len(rules['right']) == 1:
                print(f"{indent * ' ' + '   '}Predict: {rules['right']['prediction']}")

            else:
                self.__display_tree_recusive(rules['right'], indent + 5)




    def __find_best_split(self, rdd: RDD, usable_features: list):         
        
        parent_info = rdd.mapPartitions(lambda partition: [StatCounter([row['label'] for row in partition])], True)\
                        .reduce(lambda stat1, stat2: stat1.mergeStats(stat2))
        

        processed_rdd = rdd.mapPartitions(lambda partition: partition_feature_grouping(partition, usable_features), True)\
                      .reduceByKey(lambda l1, l2: l1 + l2).cache()
        

        best_split_per_feature = processed_rdd.mapValues(lambda values: find_feature_best_split(values, parent_info))\
                                  

        best_split = max(best_split_per_feature.collect(),key=lambda x: (x[1][0], -x[0]))


        return best_split[0], best_split[1][1]


    def __build_rule_tree_recursive(self, parent: RDD, usable_features: list, sample_size, depth = 0):
        if sample_size == 0:
            return None
        
        if depth == self.max_depth:
            mean = parent.mapPartitions(lambda partition: [StatCounter([row['label'] for row in partition])], True)\
                        .reduce(lambda stat1, stat2: stat1.mergeStats(stat2)).mean()

            return {'prediction' : mean}

        split_feature, split_bound = self.__find_best_split(parent, usable_features)
        

        

        if split_feature == None or split_bound == None:
            mean = parent.mapPartitions(lambda partition: [StatCounter([row['label'] for row in partition])], True)\
                        .reduce(lambda stat1, stat2: stat1.mergeStats(stat2)).mean()
            
            return {'prediction' : mean}

        mid_point = (split_bound[0] + split_bound[1]) / 2

        left_rdd = parent.mapPartitions(lambda iterator: splitter(iterator, split_feature, mid_point, lambda a,b: a <= b), True).cache()
        right_rdd = parent.mapPartitions(lambda iterator: splitter(iterator, split_feature, mid_point, lambda a,b: a >= b), True).cache()
        
        if depth > 0:
            parent.unpersist()



        left_sample_size = left_rdd.count()
     
        return {
            'split_feature' : split_feature,
            'split_point' : mid_point,
            'left' : self.__build_rule_tree_recursive(left_rdd, usable_features, left_sample_size, depth + 1),
            'right' : self.__build_rule_tree_recursive(right_rdd, usable_features, sample_size - left_sample_size, depth + 1)
        }

In [10]:
estimator = DecisionTreeRegressor(5)
model = estimator.fit(rdd)
estimator.display_rule_tree(model)

25/04/03 00:37:36 WARN BlockManager: Task 584 already completed, not releasing lock for rdd_7_0
                                                                                

If  features[14] <= -73.88602066040039
     If  features[16] <= -73.91103744506836
          If  features[17] <= 40.70149040222168
               If  features[15] <= 40.725751876831055
                    If  features[17] <= 40.701486587524414
                       Predict: 1019.1713947990552
                    Else  features[17] > 40.701486587524414
                       Predict: 28934.666666666668
               Else  features[15] > 40.725751876831055
                    If  features[15] <= 40.72575569152832
                       Predict: 22868.0
                    Else  features[15] > 40.72575569152832
                       Predict: 1901.497137558458
          Else  features[17] > 40.70149040222168
               If  features[0] <= 1.5
                    If  features[14] <= -73.92168045043945
                       Predict: 712.2167951813683
                    Else  features[14] > -73.92168045043945
                       Predict: 2017.9384793964011
               Else  feat

                                                                                