# Decision Tree Regressor with Pyspark low-level API

### Importing necessary packages

In [1]:
from pyspark import SparkContext, SparkConf, RDD
from pyspark.statcounter import StatCounter
from datetime import datetime
import math

## 1. Preparation

(Optional) Set the memory usage configurations for Pyspark session:

In [2]:
# Memory settings handed to Spark when the context is created
config = SparkConf().setAll([
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
])

Let's initialize a Spark session:

In [3]:
# Start the Spark context, or reuse the active one when this cell is re-run.
# Constructing SparkContext(...) directly raises ValueError if a context is
# already running, so the classmethod getOrCreate is used instead.
sc = SparkContext.getOrCreate(conf=config.setAppName('taxi_duration_lowlevel'))

your 131072x1 screen size is bogus. expect trouble
25/04/12 09:26:38 WARN Utils: Your hostname, HP-Envy resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/12 09:26:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/12 09:26:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Read input files:

In [4]:
def read_csv(filepath: str):
    """Load a CSV file as an RDD of raw text rows plus the list of column names.

    The first line of the file is taken as the header. Every line equal to the
    header is dropped from the returned RDD, and the header is split on `,`
    to produce the column names.
    """
    lines = sc.textFile(filepath)

    # The first line of the file holds the column names
    header_line = lines.first()
    column_names = header_line.split(',')

    def is_data_row(line):
        # Keep everything except lines identical to the header
        return line != header_line

    return lines.filter(is_data_row), column_names


# Input files are resolved relative to the notebook's working directory
TRAIN_CSV_PATH = 'train.csv'
TEST_CSV_PATH = 'test.csv'

raw_train_rdd, train_header = read_csv(TRAIN_CSV_PATH)
raw_test_rdd, test_header = read_csv(TEST_CSV_PATH)

                                                                                

## 2. Data preprocessing

Filter usable columns:

In [5]:
def extract_column(row: str, header: list[str], excluding_features: list[str]):
    """Turn one raw CSV line into a {column_name: string_value} dict.

    The line is split on `,` and paired positionally with `header`; columns
    whose name appears in `excluding_features` are left out of the result.
    """
    fields = row.split(',')

    kept = {}
    for position, column in enumerate(header):
        if column in excluding_features:
            continue
        kept[column] = fields[position]

    return kept

Preprocess features into usable form:

In [None]:
# Statistics for Z-score normalization
def compute_stats(rdd: RDD, feature_indices: list):
    """Compute the mean and population standard deviation of selected features.

    Parameters:
        rdd: RDD of dicts holding a numeric list under the key 'features'.
        feature_indices: positions inside row['features'] to summarise.

    Returns:
        (means, stddevs): two lists aligned with `feature_indices`. A feature
        whose variance is not positive gets a standard deviation of 1.0 so it
        can be used as a divisor. An empty RDD yields means of 0.0 and standard
        deviations of 1.0.

    The statistics are accumulated as a running mean and a running sum of
    squared deviations (M2) instead of sum / sum-of-squares. The latter loses
    all precision when a feature's magnitude is large compared with its spread,
    and leaves floating-point noise on constant features.
    """
    num_features = len(feature_indices)
    zero_value = (0, [0.0] * num_features, [0.0] * num_features)  # (count, means, M2s)

    def add_row(acc, values):
        """Fold one row's feature values into a partition accumulator."""
        count = acc[0] + 1
        new_means, new_m2s = [], []
        for mean, m2, value in zip(acc[1], acc[2], values):
            delta = value - mean
            mean += delta / count
            new_means.append(mean)
            # Uses the deviation from both the old and the updated mean
            new_m2s.append(m2 + delta * (value - mean))
        return (count, new_means, new_m2s)

    def merge(acc1, acc2):
        """Combine the accumulators of two partitions."""
        count1, count2 = acc1[0], acc2[0]
        if count1 == 0:
            return acc2
        if count2 == 0:
            return acc1
        total = count1 + count2
        new_means, new_m2s = [], []
        for mean1, m2_1, mean2, m2_2 in zip(acc1[1], acc1[2], acc2[1], acc2[2]):
            delta = mean2 - mean1
            new_means.append(mean1 + delta * count2 / total)
            new_m2s.append(m2_1 + m2_2 + delta * delta * count1 * count2 / total)
        return (total, new_means, new_m2s)

    count, means, m2s = rdd.map(lambda row: [row['features'][i] for i in feature_indices]) \
                           .aggregate(zero_value, add_row, merge)

    if count == 0:  # Handle empty RDD
        return [0.0] * num_features, [1.0] * num_features

    # Population variance; fall back to 1.0 for constant features
    variances = [m2 / count for m2 in m2s]
    stddevs = [math.sqrt(var) if var > 0 else 1.0 for var in variances]

    return list(means), stddevs

def preprocess_data(row: dict, label_col=None, means=None, stddevs=None):
    """Convert one raw record (column name -> string) into a model-ready sample.

    - Flatten `pickup_datetime` into separate elements (year, month, day, hour,
      minute, second).
    - Encode `store_and_fwd_flag` to a binary value ('Y' -> 1, anything else -> 0).
    - Cast strings to numeric values.
    - Add the Haversine distance (km) between pickup and dropoff.
    - Optionally Z-score the features when both `means` and `stddevs` are given.

    Parameters:
        row: dict of raw string values. It is NOT modified, so the function can
            safely be called more than once on the same record.
        label_col: name of the label column, or None when there is no label.
        means, stddevs: per-feature statistics aligned with the feature vector.

    Returns:
        dict with keys 'id', 'features' (14 floats), 'label' (log(1 + label), or
        None when `label_col` is not given) and 'distance_km' (never normalized).
    """
    dt = datetime.strptime(row['pickup_datetime'], '%Y-%m-%d %H:%M:%S')

    # Keep the encoded flag in a local: writing it back into `row` would make a
    # second call on the same dict see 1/0 instead of 'Y'/'N' and always yield 0.
    store_and_fwd_flag = 1.0 if row['store_and_fwd_flag'] == 'Y' else 0.0

    pickup_longitude = float(row['pickup_longitude'])
    pickup_latitude = float(row['pickup_latitude'])
    dropoff_longitude = float(row['dropoff_longitude'])
    dropoff_latitude = float(row['dropoff_latitude'])

    # Haversine great-circle distance
    R = 6371  # Earth radius in km
    lon1, lat1 = math.radians(pickup_longitude), math.radians(pickup_latitude)
    lon2, lat2 = math.radians(dropoff_longitude), math.radians(dropoff_latitude)
    dlon, dlat = lon2 - lon1, lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1], which would make sqrt fail
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance_km = R * c

    features = [
        float(row['vendor_id']),
        float(dt.year), float(dt.month), float(dt.day), float(dt.hour),
        float(dt.minute), float(dt.second),
        float(row['passenger_count']),
        pickup_longitude,
        pickup_latitude,
        dropoff_longitude,
        dropoff_latitude,
        store_and_fwd_flag,
        distance_km
    ]
    if means and stddevs:
        features = [(features[i] - means[i]) / stddevs[i] for i in range(len(features))]

    # The label is modelled on a log scale: log(1 + label)
    label = math.log1p(float(row[label_col])) if label_col else None

    return {'id': row.get('id'), 'features': features, 'label': label, 'distance_km': distance_km}

# Preprocess data with outlier filtering and Z-score standardization

# Outlier thresholds (both bounds are exclusive unless stated otherwise)
MAX_TRIP_DURATION_S = 36000               # keep 0 < trip_duration < 36000 seconds
MIN_PASSENGERS, MAX_PASSENGERS = 1, 6     # keep 1 <= passenger_count <= 6
MAX_DISTANCE_KM = 100                     # keep 0 < distance_km < 100
NUM_FEATURES = 14                         # length of the feature vector built by preprocess_data


def _has_plausible_trip(row: dict) -> bool:
    """Keep raw training rows with a sane trip duration and passenger count."""
    duration = float(row['trip_duration'])
    passengers = float(row['passenger_count'])
    return (0 < duration < MAX_TRIP_DURATION_S) and (MIN_PASSENGERS <= passengers <= MAX_PASSENGERS)


def _has_plausible_distance(sample: dict) -> bool:
    """Keep preprocessed samples whose Haversine distance is within range."""
    return 0 < sample['distance_km'] < MAX_DISTANCE_KM


def _standardize(sample: dict, means: list, stddevs: list) -> dict:
    """Return a copy of a preprocessed sample with Z-scored features."""
    scaled = [(value - mean) / std for value, mean, std in zip(sample['features'], means, stddevs)]
    return {**sample, 'features': scaled}


# Step 1: Extract columns for train_rdd
train_rdd_raw = raw_train_rdd.map(lambda row: extract_column(row, train_header, ['dropoff_datetime'])).cache()

# Step 2: Filter outliers, then preprocess each remaining row exactly once and
# filter on the computed distance. The result is cached because it feeds both
# the statistics below and the final training set.
train_rdd_filtered = train_rdd_raw.filter(_has_plausible_trip) \
                                  .map(lambda row: preprocess_data(row, 'trip_duration')) \
                                  .filter(_has_plausible_distance).cache()

# Step 3: Compute stats on filtered data
feature_indices = list(range(NUM_FEATURES))
means, stddevs = compute_stats(train_rdd_filtered, feature_indices)

# Step 4: Standardize the already-preprocessed samples instead of running
# preprocess_data a second time on the same raw rows
train_rdd = train_rdd_filtered.map(lambda sample: _standardize(sample, means, stddevs))

# For test_rdd, preprocess, filter and standardize in one pass using the
# training statistics.
# NOTE: test rows outside the distance range are dropped here and therefore
# receive no prediction.
test_rdd_raw = raw_test_rdd.map(lambda row: extract_column(row, test_header, [])).cache()
test_rdd = test_rdd_raw.map(lambda row: preprocess_data(row, None)) \
                       .filter(_has_plausible_distance) \
                       .map(lambda sample: _standardize(sample, means, stddevs))

                                                                                

## 3. Model training

Split data

In [7]:
# Split train/validation
# NOTE: this cell rebinds `train_rdd`; re-running it without re-running the
# preprocessing cell first would split the already-split training data again.
train_rdd, val_rdd = train_rdd.randomSplit([0.8, 0.2], seed=24)
train_rdd = train_rdd.cache()
val_rdd = val_rdd.cache()
test_rdd = test_rdd.cache()

# cache() is lazy: materialize the two splits now, while the upstream caches
# they are derived from still exist. Otherwise the first later action would
# rebuild them all the way from the text file.
train_rdd.count()
val_rdd.count()

# Free up memory (the trailing `;` suppresses the RDD repr in the cell output)
train_rdd_raw.unpersist()
train_rdd_filtered.unpersist()
test_rdd_raw.unpersist();

PythonRDD[9] at RDD at PythonRDD.scala:53

### Utility functions

`partition_feature_grouping` collects, within each partition, a list of (feature_value, row_label) pairs for every usable feature and emits one (feature_index, pairs) record per feature:

In [8]:
def partition_feature_grouping(partition, usable_features: list):
    """Group one partition's rows by feature.

    For every feature index in `usable_features`, collect the pairs
    (row['features'][index], row['label']) of all rows in the partition.
    Returns an iterator over (feature_index, list_of_pairs) tuples; a partition
    without rows yields nothing.
    """
    pairs_by_feature = {}

    for row in partition:
        for index in usable_features:
            # setdefault creates the list the first time a feature is seen
            pairs_by_feature.setdefault(index, []).append((row['features'][index], row['label']))

    return iter(list(pairs_by_feature.items()))

`find_split_feature` for finding splitting point with maximum variance reduction on the domain of a feature:

In [9]:
def find_split_feature(values: list, parent_info: StatCounter):
    """Find the threshold on one feature that maximises variance reduction.

    Parameters:
        values: list of (feature_value, label) pairs for the whole parent node.
        parent_info: statistics of the parent's labels; its count(), mean(),
            variance() and sum() are used.

    Returns:
        (variance_reduction, split_point). Rows with feature_value <= split_point
        belong to the left child. Returns (-inf, None) when the parent has fewer
        than two rows or no split yields a positive reduction.
    """
    no_split = (-float("inf"), None)

    # Get total number of data points; nothing to split below two rows
    parent_count = parent_info.count()
    if parent_count < 2:
        return no_split

    # Sort values by feature value (ascending order)
    sorted_values = sorted(values, key=lambda x: x[0])

    # Get variance, mean, and sum of the whole dataset
    parent_var = parent_info.variance()
    parent_mean = parent_info.mean()
    parent_sum = parent_info.sum()

    # Calculate parent's sum of squares: Σx_i^2 = n * (Var(X) + E[X]^2)
    parent_pow_sum = parent_count * (parent_var + parent_mean ** 2)

    # Running statistics of the left partition
    left_sum, left_pow_sum, left_count = 0, 0, 0
    best_split = no_split

    # Iterate through possible split points
    for i in range(parent_count - 1):
        val, label = sorted_values[i]
        next_val = sorted_values[i + 1][0]

        # Accumulate stats for left partition
        left_sum += label
        left_pow_sum += label ** 2
        left_count += 1

        # Skip splitting between identical feature values
        if val == next_val:
            continue

        # Compute right partition stats by subtracting left from parent
        right_count = parent_count - left_count
        right_sum = parent_sum - left_sum
        right_pow_sum = parent_pow_sum - left_pow_sum

        # Both sides hold at least one row here, so the divisions are safe
        left_var = left_pow_sum / left_count - (left_sum / left_count) ** 2
        right_var = right_pow_sum / right_count - (right_sum / right_count) ** 2

        # Calculate weighted variance of children
        weighted_child_var = (left_var * left_count + right_var * right_count) / parent_count

        # Compute variance reduction
        var_reduction = parent_var - weighted_child_var

        # Update best split if variance reduction is positive and greater than current best
        if var_reduction > 0 and var_reduction > best_split[0]:
            threshold = (val + next_val) / 2
            # The midpoint can round up to next_val (adjacent floats) or overflow;
            # it would then no longer separate the two groups under `<=`.
            # Fall back to the largest left-side value in that case.
            if not (val <= threshold < next_val):
                threshold = val
            best_split = (var_reduction, threshold)

    return best_split

Helper `splitter` selects the rows of the parent dataset whose feature value satisfies a given comparison operator against the splitting point:

In [10]:
def splitter(iterator, split_feature: int, split_point: float, operand):
    """Select the rows of a partition that fall on one side of a split.

    A row is kept when operand(row['features'][split_feature], split_point) is
    truthy. The partition is consumed completely and the selected rows are
    returned as an iterator.
    """
    selected = [
        row for row in iterator
        if operand(row['features'][split_feature], split_point)
    ]
    return iter(selected)

Main class `DecisionTreeRegressor` for building and executing Decision Tree Regressor Algorithm:

The default `max_depth=10` matches the `maxDepth=10` setting used in sections 3.2.1 and 3.2.2. Other parameters used there, such as `maxBins` and `minInstancesPerNode`, have no direct equivalent in this manual implementation.


In [None]:
class DecisionTreeRegressor:
    """Decision tree regressor built directly on RDDs.

    Each internal node is a dict with keys 'split_feature', 'split_point',
    'left' and 'right'; each leaf is a dict {'prediction': mean_label}.
    Rows with feature value <= split_point go left, all others go right.
    Labels are expected on the log(1 + y) scale: predictions are mapped back
    with exp(x) - 1 in `transform`, `evaluate` and `display_rule_tree`.
    """

    def __init__(self, max_depth = 10): # Synchronized with maxDepth=10 from sections 3.2.1 and 3.2.2
        #Initialize the estimator with given depth (if any)
        self.max_depth = max_depth
        self.rules = None
        self.num_features = None
        self.usable_features = None

    def set_maxDepth(self, depth):
        """Set current maximum depth for the estimator."""
        self.max_depth = depth

    def fit(self, train_rdd: RDD):
        """Execute Decision Tree Algorithm recursively on a given rdd based on variance reduction and the current maximum depth.

        Returns the estimator itself so calls can be chained.
        """
        self.num_features = len(train_rdd.first()['features'])
        self.usable_features = [i for i in range(self.num_features)]
        sample_size = train_rdd.count()
        print(f"Starting tree construction with {sample_size} samples, max_depth={self.max_depth}")
        self.rules = self.__build_rule_tree_recursive(train_rdd, self.usable_features, sample_size)
        print("Tree construction completed")
        return self

    @staticmethod
    def _traverse(rules: dict, features: list):
        """Walk the tree for one feature vector and return the leaf's (log-scale) prediction."""
        node = rules
        # Descend until reaching a leaf node (which contains a prediction)
        while 'prediction' not in node:
            if features[node['split_feature']] <= node['split_point']:
                node = node['left']
            else:
                node = node['right']
        return node['prediction']

    def __fitted_rules(self):
        """Return the learned tree, or raise ValueError if the model has no tree yet."""
        if self.rules is None:
            raise ValueError("DecisionTreeRegressor has no rules; call fit() first")
        return self.rules

    def transform(self, rdd: RDD):
        """Make predictions on an RDD using the decision tree rules.

        Returns an RDD of (row id, predicted value) with the prediction mapped
        back from the log scale.
        """
        # Bind what the workers need to locals so the closure does not capture `self`
        rules = self.__fitted_rules()
        traverse = DecisionTreeRegressor._traverse

        def predict_row(row):
            # Reverse the log(1 + y) transform applied to the labels
            return (row['id'], math.expm1(traverse(rules, row['features'])))

        return rdd.map(predict_row)

    def evaluate(self, rdd: RDD):
        """Evaluate the model on labelled rows and return (RMSE, R²) on the original label scale.

        Actual and predicted values are paired row by row, so no join on 'id'
        is needed (a join would shuffle the data and would multiply rows if
        ids were missing or duplicated).
        """
        rules = self.__fitted_rules()
        traverse = DecisionTreeRegressor._traverse

        def actual_and_predicted(row):
            return (math.expm1(row['label']), math.expm1(traverse(rules, row['features'])))

        pairs = rdd.map(actual_and_predicted)

        # Squared residuals give both the MSE (mean) and SS_res (sum)
        residual_stats = pairs.map(lambda pair: (pair[1] - pair[0]) ** 2).stats()
        rmse = math.sqrt(residual_stats.mean())
        ss_res = residual_stats.sum()

        # SS_tot = Σ(y - ȳ)² = n * population variance of the actual labels
        actual_stats = pairs.map(lambda pair: pair[0]).stats()
        actual_count = actual_stats.count()
        ss_tot = actual_stats.variance() * actual_count if actual_count else 0

        r2 = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0

        return rmse, r2

    def display_rule_tree(self):
        """Recursively display the rules of a Decision Tree model."""
        print("\nDecision Tree Rules:")
        self.__display_tree_recursive(self.rules)

    def __display_tree_recursive(self, rules: dict, indent = 0):
        """Print one subtree, indenting each level by four spaces."""
        #Stopping condition
        if not rules:
            return

        if 'prediction' in rules:  #Is a leaf condition
            prediction = math.expm1(rules['prediction'])  # Reverse from log
            print(f"{indent * ' '}Predict: {prediction:.2f}")
            return

        #Print the splitting point information and call recursion of the left and right child
        print(f"{indent * ' '}If feature[{rules['split_feature']}] <= {rules['split_point']:.2f}")
        self.__display_tree_recursive(rules['left'], indent + 4)
        print(f"{indent * ' '}Else feature[{rules['split_feature']}] > {rules['split_point']:.2f}")
        self.__display_tree_recursive(rules['right'], indent + 4)

    def __find_best_split(self, rdd: RDD, usable_features: list):
        """Find the best splitting point with maximum variance reduction of the input dataset.

        Returns (split_feature, split_point); split_point is None when no
        feature offers a split with positive variance reduction.
        """
        #Compute the label statistics of the dataset (a StatCounter)
        parent_info = rdd.map(lambda row: row['label']).stats()

        #Group (feature_value, label) pairs per feature, then search each feature for its best split.
        #The grouped RDD is consumed once, so it is not cached.
        best_split_per_feature = rdd.mapPartitions(lambda partition: partition_feature_grouping(partition, usable_features))\
                                    .reduceByKey(lambda l1, l2: l1 + l2)\
                                    .mapValues(lambda values: find_split_feature(values, parent_info))\
                                    .collect()

        if not best_split_per_feature:
            return None, None

        #Highest variance reduction wins; ties go to the lowest feature index
        best_split = max(best_split_per_feature, key=lambda x: (x[1][0], -x[0]))

        return best_split[0], best_split[1][1]

    def __make_leaf(self, rdd: RDD, depth):
        """Build a leaf predicting the mean label of `rdd` and release the node's cache."""
        mean = rdd.map(lambda row: row['label']).mean()

        #The subset is not needed any more (the caller's input RDD at depth 0 is left alone)
        if depth > 0:
            rdd.unpersist()

        return {'prediction' : mean}

    def __build_rule_tree_recursive(self, parent: RDD, usable_features: list, sample_size, depth = 0):
        """Recursively build the decision tree by finding the splitting point with maximum variance reduction and split the dataset with this point, up to the maximum depth."""
        print(f"Building tree at depth {depth}, sample size: {sample_size}")
        #Stopping condition
        if sample_size == 0:
            return None

        #Return the mean of the unsplit subset as the prediction if reached the depth limit
        if depth >= self.max_depth or sample_size < 2:
            return self.__make_leaf(parent, depth)

        #Find the best splitting point for the input dataset
        split_feature, split_point = self.__find_best_split(parent, usable_features)

        #Return the mean of the unsplit subset if no splitting point is valid
        if split_point is None:
            return self.__make_leaf(parent, depth)

        #Split the dataset into left (<=) and right (>) subsets. The right side must be
        #strict so that no row lands in both children and the split agrees with prediction.
        left_rdd = parent.mapPartitions(lambda iterator: splitter(iterator, split_feature, split_point, lambda a,b: a <= b), True).cache()
        right_rdd = parent.mapPartitions(lambda iterator: splitter(iterator, split_feature, split_point, lambda a,b: a > b), True).cache()

        #Compute the subset sizes while the parent is still cached
        left_sample_size = left_rdd.count()
        right_sample_size = sample_size - left_sample_size

        #A split that leaves one side empty would create a missing child; use a leaf instead
        if left_sample_size == 0 or right_sample_size == 0:
            left_rdd.unpersist()
            right_rdd.unpersist()
            return self.__make_leaf(parent, depth)

        #Un-cache the parent dataset (excluding the input one)
        if depth > 0:
            parent.unpersist()

        #Build the dict of information with recursion call
        return {
            'split_feature' : split_feature,
            'split_point' : split_point,
            'left' : self.__build_rule_tree_recursive(left_rdd, usable_features, left_sample_size, depth + 1),
            'right' : self.__build_rule_tree_recursive(right_rdd, usable_features, right_sample_size, depth + 1)
        }

In [12]:
# A tree with max_depth=10 could not be trained here, so a shallower depth is used
TREE_MAX_DEPTH = 5

estimator = DecisionTreeRegressor(max_depth=TREE_MAX_DEPTH)
model = estimator.fit(train_rdd)
estimator.display_rule_tree()

25/04/12 09:27:12 WARN BlockManager: Task 8 already completed, not releasing lock for rdd_10_0
                                                                                

Starting tree construction with 1161023 samples, max_depth=5
Building tree at depth 0, sample size: 1161023


                                                                                

Building tree at depth 1, sample size: 579030


                                                                                

Building tree at depth 2, sample size: 188790


                                                                                

Building tree at depth 3, sample size: 3008


                                                                                

Building tree at depth 4, sample size: 1671


                                                                                

Building tree at depth 5, sample size: 1050
Building tree at depth 5, sample size: 621


                                                                                

Building tree at depth 4, sample size: 1337


                                                                                

Building tree at depth 5, sample size: 1216
Building tree at depth 5, sample size: 121


                                                                                

Building tree at depth 3, sample size: 185782


                                                                                

Building tree at depth 4, sample size: 54252


                                                                                

Building tree at depth 5, sample size: 53836
Building tree at depth 5, sample size: 416


                                                                                

Building tree at depth 4, sample size: 131530


                                                                                

Building tree at depth 5, sample size: 98094
Building tree at depth 5, sample size: 33436


                                                                                

Building tree at depth 2, sample size: 390240


                                                                                

Building tree at depth 3, sample size: 185344


                                                                                

Building tree at depth 4, sample size: 147755


                                                                                

Building tree at depth 5, sample size: 22860
Building tree at depth 5, sample size: 124895


                                                                                

Building tree at depth 4, sample size: 37589


                                                                                

Building tree at depth 5, sample size: 19289
Building tree at depth 5, sample size: 18300


                                                                                

Building tree at depth 3, sample size: 204896


                                                                                

Building tree at depth 4, sample size: 31009


                                                                                

Building tree at depth 5, sample size: 23080
Building tree at depth 5, sample size: 7929


                                                                                

Building tree at depth 4, sample size: 173887


                                                                                

Building tree at depth 5, sample size: 135719
Building tree at depth 5, sample size: 38168


                                                                                

Building tree at depth 1, sample size: 581993


                                                                                

Building tree at depth 2, sample size: 385334


                                                                                

Building tree at depth 3, sample size: 213226


                                                                                

Building tree at depth 4, sample size: 36217


                                                                                

Building tree at depth 5, sample size: 19082
Building tree at depth 5, sample size: 17135


                                                                                

Building tree at depth 4, sample size: 177009


                                                                                

Building tree at depth 5, sample size: 138309
Building tree at depth 5, sample size: 38700


                                                                                

Building tree at depth 3, sample size: 172108


                                                                                

Building tree at depth 4, sample size: 34957


                                                                                

Building tree at depth 5, sample size: 19428
Building tree at depth 5, sample size: 15529


                                                                                

Building tree at depth 4, sample size: 137151


                                                                                

Building tree at depth 5, sample size: 92529
Building tree at depth 5, sample size: 44622


                                                                                

Building tree at depth 2, sample size: 196659


                                                                                

Building tree at depth 3, sample size: 148444


                                                                                

Building tree at depth 4, sample size: 91295


                                                                                

Building tree at depth 5, sample size: 24376
Building tree at depth 5, sample size: 66919


                                                                                

Building tree at depth 4, sample size: 57149


                                                                                

Building tree at depth 5, sample size: 12100
Building tree at depth 5, sample size: 45049


                                                                                

Building tree at depth 3, sample size: 48215


                                                                                

Building tree at depth 4, sample size: 21121


                                                                                

Building tree at depth 5, sample size: 3963
Building tree at depth 5, sample size: 17158


                                                                                

Building tree at depth 4, sample size: 27094


                                                                                

Building tree at depth 5, sample size: 2958
Building tree at depth 5, sample size: 24136




Tree construction completed

Decision Tree Rules:
If feature[13] <= -0.34
    If feature[13] <= -0.62
        If feature[13] <= -0.87
            If feature[13] <= -0.87
                If feature[13] <= -0.87
                    Predict: 17.67
                Else feature[13] > -0.87
                    Predict: 24.29
            Else feature[13] > -0.87
                If feature[8] <= 2.32
                    Predict: 65.94
                Else feature[8] > 2.32
                    Predict: 19.05
        Else feature[13] > -0.87
            If feature[13] <= -0.72
                If feature[8] <= 2.34
                    Predict: 203.02
                Else feature[8] > 2.34
                    Predict: 49.66
            Else feature[13] > -0.72
                If feature[11] <= 0.49
                    Predict: 318.73
                Else feature[11] > 0.49
                    Predict: 240.49
    Else feature[13] > -0.62
        If feature[13] <= -0.50
            If feature[11] <=

                                                                                

In [13]:
# Score the fitted tree on the held-out validation split
validation_metrics = model.evaluate(val_rdd)
val_rmse = validation_metrics[0]
val_r2 = validation_metrics[1]
print(f"Validation RMSE: {val_rmse}")
print(f"Validation R²: {val_r2}")

                                                                                

Validation RMSE: 431.81822822875625
Validation R²: 0.6010753112901581


                                                                                

## 4. Sample predictions for a few test cases (test file)

In [14]:
# Predict on the test set and show the first few (id, prediction) pairs
SAMPLE_COUNT = 5

predictions_rdd = estimator.transform(test_rdd)
print("Sample Predictions:")
predictions_samples = predictions_rdd.take(SAMPLE_COUNT)
for trip_id, predicted_value in predictions_samples:
    print(f"ID: {trip_id}, Prediction: {predicted_value}")

# Shut down the SparkContext now that the notebook is finished with it
sc.stop()

Sample Predictions:


25/04/12 09:35:04 WARN BlockManager: Task 999 already completed, not releasing lock for rdd_12_0
                                                                                

ID: id3004672, Prediction: 812.191225632504
ID: id3505355, Prediction: 812.191225632504
ID: id1217141, Prediction: 465.0235860725819
ID: id2150126, Prediction: 1363.012524057769
ID: id1598245, Prediction: 318.7302742288561
