In [None]:
from math import exp, log

from pyspark import RDD
from pyspark import SparkConf
from pyspark import SparkContext

In [None]:
#Reuse the SparkContext that is already running when this cell is executed again:
#calling the SparkContext constructor a second time in the same session raises
#ValueError ("Cannot run multiple SparkContexts at once"), whereas getOrCreate
#returns the active context or builds a new one from the given configuration.
sc = SparkContext.getOrCreate(SparkConf().setAppName('creditcard_lowlevel'))

In [None]:
#Load the CSV file as an RDD holding one text line per element
raw_rdd = sc.textFile('creditcard.csv')

#The first line of the file is the CSV header row
header = raw_rdd.first()

#Keep only the lines that differ from the header
rdd_no_header = raw_rdd.filter(lambda line: line != header)

def vectorize(row: str, num_classes: int = 2) -> tuple[list[float], list[int]]:
    """Parse one CSV data line into a ``(features, one_hot_label)`` pair.

    The last comma-separated field is the class label; every other field is a
    feature. A constant ``1.0`` is prepended to the features to act as the bias
    input.

    The pair is ordered ``(features, label)`` because softmax_regression reads
    ``row[0]`` as the feature vector and ``row[1]`` as the one-hot label.

    Args:
        row: One data line of the CSV file (not the header).
        num_classes: Length of the one-hot label vector. A label outside
            ``range(num_classes)`` produces an all-zero vector.

    Returns:
        ``([1.0, feature_1, ..., feature_n], one_hot_label)``.

    Raises:
        ValueError: If a field cannot be parsed as a float.
    """
    #float() tolerates surrounding whitespace but not quotes, and some CSV
    #exports wrap fields (typically the label column) in double quotes
    values = [float(item.strip().strip('"')) for item in row.split(',')]

    #One-hot encoding for the label stored in the last column
    label = values[-1]
    one_hot = [1 if index == label else 0 for index in range(num_classes)]

    #Prepend the bias input, then pack features first and encoded label second
    return ([1.0] + values[:-1], one_hot)

#Apply vectorize to every data line (the header has already been filtered out)
rdd = rdd_no_header.map(vectorize)

In [None]:
def compute_softmax(x: tuple[float], weights, return_prob = True):
    """Compute the dot product of input values with each row of the weights matrix, then apply softmax function to get the probability of each class
    . Return a vector of probabilities or a single converted class."""

    #Compute the dot product of x with each row of weights
    x_mul_w = [ sum(w * attr_value for w, attr_value in zip(w_row, x)) for w_row in weights]

    #Apply e^(value) on each dot product
    exp_ret = [exp(v) for v in x_mul_w]

    #Sum the e^(dot_product) to get the denominator
    sum_exp = sum(exp_ret)

    #Divide each e^(dot_product) with sum to get the probability of class
    ret = [v / sum_exp for v in exp_ret]

    #Return raw vector of probabilities
    if return_prob:
        return ret
    
    #Or get the class with highest probability
    posit_index = max(ret)
    return [1 if value == posit_index else 0 for value in ret]



def compute_gradient(x: tuple[float], predicted: tuple[float], true_label: tuple[float]):
    """Return the gradient of the weights matrix for a single data point.

    The result has one row per (predicted, true_label) pair; each row is the
    feature vector ``x`` scaled by ``predicted - true_label`` for that class.
    """
    gradient_rows = []
    for class_prob, class_target in zip(predicted, true_label):
        #Gap between the predicted probability of this class and its true value
        residual = class_prob - class_target

        #Scale every feature by that gap to get this class's gradient row
        gradient_rows.append([residual * feature for feature in x])

    return gradient_rows



def compute_crossEntropy(predicted: list[float], true_label: list[float], epsilon: float = 1e-15) -> float:
    """Compute the Cross-Entropy loss at a specific data point.

    Args:
        predicted: Predicted probability of each class.
        true_label: True probability of each class (one-hot in this file).
        epsilon: Lower bound applied to a probability before taking its
            logarithm, so a probability of exactly 0.0 gives a large finite
            loss instead of a math domain error.

    Returns:
        ``-sum(true_i * ln(predicted_i))`` over the classes.
    """

    #Classes whose true probability is 0 contribute nothing to the sum, so
    #they are skipped and log() is never evaluated on them. For the remaining
    #classes the probability is clamped to epsilon, because softmax can
    #underflow to exactly 0.0 and log(0.0) raises ValueError.
    return -sum(
        true_v * log(max(pred_v, epsilon))
        for pred_v, true_v in zip(predicted, true_label)
        if true_v
    )



def softmax_regression(data: RDD, learning_rate = 1.0, max_epoch = 10, init_weights: list | None = None):
    """Run softmax regression on a RDD-based dataset with batch gradient descent.

    Args:
        data: RDD whose rows are ``(features, one_hot_label)`` pairs; ``row[0]``
            is used as the feature vector and ``row[1]`` as the true label.
        learning_rate: Step size applied to the averaged gradient each epoch.
        max_epoch: Number of full passes over ``data``.
        init_weights: Optional starting weights matrix (one row per class, one
            column per feature). It is copied, never modified. When omitted or
            empty, training starts from an all-zero matrix sized from the first
            row of ``data``.

    Returns:
        ``(weights, cross_entropy_log)``: the final weights matrix and the list
        of average Cross-Entropy losses, one entry per epoch.

    Raises:
        ValueError: If ``data`` contains no rows.
    """

    #Number of data points; needed to average the gradient and the loss
    data_size = data.count()
    if data_size == 0:
        raise ValueError("softmax_regression requires a non-empty dataset")

    #Replicate the weights matrix if given, or else generate a 0 weights matrix
    if init_weights:
        #Copy each row so the caller's matrix is never shared with the result
        running_weights = [list(row) for row in init_weights]
    else:
        first_row = data.first()
        #One row per class, one column per feature. Each row is built separately:
        #"[[0.0] * n] * m" would repeat the same row object m times.
        running_weights = [[0.0] * len(first_row[0]) for _ in first_row[1]]

    #Log for Cross-Entropy loss
    cross_entropy_log = []

    def gradient_crossEntropy(row: tuple[list[float], list[int]], weights: list[list[float]]):
        """Compute the gradient of the weights matrix and the Cross-Entropy loss at one data point."""

        #Make a prediction
        predicted_prob = compute_softmax(row[0], weights)

        #Compute the gradient of weights matrix
        gradient_w = compute_gradient(row[0], predicted_prob, row[1])

        #Compute the Cross-Entropy loss
        loss = compute_crossEntropy(predicted_prob, row[1])

        return gradient_w, loss

    def matrix_add(matA: list[list[float]], matB: list[list[float]]):
        """Add two matrices element-wise."""
        return [[valueA + valueB for valueA, valueB in zip(rowA, rowB)] for rowA, rowB in zip(matA, matB)]

    #Main loop for training
    for _ in range(max_epoch):
        #Per data point: predict the probabilities, then compute the gradient of
        #the weights matrix and the Cross-Entropy loss. The reduce below runs
        #within this iteration, so the lambda sees this epoch's weights.
        rdd_computed = data.map(lambda row: gradient_crossEntropy(row, running_weights))

        #Use rdd.reduce to compute the sum of all gradient matrices and the sum of all Cross-Entropy values
        gradient_sum, loss_sum = rdd_computed.reduce(
            lambda row1, row2: (
                matrix_add(row1[0], row2[0]),  #New sum of gradient matrix
                row1[1] + row2[1],  #New sum of Cross-Entropy
            )
        )

        #The final gradient of weights matrix is the average of all gradient matrices
        gradient_w = [[value / data_size for value in row] for row in gradient_sum]

        #The final loss value is the average of all Cross-Entropy values
        loss = loss_sum / data_size
        cross_entropy_log.append(loss)

        #Update the weights matrix for next epoch (builds a new matrix each time)
        running_weights = [
            [old_w - learning_rate * grad_w for old_w, grad_w in zip(row, row_grad)]
            for row, row_grad in zip(running_weights, gradient_w)
        ]

        #Log the weights matrix to console
        print(running_weights)

    return running_weights, cross_entropy_log
