In [1]:
import math
import csv
from io import StringIO
from pyspark import SparkConf, SparkContext

In [2]:
#Set the config for spark to enhance performance
config = SparkConf()\
            .set("spark.driver.memory", "4g")\
            .set("spark.executor.memory", "4g")

In [3]:
sc = SparkContext(appName='creditcard_lowlevel', conf=config)

your 131072x1 screen size is bogus. expect trouble
25/04/12 22:09:55 WARN Utils: Your hostname, minhnhat resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/04/12 22:09:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/12 22:09:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Parse input file

In [4]:
def parse_line(line):
    return next(csv.reader(StringIO(line)))

raw_rdd = sc.textFile("creditcard.csv")
header = raw_rdd.first() 
data_rdd = raw_rdd.filter(lambda line: line != header) \
                  .map(parse_line) \
                  .map(lambda row: [float(x) for x in row]) \
                  .map(lambda row: (row[:-1], row[-1])) 

                                                                                

In [5]:
#Split data into training and testing sets
train_size = 0.8
train_rdd, test_rdd = data_rdd.randomSplit([train_size, 1 - train_size], seed=24)

## Define function for Logistic Regression

In [6]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, num_iterations=100):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None

    def _add_bias(self, features):
        return features + [1.0]

    def dot_product(self, features, weights):
        return sum(f * w for f, w in zip(features, weights))

    def sigmoid(self, z):
        try:
            return 1.0 / (1.0 + math.exp(-z))
        except OverflowError:
            return 0.0 if z < 0 else 1.0

    def compute_gradient(self, features, label, weights):
        features = self._add_bias(features)
        prediction = self.sigmoid(self.dot_product(features, weights))
        error = prediction - label
        return [error * x for x in features]

    def predict(self, features):
        features = self._add_bias(features)
        prob = self.sigmoid(self.dot_product(features, self.weights))
        return 1.0 if prob >= 0.5 else 0.0

    def fit(self, data_rdd):
        first_record = self._add_bias(data_rdd.first()[0])
        n_features = len(first_record)
        self.weights = [0.0] * n_features

        for _ in range(self.num_iterations):
            gradient = data_rdd.map(
                lambda x: self.compute_gradient(x[0], x[1], self.weights)
            ).reduce(
                lambda a, b: [a_i + b_i for a_i, b_i in zip(a, b)]
            )
            self.weights = [w - self.learning_rate * g for w, g in zip(self.weights, gradient)]

    @property
    def coefficients(self):
        if self.weights:
            return self.weights[:-1]
        return None

    @property
    def intercept(self):
        if self.weights:
            return self.weights[-1]
        return None

## Fit the model

In [None]:
learning_rate = 0.01 
num_iterations = 20

log_reg = LogisticRegression(learning_rate=learning_rate, num_iterations=num_iterations)
log_reg.fit(train_rdd)

                                                                                

In [8]:
# Display model coefficients and intercept
print("Coefficients: ", log_reg.coefficients)
print("Intercept: ", log_reg.intercept)

Coefficients:  [-25393714.614999983, -90.26964798832012, 64.33602612889246, -137.09278453798973, 85.48357804966686, -56.39250697888101, -31.311152800252938, -112.91279753740908, 19.232147295830192, -46.88656278853463, -110.66616849556625, 72.49988894211533, -122.3967840808258, -2.1339775209086533, -129.85055139574962, -2.7888253881343488, -81.72940989200954, -131.522031517476, -43.243275001845355, 13.586343460879762, 3.3420234031935663, 14.916243632360867, 0.9985570692958942, -2.3292528479761527, -1.6482545033412652, 0.534583406466673, 0.38999401263095357, 1.2195822228369448, 0.898262787700131, -23834.346049999695]
Intercept:  -266.64999999999986


## Evaluate the model

In [9]:
def compute_accuracy(predictions_rdd):
    correct = predictions_rdd.filter(lambda x: x[0] == x[1]).count()
    total = predictions_rdd.count()
    return correct / total if total > 0 else 0.0

def compute_precision(predictions_rdd, label):
    tp = predictions_rdd.filter(lambda x: x[0] == label and x[1] == label).count()
    pred_pos = predictions_rdd.filter(lambda x: x[0] == label).count()
    return tp / pred_pos if pred_pos > 0 else 0.0

def compute_recall(predictions_rdd, label):
    tp = predictions_rdd.filter(lambda x: x[0] == label and x[1] == label).count()
    actual_pos = predictions_rdd.filter(lambda x: x[1] == label).count()
    return tp / actual_pos if actual_pos > 0 else 0.0

def compute_auc(predictions_rdd):
    n_pos = predictions_rdd.filter(lambda x: x[1] == 1).count()
    n_neg = predictions_rdd.filter(lambda x: x[1] == 0).count()
    if n_pos == 0 or n_neg == 0:
        return 0.0
    data = predictions_rdd.collect()
    data.sort(key=lambda x: x[0])
    rank_sum = 0.0
    rank = 1
    i = 0
    while i < len(data):
        j = i
        while j < len(data) and data[j][0] == data[i][0]:
            j += 1
        group_size = j - i
        avg_rank = (2 * rank + group_size - 1) / 2.0
        for k in range(i, j):
            if data[k][1] == 1:
                rank_sum += avg_rank
        rank += group_size
        i = j
    return (rank_sum - (n_pos * (n_pos + 1) / 2.0)) / (n_pos * n_neg)

In [10]:
#Evaluate the model on training data
predictions_train = train_rdd.map(lambda p: (log_reg.predict(p[0]), p[1]))

print("Train Accuracy:", compute_accuracy(predictions_train))
print("Train Precision: ", [compute_precision(predictions_train, label) for label in [0, 1]])
print("Train Recall: ", [compute_recall(predictions_train, label) for label in [0, 1]])
print("Train AUC:", compute_auc(predictions_train))

                                                                                

Train Accuracy: 0.9982481328784626


                                                                                

Train Precision:  [0.9982481328784626, 0.0]


                                                                                

Train Recall:  [1.0, 0.0]




Train AUC: 0.5


                                                                                

In [11]:
#Evaluate the model on test data
predictions_test = test_rdd.map(lambda p: (log_reg.predict(p[0]), p[1]))

print("Test Accuracy:", compute_accuracy(predictions_test))
print("Test Precision: ", [compute_precision(predictions_test, label) for label in [0, 1]])
print("Test Recall: ", [compute_recall(predictions_test, label) for label in [0, 1]])
print("Train AUC:", compute_auc(predictions_test))

                                                                                

Test Accuracy: 0.9983698510078878


                                                                                

Test Precision:  [0.9983698510078878, 0.0]


                                                                                

Test Recall:  [1.0, 0.0]




Train AUC: 0.5


                                                                                

In [12]:
#Stop the Spark session
sc.stop()