In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Create a SparkSession
spark = SparkSession.builder \
    .appName("CC_Fraud") \
    .config("spark.executor.instances", "2") \
    .getOrCreate()


In [2]:
# Load your dataset into a Spark DataFrame
df = spark.read.csv("./data/clean_train.csv", header=True, inferSchema=True)

# Define your features and target column
feature_columns = [col for col in df.columns if col != "is_fraud"]
label_column = "is_fraud"

In [3]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in ["merchant", "category", "gender", "job"]]

indexed_data = df
for indexer in indexers:
    indexed_data = indexer.fit(indexed_data).transform(indexed_data)

indexed_data = indexed_data.drop("merchant", "category", "gender", "job")
indexed_data = indexed_data.withColumnRenamed("merchant_index", "merchant").withColumnRenamed("category_index", "category").withColumnRenamed("gender_index", "gender").withColumnRenamed("job_index", "job")

In [4]:

assembler = VectorAssembler(inputCols=feature_columns + [col+"_index" for col in ["merchant", "category", "gender", "job"]], outputCol="features")
classifier = RandomForestClassifier(featuresCol="features", labelCol=label_column, numTrees=10, maxBins=700)

# Construct the pipeline
pipeline = Pipeline(stages=indexers + [assembler, classifier])

In [5]:
# Split data into training and test sets
train_data, test_data = indexed_data.randomSplit([0.8, 0.2], seed=42)

In [6]:
# Train the pipeline
model = pipeline.fit(train_data)

In [7]:
model.transform(train_data)

DataFrame[_c0: int, amt: double, lat: double, long: double, city_pop: int, unix_time: int, merch_lat: double, merch_long: double, is_fraud: int, merchant: double, category: double, gender: double, job: double, merchant_index: double, category_index: double, gender_index: double, job_index: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [8]:
# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol=label_column, predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9949855137062625


In [10]:
model.save("./model/")

In [None]:
# Stop SparkSession
# spark.stop()

In [11]:
# import pickle
# with open('./modelp.pkl', "wb") as f:
#     pickle.dump(model, f)


In [None]:
model

PipelineModel_4da14774fdc5