In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Create a SparkSession
spark = SparkSession.builder \
    .appName("CC_Fraud") \
    .config("spark.executor.instances", "2") \
    .getOrCreate()


In [2]:
# Load your dataset into a Spark DataFrame
df = spark.read.csv("./data/clean_train.csv", header=True, inferSchema=True)

# Define your features and target column
feature_columns = [col for col in df.columns if col != "is_fraud"]
label_column = "is_fraud"

In [3]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in ["merchant", "category", "gender", "job"]]

indexed_data = df
for indexer in indexers:
    indexed_data = indexer.fit(indexed_data).transform(indexed_data)

indexed_data = indexed_data.drop("merchant", "category", "gender", "job")
indexed_data = indexed_data.withColumnRenamed("merchant_index", "merchant").withColumnRenamed("category_index", "category").withColumnRenamed("gender_index", "gender").withColumnRenamed("job_index", "job")

In [28]:
# indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in ["merchant", "category", "gender", "job"]]
cols = ["merchant", "category", "gender", "job"]
for i in range(len(indexers)):
    indexers[i].save('C:/Users/saritajoshi/Documents/Repos/Credit-Card-Fraud-Detection-Spark/indexers/'+cols[i])
    # loadedIndexer = StringIndexer.load(stringIndexerPath)

In [4]:

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
classifier = RandomForestClassifier(featuresCol="features", labelCol=label_column, numTrees=10, maxBins=700)

# Construct the pipeline
pipeline = Pipeline(stages=indexers + [assembler, classifier])

In [5]:
# Split data into training and test sets
train_data, test_data = indexed_data.randomSplit([0.8, 0.2], seed=42)

In [29]:
indexed_data.head()

Row(_c0=0, amt=166.8, lat=39.3426, long=-114.8859, city_pop=450, unix_time=1371108903, merch_lat=40.088507, merch_long=-113.895268, is_fraud=0, merchant=69.0, category=1.0, gender=1.0, job=316.0)

In [6]:
# Train the pipeline
model = pipeline.fit(train_data)

In [7]:
# Make predictions on test data
predictions = model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol=label_column, predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9951611051455045


In [8]:
model.save("/model1")

In [9]:
# Stop SparkSession
# spark.stop()

In [10]:
# import pickle
# with open('./modelp.pkl', "wb") as f:
#     pickle.dump(model, f)


In [12]:
# from pyspark.ml import Pipeline, Pipeline

In [13]:
# path="./model/"
# model.write().overwrite().save( path)
# newmodel= PipelineModel.load(path)
	

StringIndexer_7b7f9d07b41a__inputCols
StringIndexer_80b4a0b467a7__inputCols
StringIndexer_6420fbb5dcea__inputCols
StringIndexer_27d615d0c5d4__inputCols
