In [20]:
#Import relevant libraries
import findspark
findspark.init()
from pyspark.sql import SparkSession #Import spark session
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier,GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
import matplotlib.pyplot as plt
%matplotlib inline


In [5]:
spark=SparkSession.builder.appName('fraudDetect').getOrCreate()

In [6]:
data=spark.read.csv('FraudData.csv',inferSchema=True,header=True)


In [7]:
rowSize = data.count()
colSize = len(data.columns)
print(rowSize,colSize)

6362620 11


In [8]:
#Get a glimpse of data schema
data.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



In [9]:
#data.describe().show()
data.na.drop().show()


+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|   amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+---------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT|  9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT|  1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|    181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|    181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT| 11668.14|C2048537720|      41554.0|      29885.86|M123070170

In [10]:
rowSize = data.count()
colSize = len(data.columns)
print(rowSize,colSize)

6362620 11


In [11]:
headers=data.columns
print (headers)

['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']


In [12]:
colList = [ item[0] for item in data.dtypes if item[1].startswith('string')]
dataInt=data.select([column for column in data.columns if column not in colList])
data_Features=dataInt.drop('isFraud')
data_Label=dataInt.select('isFraud')
assembler=VectorAssembler(inputCols=data_Features.columns,outputCol='features')
output=assembler.transform(dataInt)
finalData=output.select('features','isFraud')
#Split Data for Training and Testing
trainFraud,testFraud = finalData.randomSplit([0.7,0.3])

#Model Data with Logistic Regression
#model_Logistic=LogisticRegression(labelCol='isFraud')
#fit_Logistic=model_Logistic.fit(trainFraud)

###refactor Logistic Regression Part
lgtc=LogisticRegression(labelCol='isFraud')
lgtc_model=lgtc.fit(trainFraud)
lgtc_preds=lgtc_model.transform(testFraud)
fraud_eval=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='isFraud')
auc_lgtc=fraud_eval.evaluate(lgtc_preds)
auc_lgtc




0.7548944639373066

In [13]:
lgtc_summary=lgtc_model.summary
lgtc_summary.predictions.describe().show()


+-------+--------------------+--------------------+
|summary|             isFraud|          prediction|
+-------+--------------------+--------------------+
|  count|             4455007|             4455007|
|   mean|0.001297192125624045|7.353523799176971E-4|
| stddev|0.035993189758784364| 0.02710741230246246|
|    min|                 0.0|                 0.0|
|    max|                 1.0|                 1.0|
+-------+--------------------+--------------------+



In [24]:

fraud_eval.setMetricName("areaUnderPR")
auc_lgtc=fraud_eval.evaluate(lgtc_preds)
auc_lgtc
#fraud_eval.getMetricName()

#lgtc_pred_labels=lgtc_model.evaluate(testFraud)
#lgtc_pred_labels.predictions.show()


0.7058601345233559

In [25]:
#Model Data with Random Forest
rfc=RandomForestClassifier(labelCol='isFraud',featuresCol='features',numTrees=150)
rfc_model=rfc.fit(trainFraud)



In [26]:
rfc_preds=rfc_model.transform(testFraud)
auc2=fraud_eval.evaluate(rfc_preds)

In [27]:
auc2

0.6728552000326327

In [28]:
gbt=GBTClassifier(labelCol='isFraud',featuresCol='features')
gbt_model=gbt.fit(trainFraud)
gbt_preds=gbt_model.transform(testFraud)
auc3=fraud_eval.evaluate(gbt_preds)
auc3

0.7618026881753232