### Importing Basic Spark Libraries and Creating the SparkContext

In [2]:
from pyspark import SparkContext, SparkConf

# Spark configuration for this notebook.
# "yarn-client" is a deprecated master URL (rejected by newer Spark releases);
# the supported spelling is master "yarn" plus an explicit client deploy mode.
conf = (
    SparkConf()
    .setAppName("CTR_prediction_logistic")
    .setMaster("yarn")
    .set("spark.submit.deployMode", "client")
)

# getOrCreate() reuses an already-running context, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sc

# EDA

In [3]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used for all DataFrame work below
# (getOrCreate returns an existing session if one is already active).
spark = (
    SparkSession.builder
    .appName('Logistic_CTR')
    .getOrCreate()
)

In [4]:
# Display the session object to confirm it was created.
spark

### Load Data

In [5]:
# Read the CSV from S3: first row is the header, column types are inferred.
df = spark.read.csv(
    's3a://sparkmldatasets/creditcard_fraud.csv',
    header=True,
    inferSchema=True
)

In [6]:
# Display the DataFrame repr (column names with their inferred types).
df

DataFrame[Time: int, V1: double, V2: double, V3: double, V4: double, V5: double, V6: double, V7: double, V8: double, V9: double, V10: double, V11: double, V12: double, V13: double, V14: double, V15: double, V16: double, V17: double, V18: double, V19: double, V20: double, V21: double, V22: double, V23: double, V24: double, V25: double, V26: double, V27: double, V28: double, Amount: double, Class: int]

In [7]:
# Preview the first 10 rows of the raw data as a text table.
df.show(n=10)

+----+------------------+-------------------+------------------+------------------+-------------------+-------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|                V3|                V4|                 V5|                 V6|                  V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|                V16|                V17|                V18|                V19|                V20|                 V21|          

In [8]:
# Print the inferred schema as a tree (column name, type, nullability).
df.printSchema()

root
 |-- Time: integer (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (null

In [9]:
# List the column names; these feed the VectorAssembler inputs further down.
df.columns

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class']

In [10]:
# Fetch the first 5 rows to the driver as a list of Row objects.
df.head(n=5)

[Row(Time=0, V1=-1.3598071336738, V2=-0.0727811733098497, V3=2.53634673796914, V4=1.37815522427443, V5=-0.338320769942518, V6=0.462387777762292, V7=0.239598554061257, V8=0.0986979012610507, V9=0.363786969611213, V10=0.0907941719789316, V11=-0.551599533260813, V12=-0.617800855762348, V13=-0.991389847235408, V14=-0.311169353699879, V15=1.46817697209427, V16=-0.470400525259478, V17=0.207971241929242, V18=0.0257905801985591, V19=0.403992960255733, V20=0.251412098239705, V21=-0.018306777944153, V22=0.277837575558899, V23=-0.110473910188767, V24=0.0669280749146731, V25=0.128539358273528, V26=-0.189114843888824, V27=0.133558376740387, V28=-0.0210530534538215, Amount=149.62, Class=0),
 Row(Time=0, V1=1.19185711131486, V2=0.26615071205963, V3=0.16648011335321, V4=0.448154078460911, V5=0.0600176492822243, V6=-0.0823608088155687, V7=-0.0788029833323113, V8=0.0851016549148104, V9=-0.255425128109186, V10=-0.166974414004614, V11=1.61272666105479, V12=1.06523531137287, V13=0.48909501589608, V14=-0.14

# Data Preparation

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, OneHotEncoderEstimator

### Vector Assembler

In [12]:
# Feature columns, in order: Time, V1 ... V28, Amount (everything except Class).
feature_cols = ['Time'] + ['V%d' % idx for idx in range(1, 29)] + ['Amount']

# Combine the individual numeric columns into a single "features" vector column.
vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

### Pipeline

In [13]:
# Build the pipeline; its only stage is the feature vector assembler.
pipeline_stages = [vec_assembler]
final_pipe = Pipeline(stages=pipeline_stages)

In [14]:
# Fit the pipeline on the raw frame, then apply it to the same frame to
# append the assembled "features" column.
fitted_pipe = final_pipe.fit(df)
piped_data = fitted_pipe.transform(df)

In [15]:
# Preview the first 10 rows, now including the "features" column.
piped_data.show(n=10)

+----+------------------+-------------------+------------------+------------------+-------------------+-------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|                V3|                V4|                 V5|                 V6|                  V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|                V16|                V17|                V18|                V19|                V20|          

In [16]:
# Confirm that "features" was appended after the original columns.
piped_data.columns

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'Class',
 'features']

In [17]:
# Rename the target column to "label", the name the classifier is configured
# to read further down.
piped_dataset = piped_data.withColumnRenamed(existing="Class", new="label")

In [18]:
# Preview the first 10 rows after the Class -> label rename.
piped_dataset.show(n=10)

+----+------------------+-------------------+------------------+------------------+-------------------+-------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|                V3|                V4|                 V5|                 V6|                  V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|                V16|                V17|                V18|                V19|                V20|          

In [19]:
# Keep only the two columns the model needs: the target and the feature vector.
model_data = piped_dataset.select('label', 'features')

In [20]:
# Preview the first 10 (label, features) rows.
model_data.show(n=10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.0,-1.359807133...|
|    0|[0.0,1.1918571113...|
|    0|[1.0,-1.358354061...|
|    0|[1.0,-0.966271711...|
|    0|[2.0,-1.158233093...|
|    0|[2.0,-0.425965884...|
|    0|[4.0,1.2296576345...|
|    0|[7.0,-0.644269442...|
|    0|[7.0,-0.894286082...|
|    0|[9.0,-0.338261752...|
+-----+--------------------+
only showing top 10 rows



In [21]:
# Fetch the first 10 rows to the driver to inspect the assembled vectors.
model_data.head(n=10)

[Row(label=0, features=DenseVector([0.0, -1.3598, -0.0728, 2.5363, 1.3782, -0.3383, 0.4624, 0.2396, 0.0987, 0.3638, 0.0908, -0.5516, -0.6178, -0.9914, -0.3112, 1.4682, -0.4704, 0.208, 0.0258, 0.404, 0.2514, -0.0183, 0.2778, -0.1105, 0.0669, 0.1285, -0.1891, 0.1336, -0.0211, 149.62])),
 Row(label=0, features=DenseVector([0.0, 1.1919, 0.2662, 0.1665, 0.4482, 0.06, -0.0824, -0.0788, 0.0851, -0.2554, -0.167, 1.6127, 1.0652, 0.4891, -0.1438, 0.6356, 0.4639, -0.1148, -0.1834, -0.1458, -0.0691, -0.2258, -0.6387, 0.1013, -0.3398, 0.1672, 0.1259, -0.009, 0.0147, 2.69])),
 Row(label=0, features=DenseVector([1.0, -1.3584, -1.3402, 1.7732, 0.3798, -0.5032, 1.8005, 0.7915, 0.2477, -1.5147, 0.2076, 0.6245, 0.0661, 0.7173, -0.1659, 2.3459, -2.8901, 1.11, -0.1214, -2.2619, 0.525, 0.248, 0.7717, 0.9094, -0.6893, -0.3276, -0.1391, -0.0554, -0.0598, 378.66])),
 Row(label=0, features=DenseVector([1.0, -0.9663, -0.1852, 1.793, -0.8633, -0.0103, 1.2472, 0.2376, 0.3774, -1.387, -0.055, -0.2265, 0.1782, 0.507

# Model Building

### Create Train Test Data

In [22]:
# 70/30 train/test split. A fixed seed makes the partition (and therefore
# every metric computed below) repeatable across Restart & Run All;
# without it each run trains and evaluates on different rows.
SPLIT_SEED = 42
training, test = model_data.randomSplit([.7, .3], seed=SPLIT_SEED)

In [23]:
# Preview the first 10 rows of the training split.
training.show(n=10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[0.0,-1.359807133...|
|    0|[0.0,1.1918571113...|
|    0|[1.0,-1.358354061...|
|    0|[4.0,1.2296576345...|
|    0|[7.0,-0.894286082...|
|    0|[9.0,-0.338261752...|
|    0|[10.0,1.249998742...|
|    0|[10.0,1.449043781...|
|    0|[11.0,1.069373587...|
|    0|[12.0,-2.79185476...|
+-----+--------------------+
only showing top 10 rows



In [24]:
# Number of rows that landed in the training split.
training.count()

69902

### Logistic Regression Model

In [25]:
from pyspark.ml.classification import LogisticRegression

In [26]:
# Logistic regression estimator reading the target from "label" and the
# predictors from the assembled "features" vector.
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features'
)

In [27]:
# Display the (unfitted) estimator; the output is its uid.
lr

LogisticRegression_45a7a143f103d9566c1b

In [28]:
# Train the logistic regression on the training split.
model = lr.fit(dataset=training)

In [29]:
# Display the fitted model; the output is its uid.
model

LogisticRegression_45a7a143f103d9566c1b

# Model Evaluation

In [30]:
# Report the fitted parameters: one coefficient per assembled feature,
# followed by the intercept term.
print("Coefficients: %s" % (model.coefficients,))
print("Intercept: %s" % (model.intercept,))

Coefficients: [4.400690760895576e-06,-0.028460817435821524,0.2773373825335272,0.3456364682466031,1.32074645867686,-0.263719670459238,-0.6977088799121022,0.31687211375230817,-0.09115687560469564,-0.07773800125690833,-0.8739401614225907,0.00023537012955616367,0.36960886086746614,-0.6228831549218068,-0.7370549898382779,0.30798552131468737,0.7998782958403539,-0.037626099318844565,-0.6031209843059473,0.14880545981338628,-0.1777106385198148,0.6872175406466675,0.6021648458469963,-0.3123269323707678,-0.9225198967195894,0.19251397466988285,1.3980407103371713,-1.258047222061941,-0.7529451161527994,4.139918407466929e-05]
Intercept: -10.257498924


### Plot ROC

In [31]:
import matplotlib.pyplot as plt
# Clear the current figure so the ROC plot below starts from an empty canvas.
plt.clf()


In [None]:

# Training summary exposes the ROC points as a DataFrame with FPR/TPR columns;
# pull them to the driver as pandas for plotting.
trainingSummary = model.summary
roc = trainingSummary.roc.toPandas()

# ROC convention: false positive rate on x, true positive rate on y.
# The axis labels must match the plotted series (they were previously swapped).
plt.plot(roc['FPR'],roc['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()


In [None]:
%matplot plt

In [33]:
# Area under the ROC curve measured on the training data.
print('Training set areaUnderROC: %s' % (trainingSummary.areaUnderROC,))

Training set areaUnderROC: 0.976386897422


### Precision Recall Curve

In [None]:
# Clear the previous (ROC) figure before drawing the precision-recall curve.
plt.clf()

In [None]:
# Precision-recall points from the training summary, pulled to pandas.
pr = trainingSummary.pr.toPandas()

# Recall on x, precision on y; titled so the figure stands alone,
# consistent with the ROC plot.
plt.plot(pr['recall'],pr['precision'])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()

In [None]:
%matplot plt

### Accuracy 

In [35]:
# Evaluate the fitted model on the held-out test split.
summary = model.evaluate(dataset=test)

In [36]:
# Accuracy reported by the test-set evaluation summary.
# NOTE(review): every label shown in the previews above is 0; if the classes
# are heavily imbalanced, accuracy alone overstates model quality — confirm
# the class balance and rely on the ROC/PR metrics as well.
summary.accuracy

0.9986045120776157

In [37]:
# Score the test split; adds rawPrediction, probability and prediction columns.
output = model.transform(dataset=test)

In [38]:
# Display the scored DataFrame's schema summary (columns and types).
output

DataFrame[label: int, features: vector, rawPrediction: vector, probability: vector, prediction: double]

### Probability Output

In [39]:
# Preview the first 10 scored rows, including raw scores and probabilities.
output.show(n=10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|    0|[1.0,-0.966271711...|[13.0921861453254...|[0.99999793872988...|       0.0|
|    0|[2.0,-1.158233093...|[8.81053266991848...|[0.99985086845849...|       0.0|
|    0|[2.0,-0.425965884...|[8.92847349698417...|[0.99986745734614...|       0.0|
|    0|[7.0,-0.644269442...|[7.14347617020689...|[0.99921062246973...|       0.0|
|    0|[10.0,0.384978215...|[15.5823122089710...|[0.99999982912138...|       0.0|
|    0|[12.0,-0.75241704...|[8.72262462647686...|[0.99983716733383...|       0.0|
|    0|[18.0,1.166616382...|[6.71630556086741...|[0.99879046081666...|       0.0|
|    0|[22.0,-2.07429467...|[9.19100249447501...|[0.99989805778673...|       0.0|
|    0|[23.0,1.059387115...|[8.64344700608271...|[0.99982375284685...|       0.0|
|    0|[24.0,1.2

In [40]:
# Compare the true label with the predicted class for the first 10 test rows.
label_vs_prediction = output.select(['label', 'prediction'])
label_vs_prediction.show(n=10)

+-----+----------+
|label|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
+-----+----------+
only showing top 10 rows



### Test-Set Area Under ROC

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [42]:
# Binary evaluator with its default settings written out explicitly:
# scores come from "rawPrediction", truth from "label", metric is ROC AUC.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC'
)

In [43]:
# Area under the ROC curve on the scored test split.
# Build a single string (as the training-AUC print does) so the line renders
# as text rather than a tuple repr when the kernel runs Python 2.
print('Test Area Under ROC: ' + str(evaluator.evaluate(output)))

('Test Area Under ROC', 0.9644677830734226)
