__To predict whether or not a particular internet user will click on an ad based off the features of that user__

In [87]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [88]:
# Initialize and create ba spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Advertisement').getOrCreate()

In [89]:
# Import statements to setup ML
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [90]:
# Using Spark to read in the advertising data csv file
data = spark.read.csv('advertising.csv', header=True, inferSchema=True)

In [91]:
# Printing the first row of the dataframe
data.head()

Row(Daily Time Spent on Site=68.95, Age=35, Area Income=61833.9, Daily Internet Usage=256.09, Ad Topic Line='Cloned 5thgeneration orchestration', City='Wrightburgh', Male=0, Country='Tunisia', Timestamp=datetime.datetime(2016, 3, 27, 0, 53, 11), Clicked on Ad=0)

In [92]:
# Printing the schema of the dataframe
data.printSchema()

root
 |-- Daily Time Spent on Site: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Area Income: double (nullable = true)
 |-- Daily Internet Usage: double (nullable = true)
 |-- Ad Topic Line: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Clicked on Ad: integer (nullable = true)



In [93]:
#Converting the "Timestamp" column to numerical type by getting hour from the timestamp

from pyspark.sql.functions import hour

filtered_data = data.withColumn("Hour", hour(data['Timestamp']))

In [94]:
filtered_data.count()

1000

Checking out whether the string columns "City", "Country" and "Ad Topic Line" are useful or not

In [95]:
filtered_data.groupBy('City').count().count()

969

In [96]:
filtered_data.groupBy('Country').count().count()

237

In [97]:
filtered_data.groupBy('Country').count().show()

+--------------------+-----+
|             Country|count|
+--------------------+-----+
|                Chad|    4|
|            Paraguay|    3|
|            Anguilla|    6|
|               Macao|    3|
|Heard Island and ...|    3|
|               Yemen|    3|
|             Senegal|    8|
|             Tokelau|    4|
|              Sweden|    4|
|French Southern T...|    5|
|            Kiribati|    1|
|              Guyana|    5|
|              Jersey|    6|
|             Eritrea|    7|
|         Philippines|    6|
|      Norfolk Island|    5|
|               Tonga|    5|
|            Djibouti|    2|
|           Singapore|    6|
|            Malaysia|    3|
+--------------------+-----+
only showing top 20 rows



In [98]:
filtered_data.groupBy('Ad Topic Line').count().count()

1000

Since the "City", "Country" and "Ad Topic Line" columns contains large number of categories, these are not much useful features

In [99]:
filtered_data.columns

['Daily Time Spent on Site',
 'Age',
 'Area Income',
 'Daily Internet Usage',
 'Ad Topic Line',
 'City',
 'Male',
 'Country',
 'Timestamp',
 'Clicked on Ad',
 'Hour']

In [100]:
assembler = VectorAssembler(inputCols=['Daily Time Spent on Site','Age','Area Income','Male',
                            'Daily Internet Usage','Hour'], outputCol='features')

#Splitting the resultane data into training data and testing data
#Training data is to train the model
#Testing data is to test the builted model

In [101]:
train_data,test_data = filtered_data.randomSplit([0.7,0.3])

In [102]:
train_data.count()

707

In [103]:
test_data.count()

293

In [104]:
#Creating a logistic regression model object
lr = LogisticRegression(labelCol='Clicked on Ad', featuresCol='features')

In [105]:
#Setting Up the Pipeline
from pyspark.ml import Pipeline

In [106]:
pipeline = Pipeline(stages=[assembler,lr])

In [107]:
#Fitting the pipeline to training set.
model = pipeline.fit(train_data)

In [108]:
#Getting Results on Test Set
results = model.transform(test_data)

In [109]:
results.printSchema()

root
 |-- Daily Time Spent on Site: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Area Income: double (nullable = true)
 |-- Daily Internet Usage: double (nullable = true)
 |-- Ad Topic Line: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Male: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Clicked on Ad: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [110]:
output = results.select('Clicked on Ad','rawPrediction','prediction','probability','features')

In [111]:
output.show(3)

+-------------+--------------------+----------+--------------------+--------------------+
|Clicked on Ad|       rawPrediction|prediction|         probability|            features|
+-------------+--------------------+----------+--------------------+--------------------+
|            1|[-15.670988027771...|       1.0|[1.56378234495458...|[32.84,40.0,41232...|
|            1|[-15.808367525958...|       1.0|[1.36305429752195...|[33.52,43.0,42191...|
|            1|[-13.306885409085...|       1.0|[1.66300047506299...|[34.3,41.0,53167....|
+-------------+--------------------+----------+--------------------+--------------------+
only showing top 3 rows



### MODEL EVALUATION

__1) Converting the data to rdd and evaluating using MulticlassMetrics to print the confusion matrix__

In [112]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [113]:
clean_result = output.withColumn('Clicked on Ad',output['Clicked on Ad'].cast('double'))

In [114]:
clean_result.select('prediction','Clicked on Ad').show(3)

+----------+-------------+
|prediction|Clicked on Ad|
+----------+-------------+
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
+----------+-------------+
only showing top 3 rows



In [115]:
predictionAndLabel = clean_result.select('prediction','Clicked on Ad').rdd

In [116]:
metrics = MulticlassMetrics(predictionAndLabel)

In [117]:
#Printing the confusion matrix
print(metrics.confusionMatrix())

DenseMatrix([[145.,  10.],
             [  3., 135.]])


In [118]:
#Printing the Accuracy
print(metrics.accuracy)

0.9556313993174061


In [119]:
metrics.recall()

0.9556313993174061

In [120]:
metrics.precision()

0.9556313993174061

__2) Evaluating using BinaryClassificationEvaluator__

In [121]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [122]:
bin_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Clicked on Ad')

In [123]:
#Calculating Area Under ROC
AOC = bin_eval.evaluate(output)

In [124]:
#Printing Area Under ROC
print(AOC)

0.9866760168302944


__3) Evaluating using MulticlassClassificationEvaluator__

In [125]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [126]:
multi_eval = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Clicked on Ad')

In [127]:
#Calculating Area Under ROC
AOC_2 = multi_eval.evaluate(output)

In [128]:
#Printing Area Under ROC
print(AOC_2)

0.9556676190056566


In [None]:
spark.stop()