### Problem Statement: We are provided with an online e-commerce website dataset with various input features, such as page durations, region, month, bounce rates, etc.
* Target : Revenue (Yes/No)
* Goal : To create a well-trained supervised model for predicting Revenue (Yes/No)

In [2]:
# Display the SparkContext handle. `sc` is not created anywhere in this notebook,
# so it is presumably injected by the PySpark kernel — TODO confirm.
sc

In [3]:
# Display the SparkSession handle used for all reads below. Like `sc`, it is not
# created in this notebook and is presumably provided by the kernel — TODO confirm.
spark

### 1.Read Dataset

In [4]:
# Load the raw CSV into a Spark DataFrame. The first line of the file supplies the
# column names and the column types are inferred from the data.
csv_path = "file:///home/hadoop/Downloads/Online Shoppers Intention.csv"
online_data = spark.read.csv(csv_path, header=True, inferSchema=True)

# Peek at the first record to confirm the file parsed as expected.
online_data.head()

Row(Administrative=0, Administrative_Duration=0.0, Informational=0, Informational_Duration=0.0, ProductRelated=1, ProductRelated_Duration=0.0, BounceRates=0.2, ExitRates=0.2, PageValues=0.0, SpecialDay=0.0, Month='Feb', OperatingSystems=1, Browser=1, Region=1, TrafficType=1, VisitorType='Returning_Visitor', Weekend=False, Revenue=False)

In [5]:
# Print the leading rows of the raw data as a text table (show() defaults to 20 rows).
online_data.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  Feb|               1|      1|     1|          1|Returning_Visitor|  false|  false|
|           

### 2. Show Schema Of DataFrame

In [6]:
# Mark the raw DataFrame for caching so that the repeated actions in later cells
# can reuse it rather than re-reading the CSV each time.
online_data.cache()

DataFrame[Administrative: int, Administrative_Duration: double, Informational: int, Informational_Duration: double, ProductRelated: int, ProductRelated_Duration: double, BounceRates: double, ExitRates: double, PageValues: double, SpecialDay: double, Month: string, OperatingSystems: int, Browser: int, Region: int, TrafficType: int, VisitorType: string, Weekend: boolean, Revenue: boolean]

In [7]:
# Print each column's name, inferred type and nullability.
online_data.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)



### 3.Data Wrangling
        * Is there any missing value?

In [8]:
from pyspark.sql.functions import *

In [9]:
# Count the null entries in every column.
# `when(isnull(name), 1)` yields 1 for a null cell and null otherwise, and `count`
# only counts non-null values, so each aggregate is the number of nulls in that column.
# The loop variable is deliberately not called `col`, so it does not shadow the
# `col` function brought in by the star import.
null_counts = [
    count(when(isnull(column_name), 1)).alias(column_name)
    for column_name in online_data.columns
]
online_data.select(null_counts).show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|            14|                     14|           14|                    14|            14|                     14|         14|       14|         0|         0|    0|               0|      0|     0|          0|          0|      0|      0|
+--------------+-----------------------+----

In [10]:
# Keep only the rows that have no null value in any column.

shopper_df = online_data.dropna(how="any")

In [11]:
# Number of rows remaining after the null-containing rows were dropped.
shopper_df.count()

12316

In [19]:
!pip install numpy pandas matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable


In [12]:
# Preview the first five cleaned rows as a pandas DataFrame.
# The limit is applied in Spark *before* converting, so only those five rows are
# collected to the driver instead of materialising the whole dataset in pandas.
shopper_df.limit(5).toPandas()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,-1.0,0,-1.0,1,-1.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


#### 4.Data Preprocessing
     * Transformation of categorical values into numerical values in the DataFrame

In [13]:
# List the distinct categories present in the VisitorType column.
shopper_df.select('VisitorType').distinct().show()

+-----------------+
|      VisitorType|
+-----------------+
|      New_Visitor|
|            Other|
|Returning_Visitor|
+-----------------+



In [14]:
# Replace each VisitorType category label with a digit string, one label at a time.
# The column is still string-typed after this step; it is cast to an integer later.
visitor_type_codes = [
    ('New_Visitor', "0"),
    ('Other', "1"),
    ('Returning_Visitor', "2"),
]
for visitor_label, visitor_code in visitor_type_codes:
    shopper_df = shopper_df.withColumn(
        'VisitorType',
        regexp_replace('VisitorType', visitor_label, visitor_code),
    )

     * StringIndexer() - encodes categorical values into numerical labels such as 0, 1, 2, ...

In [15]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [16]:
# Encode the string Month column as a numeric Month_index column.
indexer = StringIndexer(outputCol='Month_index', inputCol='Month')

# Fitting learns the label-to-index mapping from the data; transforming appends
# Month_index to a new DataFrame and leaves the original Month column in place.
month_index_model = indexer.fit(shopper_df)
shopper_df1 = month_index_model.transform(shopper_df)
shopper_df1.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  Feb|               1|      1|     1|          1|          2|  false|  false| 

In [17]:
# Print the schema again to check the newly added Month_index column and its type.
shopper_df1.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)
 |-- Month_index: double (nullable = false)



In [18]:
# Drop the original string Month column; its numeric encoding (Month_index) is kept.
shopper_df2 = shopper_df1.drop('Month')

In [19]:
from pyspark.sql.types import IntegerType

# VisitorType holds digit strings at this point; convert the column to an integer
# type so that it can be fed into the feature vector.
visitor_type_as_int = col('VisitorType').cast("integer")
shopper_df2 = shopper_df2.withColumn('VisitorType', visitor_type_as_int)

### 5. Features Vector

In [20]:
from pyspark.ml.feature import VectorAssembler

In [21]:
# List the remaining column names; this is the pool the feature columns are picked from.
print(shopper_df2.columns)

['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue', 'Month_index']


In [22]:
# Every column except the Revenue target goes into the feature vector.
feature_columns = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
    'OperatingSystems', 'Browser', 'Region', 'TrafficType',
    'VisitorType', 'Weekend', 'Month_index',
]

# Assembler that packs the columns above, in this order, into one vector column named 'feature'.
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='feature')

In [24]:
# Append the assembled 'feature' vector column to the data.
shopper_df3 = vector_assembler.transform(shopper_df2)

# Print the vectors without truncation so the full contents are visible.
shopper_df3.select('feature').show(truncate=False)

+---------------------------------------------------------------------------------------------------+
|feature                                                                                            |
+---------------------------------------------------------------------------------------------------+
|(17,[4,6,7,10,11,12,13,14,16],[1.0,0.2,0.2,1.0,1.0,1.0,1.0,2.0,9.0])                               |
|(17,[4,5,7,10,11,12,13,14,16],[2.0,64.0,0.1,2.0,2.0,1.0,2.0,2.0,9.0])                              |
|[0.0,-1.0,0.0,-1.0,1.0,-1.0,0.2,0.2,0.0,0.0,4.0,1.0,9.0,3.0,2.0,0.0,9.0]                           |
|(17,[4,5,6,7,10,11,12,13,14,16],[2.0,2.666666667,0.05,0.14,3.0,2.0,2.0,4.0,2.0,9.0])               |
|[0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,3.0,3.0,1.0,4.0,2.0,1.0,9.0]                         |
|(17,[4,5,6,7,10,11,12,13,14,16],[19.0,154.2166667,0.015789474,0.024561404,2.0,2.0,1.0,3.0,2.0,9.0])|
|[0.0,-1.0,0.0,-1.0,1.0,-1.0,0.2,0.2,0.0,0.4,2.0,4.0,3.0,3.0,2.0,0.0,9.0]         

### 6.Split Dataset into Train & Test
         * randomSplit()

In [29]:
# Revenue is a boolean column; convert it to an integer so it can be used as the
# label column by the classifier.
revenue_as_int = col('Revenue').cast("integer")
shopper_df3 = shopper_df3.withColumn('Revenue', revenue_as_int)

In [30]:
# Randomly split the rows roughly 80% / 20% into train and test sets; the fixed
# seed keeps the split repeatable across runs.
train, test = shopper_df3.randomSplit(weights=[0.8, 0.2], seed=123)

In [31]:
# Print the leading rows of the training split (all columns, including 'feature').
train.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|             feature|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+----------------+-------+------+-----------+-----------+-------+-------+-----------+--------------------+
|             0|                   -1.0|            0|                  -1.0|             1|                   -1.0|        0.0|0.066666667|       0.0|       0.0|               2|      2|     7|  

In [32]:
# Project the training data down to the feature vector and the label. The result is
# not assigned, so this cell only displays the projected DataFrame's column types.
train.select(['feature','Revenue'])

DataFrame[feature: vector, Revenue: int]

### 7.Decision Tree Classifier

In [34]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree that reads its inputs from the assembled 'feature' vector and
# learns to predict the integer 'Revenue' label.
tree = DecisionTreeClassifier(
    labelCol='Revenue',
    featuresCol='feature',
)

# Fit the tree on the training split only.
decision_model = tree.fit(train)

In [35]:
# Prediction on the test dataset: apply the fitted tree to the held-out split.
# The resulting DataFrame carries a `prediction` column alongside the test columns.

predictions = decision_model.transform(test)

In [37]:
# Show the features, the true label and the predicted label side by side.
predictions.select('feature', 'Revenue', 'prediction').show()

+--------------------+-------+----------+
|             feature|Revenue|prediction|
+--------------------+-------+----------+
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|(17,[6,7,10,11,12...|      0|       0.0|
|(17,[4,7,10,11,12...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
+--------------------+-------+----

### 8. Classification Metrics

In [40]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator that compares the 'prediction' column against the 'Revenue' label
# and reports the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='Revenue',
    predictionCol='prediction',
)

# Score the predictions made on the test split.
accuracy = evaluator.evaluate(predictions)

In [41]:
# Display the accuracy computed on the test-set predictions.
accuracy

0.8990825688073395