# Building a Reusable Spark Library for Feature Encoding

In [18]:
import sys, os
from pathlib import Path
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession, functions as F
from pathlib import Path
import pandas as pd


# Setting up the Path Variable and Autoreloader

In [19]:
path = "D:\\SANJANA\\HSBC Internship\\spark-packages\\src\\main\\FeatureEncoder"
os.chdir(path)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setting up the Spark Session

In [20]:
# Build (or reuse) the notebook's SparkSession; the builder chain is wrapped in
# parentheses instead of using backslash line continuations.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# Handles derived from the session: the SparkContext and an SQLContext on top of it.
sc = spark.sparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Importing the Library and Reading the Data

In [21]:
from FeatureEncoder import FeatureEncoder

# Load the sample CSV with pandas, then hand it to Spark.
# A raw string keeps the Windows backslashes literal instead of relying on
# Python leaving unknown escape sequences (\S, \H, \s, \d) untouched.
dat = pd.read_csv(r"D:\SANJANA\HSBC Internship\spark-packages\datasets\dummy data1.csv")
spark_dat = spark.createDataFrame(dat)

# Parse the Date column into a Spark date.
# In Spark datetime patterns "MM" is month-of-year while "mm" is minute-of-hour;
# the month must be matched with "MM" or non-January dates are parsed incorrectly.
spark_dat = spark_dat.withColumn("Date", F.to_date(F.col("Date"), "yyyy-MM-dd"))


# Help functionality

In [22]:
# Print the FeatureEncoder class documentation (constructor signature and
# parameter descriptions) so the available options are visible in the notebook.
help(FeatureEncoder)

Help on class FeatureEncoder in module FeatureEncoder:

class FeatureEncoder(builtins.object)
 |  FeatureEncoder(strategy: str = 'SR', cat_col: str = None, label_col: str = None, positive_class: str = '1', negative_class: str = '0')
 |  
 |  A general class for quick usage of Feature Encoding techniques
 |  General structure of code inspired by:
 |      https://github.com/apache/spark/blob/master/python/pyspark/ml/feature.py.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, strategy: str = 'SR', cat_col: str = None, label_col: str = None, positive_class: str = '1', negative_class: str = '0')
 |      :param strategy         :  The selected strategy. Currently supported strategies are Supervised Ration Encoding (SR) 
 |                                 and Weight of Evidence (WOE).
 |      :param cat_col          :  The String that denotes the categorical column name.
 |      :param label_col        :  The String that denotes the label/target column name.
 |      :param positive_cla

# Feature Encoding using Supervised Ratio

In [23]:
# Supervised Ratio ("SR") encoding of AccountType against the string-valued
# EliteMember label, with the positive/negative classes given explicitly.
encoder1 = FeatureEncoder(
    strategy="SR",
    cat_col="AccountType",
    label_col="EliteMember",
    positive_class="Yes",
    negative_class="No",
)
df1 = encoder1.encode(spark_dat)
df1.show()

Positive Class =  Yes <class 'str'>
Negative Class =  No <class 'str'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-03|     NaN|     

In [24]:
# Supervised Ratio ("SR") encoding against the Label column, relying on the
# constructor's default positive/negative classes ('1' and '0').
encoder2 = FeatureEncoder(
    strategy="SR",
    cat_col="AccountType",
    label_col="Label",
)
df1 = encoder2.encode(spark_dat)
df1.show()

Positive Class =  1 <class 'str'>
Negative Class =  0 <class 'str'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-03|     NaN|        

In [25]:
# Supervised Ratio ("SR") encoding against the Label column, this time passing
# the positive/negative classes as integers rather than strings.
encoder3 = FeatureEncoder(
    strategy="SR",
    cat_col="AccountType",
    label_col="Label",
    positive_class=1,
    negative_class=0,
)
df1 = encoder3.encode(spark_dat)
df1.show()

Positive Class =  1 <class 'int'>
Negative Class =  0 <class 'int'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|                0.5|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|                0.5|
|    Savings|    625741|     CA|2019-01-03|     NaN|        

# Feature Encoding using Weight of Evidence

In [26]:
# Weight of Evidence ("WOE") encoding of AccountType against the string-valued
# EliteMember label, with the positive/negative classes given explicitly.
encoder4 = FeatureEncoder(
    strategy="WOE",
    cat_col="AccountType",
    label_col="EliteMember",
    positive_class="Yes",
    negative_class="No",
)
df2 = encoder4.encode(spark_dat)
df2.show()

Positive Class =  Yes <class 'str'>
Negative Class =  No <class 'str'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-03|     NaN|     

In [30]:
# Weight of Evidence ("WOE") encoding against the Label column, relying on the
# constructor's default positive/negative classes ('1' and '0').
encoder5 = FeatureEncoder(
    strategy="WOE",
    cat_col="AccountType",
    label_col="Label",
)
df2 = encoder5.encode(spark_dat)
df2.show()

Positive Class =  1 <class 'str'>
Negative Class =  0 <class 'str'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-03|     NaN|        

In [31]:
# Weight of Evidence ("WOE") encoding against the Label column, this time
# passing the positive/negative classes as integers rather than strings.
encoder6 = FeatureEncoder(
    strategy="WOE",
    cat_col="AccountType",
    label_col="Label",
    positive_class=1,
    negative_class=0,
)
df2 = encoder6.encode(spark_dat)
df2.show()

Positive Class =  1 <class 'int'>
Negative Class =  0 <class 'int'>
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|AccountType|CustomerId|Country|      Date|Balances|MaxAccountBalance|EliteMember|Label|AccountType_encoded|
+-----------+----------+-------+----------+--------+-----------------+-----------+-----+-------------------+
|    Savings|     92929|     CA|2019-01-01|221680.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-01|560797.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-02|695791.0|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-02|740800.0|          2731430|         No|    0|-0.3010299956639812|
|    Savings|     92929|     CA|2019-01-03|     NaN|          6554191|         No|    0|-0.3010299956639812|
|    Savings|    625741|     CA|2019-01-03|     NaN|        