In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('online_shoppers_intention.csv')

In [3]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

# Data Pre-Processing and Modelling

Data pre-processing is the process of performing operations on the raw data to make it compatible for machine learning algorithms.

It shall include the following steps:

- **Data Cleaning**: Since our data does not have any missing values, the process of data cleaning will primarily include removing outliers so that they do not mess with our algorithm.
- **Categorical Feature Encoding** and **Continuous Feature Scaling**: For our project, we will do this using a data pipeline to make the process more streamlined and organized.
- **Model Selection**: We will use the cleaned and tranformed data to evaluate a few machine learning models and select the best one
- **Hyper-parameter Tuning**: Select the best set of hyper-parameters for the chosen model
- **Final Evaluation**: Evaluate the final model fitted with the train data on the test data

## Data Cleaning:

Most columns in our data have quite a few extreme outliers (please check the Exploratory Data Analysis file). For the purpose of this project, we will term an extreme outlier as a value that has a z-score greater than 3.

We may lose quite some data if we delete every observation containing an outlier value. To avoid the loss, we will only delete those observations which contain more outliers than a specified threshold value.

Here's the **step-by-step** process:

- Assign a z-score to every value in every numberic(int or float) column
- Calculate the number of features having z-score>3 for every row
- Delete the rows where number or outliers is greater than the specified threshold value

<span style="color:red">The following function calculates the number of outliers for every observation and drops the observations with number of outliers greater than the threshold. The problem I'm encountering is that the process takes a long time as we repeat the steps for every continuous column. Please suggest a faster alternative.</span>.

In [5]:
##The zscore function will calculate the z-score for every value in a particular column
## z-score is basically the distance between the value and the mean expressed in terms of standard-deviation
## value-mean = z-score*standard-deviation
def zscore(x,col):
    mean, std = np.mean(df[col]), np.std(df[col])
    return (x-mean)/std


##The cleaned_data function takes a dataframe as input and returns the same after removing observations with too many outliers
##We will consider a value having |z-score|>3 as an outlier
##The function calculates the number of outliers present in each observations
##It deletes the observations having more outliers than the specified threshold value

def cleaned_data(data,threshold=5):
    
    ##creating an array of all numeric features
    ##numerics=['int64','float64']
    ##num_cols=data.select_dtypes(include=numerics).columns
    num_cols=['Administrative','Administrative_Duration','Informational','Informational_Duration','ProductRelated',
              'ProductRelated_Duration','BounceRates','ExitRates','PageValues','SpecialDay']
    
    ##z_cols keeps track of the new columns created in the function
    z_cols=[]
    for col in num_cols:
        data[f'{col}_z']=(abs(data[col].apply(lambda x:zscore(x,col)))>3)
        z_cols.append(f'{col}_z')
        
    ##df['Total_Outliers'] contains the total number of outliers for every observation
    data['Total_Outliers']=np.sum(data[z_cols],axis=1)
    z_cols.append('Total_Outliers')
    
    ##delete observations with number of outliers greater than threshold
    data.drop(data[data['Total_Outliers']>threshold].index,axis=0,inplace=True)
    
    ##delete all columns created in this function
    data.drop(z_cols,axis=1,inplace=True)
    data.reset_index(inplace=True)
    data.drop('index',axis=1,inplace=True)
    
    return data

In [6]:
df=cleaned_data(df,threshold=3)

In [7]:
##Checking the number of rows in the cleaned data
df.shape[0]

12290

In [8]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [9]:
##Splitting the data into training and testing halves
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=101)

for train_idx, test_idx in split.split(df,df['Revenue']):
    train=df.iloc[train_idx]
    test=df.iloc[test_idx]

In [10]:
train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
127,0,0.0,0,0.0,10,225.466667,0.0,0.05,0.0,0.4,Feb,1,1,8,3,Returning_Visitor,False,False
10303,0,0.0,0,0.0,4,0.0,0.2,0.2,0.0,0.0,Nov,3,2,1,3,Returning_Visitor,False,False
5474,3,33.2,0,0.0,10,278.433333,0.007143,0.02619,32.700345,0.0,Jul,2,2,6,1,Returning_Visitor,False,True
11525,5,262.121212,1,0.0,117,7468.860173,0.0129,0.020948,3.014026,0.0,Nov,3,2,1,2,Returning_Visitor,False,False
1639,0,0.0,0,0.0,14,183.666667,0.0,0.004762,0.0,0.0,Mar,3,2,3,3,Returning_Visitor,False,False


In [11]:
test.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
3182,0,0.0,0,0.0,0,0.0,0.2,0.2,0.0,0.0,May,2,2,8,3,Returning_Visitor,False,False
6327,3,84.5,0,0.0,346,12541.77628,0.001709,0.006473,5.177107,0.0,Aug,2,2,3,1,Returning_Visitor,True,True
2444,0,0.0,0,0.0,6,81.0,0.08,0.133333,0.0,1.0,May,1,1,2,4,Returning_Visitor,True,False
3890,2,235.0,0,0.0,23,1193.0,0.0,0.008333,45.6125,0.0,May,3,2,7,2,New_Visitor,False,True
6278,8,89.9,3,20.0,84,1854.915556,0.004211,0.012386,3.170426,0.0,Jul,2,2,7,1,Returning_Visitor,False,False


## Feature Encoding/Scaling using Data Pipeline

Feature Encoding/Scaling is the process of bringing raw data into a format best suited for machine learning algorithms. A number of algorithms only support encoded categorical features. Similarly, a lot of algorithms can only give correct predictions when the continuous features have been scaled (Eg. distance-based algorithms like KNN). Instead of doing the work manually, we shall use Pipeline and ColumnTransformer from the Scikit-Learn library alongside custom functions to automate the task.

Here's the **step-by-step** process:
- Make all the necessary imports (OneHotEncoder, StandardScaler etc.)
- Define a **numeric_pipeline** function for continuous attributes that returns a Pipeline for imputation and scaling alongside an array having the names of all continuous attributes
- Create a master **data_transform** function that uses OneHotEncoder() and numeric_pipeline to transform categorical and continuous attributes respectively. It returns the features as a pandas DataFrame and the label(target) as a pandas Series.

In [12]:
##Necessary imports
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
##for continuous features
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
##for categorical features
from sklearn.preprocessing import OneHotEncoder

In [13]:
def numeric_pipeline(data):
    ##list of all numeric attributes in data
    num_attrs=data.select_dtypes(include=['int64','float64']).columns
    ##pipeline for numeric data tranformation
    num_pipeline=Pipeline([('impute',SimpleImputer(strategy='median')),
                          ('scale',StandardScaler(with_mean=False))])
    return num_attrs, num_pipeline

In [14]:
def data_transform(data):
    
    ##convert categories in the following categorical cols to string from int to avoid confusion
    cols=['OperatingSystems','Browser','Region','TrafficType','Weekend','Revenue']
    for col in cols:
        df[col]=df[col].astype(str)
        
    ##use numeric_pipeline func to get num_attrs and num_pipeline
    num_attrs, num_pipeline=numeric_pipeline(data)
    ##creating a list of categorical features
    cat_attrs=data.select_dtypes(exclude=['int64','float64']).columns
    
    ##sparse_threshold=0 ensures we main_pip always returns a dense array
    main_pip=ColumnTransformer([('num',num_pipeline,num_attrs),
                               ('cat',OneHotEncoder(),cat_attrs)],sparse_threshold=0)
    
    ##data in sparse matrix form
    my_data=main_pip.fit_transform(data)
    
    ##Creating labels for our data
    labels=[]
    for col in data.columns:
        if col in num_attrs:
            labels.append(col)
        else:
            labels.extend(np.sort([f'{col}_{x}' for x in df[col].unique()]))
            
    ##Creating a dataframe using the data and the labels
    transformed_df=pd.DataFrame(data=my_data,columns=labels)
    
    ##'Weekend' and 'Revenue' are boolean columns and hence can be represented by one column alone
    transformed_df.drop(['Revenue_False'],axis=1,inplace=True)
    
    ##Separating the dataframe into feature and label sets
    features, label=transformed_df.drop('Revenue_True',axis=1), transformed_df['Revenue_True']
    
    
    return features, label

In [15]:
##Creating features and label sets for train data
X_train, y_train=data_transform(train)

In [16]:
X_train.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,...,Month_Sep,OperatingSystems,Browser,Region,TrafficType,VisitorType_New_Visitor,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_False,Weekend_True
0,0.0,0.0,0.0,0.0,0.2429,0.13709,0.0,1.035526,0.0,2.026388,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.09716,0.0,4.150848,4.142104,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.922185,0.198204,0.0,0.0,0.2429,0.169296,0.148245,0.542418,1.73135,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.536975,1.564863,0.827889,0.0,2.841935,4.541287,0.267734,0.433847,0.15958,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.340061,0.111675,0.0,0.098622,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [17]:
y_train.head()

0    0.0
1    0.0
2    1.0
3    0.0
4    0.0
Name: Revenue_True, dtype: float64

## Model Selection

We will now use the cleaned and transformed data to select the best classifiction algorithm for our data. We will perform cross-validation on X_train and y_train for each algorithm and score them based on accuracy. We will select the algorithm that gives the best result.

**Models** to evaluate:
- Decision Trees Classifier
- K Neighbors Classifier
- Support Vector Classifier

Here's **the step-by-step** process:
- Import the necessary algorithms and cross_val_score from scikit-learn
- Evaluate every algorithm on X_train and y_train
- Select the algorithm that gives the best score

In [18]:
##import all algorithms to be tested
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [19]:
##import cross_val_score
from sklearn.model_selection import cross_val_score

In [20]:
##create a function to evaluate model
def model_evaluation(model,X,y,folds):
    scores=cross_val_score(estimator=model,X=X,y=y,cv=folds,scoring='accuracy')
    print(f'{str(model)}')
    print(f'scores: {scores}')
    print(f'mean_score: : {np.mean(scores)}')

In [21]:
##evaluating DecisionTreeClassifier()
model_evaluation(model=DecisionTreeClassifier(),X=X_train,y=y_train,folds=5)

DecisionTreeClassifier()
scores: [0.86578546 0.86171835 0.86113937 0.87080366 0.85300102]
mean_score: : 0.8624895723486354


In [22]:
##evaluating KNeighborsClassifier()
model_evaluation(model=KNeighborsClassifier(),X=X_train,y=y_train,folds=5)

KNeighborsClassifier()
scores: [0.88103711 0.87595323 0.8733469  0.88453713 0.87283825]
mean_score: : 0.8775425238717578


In [23]:
##evaluating SVC()
model_evaluation(model=SVC(),X=X_train,y=y_train,folds=5)

SVC()
scores: [0.89425521 0.88917133 0.89165819 0.90335707 0.88402848]
mean_score: : 0.8924940563033699


Model Selected : **Support Vector Classifier**

## Hyper-Parameter Tuning

Hyperparameters are the configurations of a machine learning model that can be set externally by the user. They differ greatly from parameters, which are the configurations that the algorithm learns by itself during training. Hyperparameters give us some control over the learning process of the algorithm and correctly tuning then can greatly improve performance.

For the purpose of this project we will use **GridSearchCV** from sklearn.model_selection for hyperparameter-tuning. It takes in a parameter grid for the model and evaluates all possible combinations of hyperparameters before giving us the best combination.

Here's the **step-by-step** process:

- Import GridSearchCV from sklearn.model_selection
- Specify a parameter grid for SVC()
- Instantiate a GridSearchCV instance
- Fit the instance with X_train and y_train
- Create a model with the best hyperparameter combination

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
##'C' introduces a penalty for every mis-classified point - High value of 'C' means greater variance - Low 'C' means high bias
##'kernel' specifies the kernel function for the classifier
##'degree' specifies the degree of the kernel if it is polynomial
param_grid={'C':np.linspace(0.2,0.4,2),
           'kernel':['poly'],
           'degree':[1,2,3]}

In [26]:
##Instantiating
gscv=GridSearchCV(estimator=SVC(),param_grid=param_grid,cv=5,verbose=4,scoring='accuracy')

In [27]:
##Fit with train data
gscv.fit(X_train,y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] C=0.2, degree=1, kernel=poly ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ C=0.2, degree=1, kernel=poly, score=0.886, total=   1.3s
[CV] C=0.2, degree=1, kernel=poly ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] ........ C=0.2, degree=1, kernel=poly, score=0.882, total=   1.3s
[CV] C=0.2, degree=1, kernel=poly ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV] ........ C=0.2, degree=1, kernel=poly, score=0.886, total=   1.4s
[CV] C=0.2, degree=1, kernel=poly ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.9s remaining:    0.0s


[CV] ........ C=0.2, degree=1, kernel=poly, score=0.895, total=   1.4s
[CV] C=0.2, degree=1, kernel=poly ....................................
[CV] ........ C=0.2, degree=1, kernel=poly, score=0.878, total=   1.3s
[CV] C=0.2, degree=2, kernel=poly ....................................
[CV] ........ C=0.2, degree=2, kernel=poly, score=0.890, total=   1.5s
[CV] C=0.2, degree=2, kernel=poly ....................................
[CV] ........ C=0.2, degree=2, kernel=poly, score=0.883, total=   1.4s
[CV] C=0.2, degree=2, kernel=poly ....................................
[CV] ........ C=0.2, degree=2, kernel=poly, score=0.887, total=   1.5s
[CV] C=0.2, degree=2, kernel=poly ....................................
[CV] ........ C=0.2, degree=2, kernel=poly, score=0.893, total=   1.8s
[CV] C=0.2, degree=2, kernel=poly ....................................
[CV] ........ C=0.2, degree=2, kernel=poly, score=0.880, total=   1.5s
[CV] C=0.2, degree=3, kernel=poly ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   48.8s finished


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([0.2, 0.4]), 'degree': [1, 2, 3],
                         'kernel': ['poly']},
             scoring='accuracy', verbose=4)

In [28]:
##Check best hyper-parameter combination
gscv.best_params_

{'C': 0.4, 'degree': 2, 'kernel': 'poly'}

In [29]:
##Check the score of the best estimator
gscv.best_score_

0.8882221455645827

In [30]:
##Create best model
svc_classifier=SVC(C=0.4,degree=2,kernel='poly')

## Final Evaluation

In this section we will fit our model on the train data and evaluate it on the test data. We will use roc_auc score for scoring.

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
##fitting the model with the train data
svc_classifier.fit(X_train,y_train)

SVC(C=0.4, degree=2, kernel='poly')

In [33]:
##writing a function to do all necessary transformations and evaluate the model

In [34]:
def final_evaluation(data, model):
    
    ##convert the test data into a dataframe if it is in dict format
    if type(data)==dict:
        ##convert the data into dataframe if in dict format
        data = pd.DataFrame(data)
    else:
        data = data
    
    ##pre-process the data using our custom function
    features, label=data_transform(data)
    
    ##make predictions using the model
    predictions=model.predict(features)
    
    ##roc_auc score for the model
    score=accuracy_score(label, predictions)
    
    return predictions, score

In [35]:
## evaluating model against test data
predictions, score = final_evaluation(test, svc_classifier)

In [36]:
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
score

0.887713588283157

<span style="color:red">The data_transform function worked fine on the train and test sets but is now showing an error when used with a part of those very sets. I'm unable to understand the error. Please help.</span>.

In [38]:
data_transform(test[:10])

ValueError: Shape of passed values is (10, 26), indices imply (10, 31)