### Online_shoppers_intention

https://www.kaggle.com/datasets/henrysue/online-shoppers-intention

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, confusion_matrix, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('online_shoppers_intention.csv')
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
print(df.columns)
print("------------------------------------------------------")
print(df.dtypes)
print("------------------------------------------------------")
print(df.shape)

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')
------------------------------------------------------
Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
R

In [4]:
def detail_info(data):
    temp_df = pd.DataFrame(index= data.columns)
    
    temp_df['data_type'] = data.dtypes
    temp_df['unique_val'] = data.nunique()
    temp_df['duplicate_val'] = data.duplicated().sum()
    temp_df['missing_val'] = data.isnull().sum()
    temp_df['missing_val_%'] = round(data.isnull().mean()*100,2)
    
    return temp_df

detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
Administrative,int64,27,125,0,0.0
Administrative_Duration,float64,3335,125,0,0.0
Informational,int64,17,125,0,0.0
Informational_Duration,float64,1258,125,0,0.0
ProductRelated,int64,311,125,0,0.0
ProductRelated_Duration,float64,9551,125,0,0.0
BounceRates,float64,1872,125,0,0.0
ExitRates,float64,4777,125,0,0.0
PageValues,float64,2704,125,0,0.0
SpecialDay,float64,6,125,0,0.0


In [5]:
df.drop_duplicates(keep ='first', inplace=True)
df.reset_index(drop=True, inplace=True)

detail_info(df)

Unnamed: 0,data_type,unique_val,duplicate_val,missing_val,missing_val_%
Administrative,int64,27,0,0,0.0
Administrative_Duration,float64,3335,0,0,0.0
Informational,int64,17,0,0,0.0
Informational_Duration,float64,1258,0,0,0.0
ProductRelated,int64,311,0,0,0.0
ProductRelated_Duration,float64,9551,0,0,0.0
BounceRates,float64,1872,0,0,0.0
ExitRates,float64,4777,0,0,0.0
PageValues,float64,2704,0,0,0.0
SpecialDay,float64,6,0,0,0.0


In [6]:
string_features = []
for feature in df.columns:
    if df[feature].dtype == "O" or df[feature].dtype == bool:
        string_features.append(feature)    

string_features

['Month', 'VisitorType', 'Weekend', 'Revenue']

In [7]:
# Encoding the output feature

def feature_encoding(df, col):
    temp_list = list(sorted(df[col].unique()))
    return dict(zip(temp_list, range(len(temp_list)))) 

In [8]:
transformed_dict = {} 

for feature in df[string_features]:
    feature_dict = feature_encoding(df, feature)
    transformed_dict[feature] = feature_dict

    df[feature] = df[feature].map(feature_dict)
    
transformed_dict

{'Month': {'Aug': 0,
  'Dec': 1,
  'Feb': 2,
  'Jul': 3,
  'June': 4,
  'Mar': 5,
  'May': 6,
  'Nov': 7,
  'Oct': 8,
  'Sep': 9},
 'VisitorType': {'New_Visitor': 0, 'Other': 1, 'Returning_Visitor': 2},
 'Weekend': {False: 0, True: 1},
 'Revenue': {False: 0, True: 1}}

In [9]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,2,1,0


In [10]:
for feature, feature_dict in transformed_dict.items():
    print(f"{feature} dictionary: {feature_dict}")

Month dictionary: {'Aug': 0, 'Dec': 1, 'Feb': 2, 'Jul': 3, 'June': 4, 'Mar': 5, 'May': 6, 'Nov': 7, 'Oct': 8, 'Sep': 9}
VisitorType dictionary: {'New_Visitor': 0, 'Other': 1, 'Returning_Visitor': 2}
Weekend dictionary: {False: 0, True: 1}
Revenue dictionary: {False: 0, True: 1}


In [11]:
# Split the dataset into features and target variable
X = df.drop(['Revenue'], axis=1)
y = df['Revenue']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Predict the revenue for the test set
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy", test_accuracy)

# Evaluate the model's performance
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy 0.9041376485047112
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      2079
           1       0.74      0.54      0.63       362

    accuracy                           0.90      2441
   macro avg       0.83      0.76      0.79      2441
weighted avg       0.90      0.90      0.90      2441

[[2010   69]
 [ 165  197]]


In [17]:
# For informational duration

X_dur = df.drop(['Informational_Duration'], axis=1)
y_dur = df['Informational_Duration']

X_dur_train, X_dur_test, y_dur_train, y_dur_test = train_test_split(X_dur, y_dur, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

model = GradientBoostingRegressor()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_dur_train, y_dur_train)
best_model = grid_search.best_estimator_

y_dur_pred = best_model.predict(X_dur_test)

rmse = np.sqrt(mean_squared_error(y_dur_test, y_dur_pred))
print('Root Mean Squared Error:', rmse)

Root Mean Squared Error: 123.37377578561536
