In [9]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score,classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("shop_smart_ecommerce.csv")
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [3]:
X = df.drop(["Revenue"],axis = 1)
y = df["Revenue"].astype (int)

In [4]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Revenue, dtype: int32

In [5]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [6]:
num_feature = X.select_dtypes(include = ["int","float"]).columns
cat_feature = X.select_dtypes(include = ["object","category"]).columns

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42,stratify = y)

In [12]:
# preprocessing different calumns pr diff.preprocessing used  -- ColumnTransformer
# preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers = [
        ("num",StandardScaler(),num_feature),
        ("cat",OneHotEncoder(),cat_feature)
    ]
)


In [13]:
dt = DecisionTreeClassifier(
    max_depth = 5,
    min_samples_leaf = 30,
    class_weight = "balanced",
    random_state = 42
)

In [14]:
pipe = Pipeline(
    steps = [
        ("preprocess",preprocessor),
        ("model",dt)
    ]
    
)

In [15]:
pipe.fit(X_train,y_train)

In [17]:
y_pred = pipe.predict(X_test)
print("F1 score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

F1 score: 0.6251236399604352

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.85      0.90      2084
           1       0.50      0.83      0.63       382

    accuracy                           0.85      2466
   macro avg       0.73      0.84      0.76      2466
weighted avg       0.89      0.85      0.86      2466


Confusion Matrix:
 [[1771  313]
 [  66  316]]


In [19]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "model__max_depth" : [4,6,8],
    "model__min_samples_leaf":[20,30,50]
}
grid = GridSearchCV(
    pipe,
    param_grid,
    cv = 5,
    scoring = "f1",
    n_jobs = -1
)
grid.fit(X_train,y_train)
print("Best score",grid.best_score_)
print("Best Params",grid.best_params_)

Best score 0.6343735129725652
Best Params {'model__max_depth': 4, 'model__min_samples_leaf': 50}
