In [9]:
from sklearn.preprocessing import StandardScaler, TargetEncoder, FunctionTransformer,MinMaxScaler,RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import normaltest, t
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import normaltest

In [8]:
pip install category_encoders

Note: you may need to restart the kernel to use updated packages.


In [10]:
ecommerce = pd.read_csv('ecommerce_customer_behavior_dataset_v2.csv')
ecommerce.head()

Unnamed: 0,Order_ID,Customer_ID,Date,Age,Gender,City,Product_Category,Unit_Price,Quantity,Discount_Amount,Total_Amount,Payment_Method,Device_Type,Session_Duration_Minutes,Pages_Viewed,Is_Returning_Customer,Delivery_Time_Days,Customer_Rating
0,ORD_000001-1,CUST_00001,2023-05-29,40,Male,Ankara,Books,29.18,1,0.0,29.18,Digital Wallet,Mobile,14,9,True,13,4
1,ORD_000001-2,CUST_00001,2023-10-12,40,Male,Ankara,Home & Garden,644.4,1,138.05,506.35,Credit Card,Desktop,14,8,True,6,2
2,ORD_000001-3,CUST_00001,2023-12-05,40,Male,Ankara,Sports,332.82,5,0.0,1664.1,Credit Card,Mobile,15,10,True,9,4
3,ORD_000002-1,CUST_00002,2023-05-11,33,Male,Istanbul,Food,69.3,5,71.05,275.45,Digital Wallet,Desktop,16,13,True,4,4
4,ORD_000002-2,CUST_00002,2023-06-16,33,Male,Istanbul,Beauty,178.15,3,0.0,534.45,Credit Card,Mobile,14,7,True,6,4


In [11]:
def remove_outliers(ecommerce):
  df = ecommerce.copy()
  numerical_col = df.select_dtypes(include='number').columns

  for col in numerical_col:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5*iqr
    upper = q3 + 1.5*iqr
    df = df[(df[col]>=lower) & (df[col]<=upper)]
  return df
ecommerce_df = remove_outliers(ecommerce)

display(ecommerce_df.describe())
ecommerce_df.select_dtypes(include='object')
numerical_cols = ecommerce_df.select_dtypes(include='number').columns

Unnamed: 0,Age,Unit_Price,Quantity,Discount_Amount,Total_Amount,Session_Duration_Minutes,Pages_Viewed,Delivery_Time_Days,Customer_Rating
count,11143.0,11143.0,11143.0,11143.0,11143.0,11143.0,11143.0,11143.0,11143.0
mean,34.877232,157.155729,2.785695,8.154865,367.236434,14.586826,8.987975,6.209459,3.892668
std,10.895104,159.282077,1.413332,15.966977,334.653711,2.859696,2.255697,2.987185,1.133743
min,18.0,5.05,1.0,0.0,6.21,7.0,1.0,1.0,1.0
25%,27.0,53.43,2.0,0.0,119.48,13.0,7.0,4.0,3.0
50%,35.0,102.17,3.0,0.0,248.4,15.0,9.0,6.0,4.0
75%,42.0,202.06,4.0,8.715,509.8,17.0,10.0,8.0,5.0
max,66.0,1121.84,5.0,69.15,1520.0,23.0,17.0,14.0,5.0


In [12]:
from category_encoders import TargetEncoder

X = ecommerce_df[['Product_Category','Quantity','Payment_Method','City']]
y = ecommerce_df['Total_Amount']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include='object').columns

def log_transform(X):
  return np.log1p(X)
log_transformer = FunctionTransformer(log_transform, validate=False)

y_train_log = np.log1p(y_train)

y_test_log = np.log1p(y_test)

numerical_pipeline = Pipeline(steps=[
    ('log', log_transformer ),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('encoder', TargetEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

params = {
    'regressor__alpha': [0.0001, 0.001, 0.01,0.1]
}

grid = GridSearchCV(
    pipeline,
    params,
    cv=5,
    scoring='r2',
    n_jobs = -1
)

grid.fit(X_train, y_train_log)
model = grid.best_estimator_
prediction = model.predict(X_test)
train_prediction = model.predict(X_train)

mse = mean_squared_error(y_test_log, prediction)
rmse = np.sqrt(mse)

r2 = r2_score(y_test_log, prediction)

train_mse = mean_squared_error(y_train_log, train_prediction)
train_r2 = r2_score(y_train_log, train_prediction)
train_rmse = np.sqrt(train_mse)

print('Best Paramter:', grid.best_params_)
print(f'Test MSE:{mse:.4f}')
print(f'Train MSE: {train_mse:.4f}')
print(f'Train RMSE: {train_rmse:.4f}')
print(f'Test RMSE: {rmse:.4f}')
print(f'Test R2 : {r2:.4f}')
print(f'Train R2 : {train_r2:.4f}')

new_data = pd.DataFrame({
    'Quantity': [5, 8],
    'Product_Category':['Books','Sports'],
    'Payment_Method': ['Credit Card', 'Digital Wallet'],
    'City': ['Istanbul', 'UK']
})


log_pred = model.predict(new_data)

original_pred = np.expm1(log_pred)


for i, value in enumerate(original_pred, start=1):
  print(f'Predicted Amount for Person {i} : {value:.4f}')

residuals = y_test_log - prediction
stat, p = normaltest(residuals)
if p > 0.05:
  print('Model is likely bias')
else:
  print('Model is likely unbias')

sns.histplot(residuals, kde=True)
plt.title("Residual Distribution")
plt.show()

plt.scatter(prediction, residuals)
plt.axhline(0, color='red')
plt.xlabel("Predicted")
plt.ylabel("Residuals")
plt.title("Residuals vs Predictions")
plt.show()

import scipy.stats as stats
import matplotlib.pyplot as plt

stats.probplot(residuals, dist='norm', plot=plt)
plt.title("Q-Q Plot of Residuals")
plt.show()

categorical_features = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_cols)

all_features = np.concatenate([categorical_features, numerical_cols])

feature_importance = pd.DataFrame({
    'feature': all_features,
    'importance': model.named_steps['regressor'].coef_
}).sort_values(by='importance', ascending=False)

display(feature_importance)

sns.barplot(data=feature_importance,
            y='feature',
            x='importance',
            hue='feature',
            palette='Set2')
plt.title('Feature Importance')
plt.show()





ValueError: 
All the 20 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/joblib/memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 1001, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 910, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 77, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/joblib/parallel.py", line 1986, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/joblib/parallel.py", line 1914, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/utils/parallel.py", line 139, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 730, in fit_transform
    return last_step.fit_transform(
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/category_encoders/utils.py", line 474, in fit_transform
    return self.fit(X, y, **fit_params).transform(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/category_encoders/utils.py", line 299, in fit
    self._check_fit_inputs(X, y)
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/category_encoders/utils.py", line 336, in _check_fit_inputs
    if self._get_tags().get('supervised_encoder'):
       ^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 417, in _get_tags
    return _to_old_tags(get_tags(self))
                        ^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/utils/_tags.py", line 430, in get_tags
    sklearn_tags_provider[klass] = klass.__sklearn_tags__(estimator)  # type: ignore[attr-defined]
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/win/traeProject/ecomML/.venv/lib/python3.12/site-packages/sklearn/base.py", line 859, in __sklearn_tags__
    tags = super().__sklearn_tags__()
           ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'super' object has no attribute '__sklearn_tags__'
