In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv(r'Exploration.csv')

In [3]:
pd.set_option('display.max_columns', 23)
df.head()

Unnamed: 0,Promotion Name,Store Kind,Store Sales,Is Recyclable?,Yearly Income,Store Area,Meat Area,Cost,Gross Weight,Package Weight,City Code,Country Code,Status,Number of Children,Working Status,Sex,Department,Ordered_Brand,Product_Type,Video Store,Florist,Coffee Bar,Ready Food
0,Dimes Off,Deluxe,8760000.0,yes,less_70,2842.23,323.0,602.7575,28.1997,1.599,H11go,ZA,Single,4.0,professional,Female,Household,Red Wing,Cleaning Supplies,1.0,1.0,1.0,1.0
1,Budget Bargains,Supermarket,6360000.0,no,less_70,2814.95,,708.665,16.571,1.599,S04ne,WA,Single,3.0,management,Female,Snack Foods,Nationeel,Snack Foods,,,,
2,Shelf Emptiers,Supermarket,10860000.0,yes,less_70,2192.32,348.85,564.2647,28.6358,1.4536,L05es,CA,Married,2.0,skilled manual,Male,Periodicals,Excel,Magazines,0.0,1.0,0.0,0.0
3,Savings Galore,,1980000.0,yes,less_70,1974.73,293.95,724.5119,14.2161,2.9217,S03le,WA,Single,3.0,professional,Female,Dairy,Carlson,Dairy,0.0,0.0,1.0,0.0
4,Sale Winners,Deluxe,11560000.0,no,less_70,2862.3,395.95,519.7574,12.6172,2.9072,M10da,YU,Married,5.0,professional,Female,Produce,Hermanos,Vegetables,1.0,1.0,1.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33007 entries, 0 to 33006
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Promotion Name      33007 non-null  object 
 1   Store Kind          28202 non-null  object 
 2   Store Sales         32995 non-null  float64
 3   Is Recyclable?      33005 non-null  object 
 4   Yearly Income       33000 non-null  object 
 5   Store Area          29995 non-null  float64
 6   Meat Area           29995 non-null  float64
 7   Cost                33007 non-null  float64
 8   Gross Weight        32356 non-null  float64
 9   Package Weight      30721 non-null  float64
 10  City Code           33007 non-null  object 
 11  Country Code        33007 non-null  object 
 12  Status              33007 non-null  object 
 13  Number of Children  26565 non-null  float64
 14  Working Status      33007 non-null  object 
 15  Sex                 33007 non-null  object 
 16  Depa

In [5]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].astype('category')

In [6]:
# Separate features and target
X = df.drop(['Cost'], axis=1)

y = df['Cost']

# Define columns by data type
numeric_cols = [col for col in X.select_dtypes(include=['float64']).columns]
categorical_cols = [col for col in X.select_dtypes(include=['category']).columns]

In [7]:
# Assuming you have defined X and y elsewhere

# Split your data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42, shuffle=True)

# Define your preprocessing steps for numeric data
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=2)),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('BinaryEncoder', ce.BinaryEncoder(return_df=True, handle_unknown='ignore'))

])

# Define the numeric and categorical column names
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Create the main pipeline including preprocessing and the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Define the parameter grid to search
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create RandomizedSearchCV with cross-validation
Randomized_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit the randomized search to your data
Randomized_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = Randomized_search.best_params_
best_rf_model = Randomized_search.best_estimator_

# Make predictions on the test set using the best_rf_model
predictions = best_rf_model.predict(X_test)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


49 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\SALAH MAHMOUD\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\SALAH MAHMOUD\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SALAH MAHMOUD\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_

In [8]:
# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)

# Calculate RMSE
rmse_test = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error test: {rmse_test}")
y_train_pred = best_rf_model.predict(X_train)
# Calculate RMSE
rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
print(f"Root Mean Squared Error train: {rmse_train}")

Root Mean Squared Error test: 52.67936442993521
Root Mean Squared Error train: 34.12940878768222


In [9]:
import joblib

In [10]:
joblib.dump(best_rf_model, 'final_model.joblib')


['final_model.joblib']

In [11]:
X.columns

Index(['Promotion Name', 'Store Kind', 'Store Sales', 'Is Recyclable?',
       'Yearly Income', 'Store Area', 'Meat Area', 'Gross Weight',
       'Package Weight', 'City Code', 'Country Code', 'Status',
       'Number of Children', 'Working Status', 'Sex', 'Department',
       'Ordered_Brand', 'Product_Type', 'Video Store', 'Florist', 'Coffee Bar',
       'Ready Food'],
      dtype='object')