In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv("data/cleaned_data.csv")
df.head()

Unnamed: 0,id,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,...,Department,Bar For Salad,Coffee Bar,Ready Food,Video Store,Florist,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,...,Household,1,1,1,1,1,28.2,26.6,1.6,10000.0
1,mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,...,Snack Foods,0,0,0,0,0,16.57,14.97,1.6,50000.0
2,mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,...,Periodicals,0,0,0,0,1,28.64,27.18,1.45,30000.0
3,mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,...,Produce,1,1,1,1,1,12.62,9.71,2.91,50000.0
4,mc_ID_5,S03le_WA,Weekend Discount,Supermarket,5220000.0,1618200.0,yes,1970.17,1236.07,440.92,...,Household,0,1,0,0,0,15.41,13.95,1.45,30000.0


In [4]:
target = "Cost"
X = df.drop(columns=target)
y = df[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
ord_categorical_columns = ['Place Code', 'Promotion Name', 'Store Kind', 'Is Recyclable?',
                            'Children', 'Degree', 'Work', 'Oreder Brand',
                            'Product', 'Department']

hot_categorical_columns = ['Marriage', 'Gender']

all_categorical_columns = X.select_dtypes("object").columns

numeric_columns = X.select_dtypes("float").columns

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), ord_categorical_columns),
        ('ohe', OneHotEncoder(), hot_categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

In [8]:
model = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=20, random_state=42)
)
model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [9]:
# Check if the model fitted well or not
predictions = model.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

16.24279568376655

In [10]:
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

66.24441153879552

In [11]:
rfmodel = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=100, random_state=42)
)
rfmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [12]:
# Check if the model fitted well or not
predictions = rfmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

19.261730227407828

In [13]:
predictions = rfmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

53.051258803066865

In [14]:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

scores = cross_val_score(rfmodel, X_train, y_train, cv=5, scoring=rmse_scorer)

positive_scores = -scores

print("Cross-validation RMSE scores:", positive_scores)

Cross-validation RMSE scores: [53.68760687 56.09358958 52.34657331 52.12778238 50.49838727]


In [15]:
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Try Lazypredict

In [21]:
reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train_transformed, X_test_transformed, y_train, y_test)

NameError: name 'X_train_transformed' is not defined

In [None]:
print(models)

## Extract submissions

In [None]:
samples = pd.read_csv('data/sample_submission.csv')
samples.head()

In [None]:
test = pd.read_csv("data/test.csv")
test.head()

In [None]:
test.isna().sum()

In [None]:
y_sub_pred = rfmodel.predict(test)

In [None]:
samples["Cost"] = y_sub_pred

In [None]:
samples.to_csv("data/submission.csv")