In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
# Configure seaborn's global plot theme once, before any figure is drawn.
sns.set_theme(color_codes = True)

In [None]:
# Load the pizza dataset into `df` and preview the first rows.
# NOTE(review): the absolute "/content/..." path looks environment-specific
# (presumably Colab) — confirm or make it configurable before running elsewhere.
df = pd.read_csv("/content/pizza_v1.csv")
df.head()

# **Data Preprocessing**

In [None]:
# Strip the "Rp" currency prefix and the comma thousands separators from the
# "price_rupiah" column so that it can later be converted to a number.
# regex=False forces plain literal matching; the default of `regex` differs
# between pandas versions, so being explicit keeps the result the same everywhere.
df["price_rupiah"] = (
    df["price_rupiah"]
    .str.replace('Rp', '', regex=False)
    .str.replace(',', '', regex=False)
)

In [None]:
# Preview the frame again to check the cleaned "price_rupiah" values.
df.head()

In [None]:
# Count the distinct values in each object-dtype column.
object_frame = df.select_dtypes(include='object')
object_frame.nunique()

In [None]:
# Convert the cleaned "price_rupiah" column to integer
df['price_rupiah'] = df['price_rupiah'].astype(int)

# **Segment Pizza Variant**

In [None]:
# List the raw variant names before they are grouped into segments.
df['variant'].unique()

In [None]:
# Map a raw pizza variant name onto a coarse segment label.
def segment_variant(variant):
    """Return the segment label for one pizza variant name.

    The name is checked against keyword groups in a fixed order and the first
    group with a keyword contained in ``variant`` wins. Matching is
    case-sensitive.

    Parameters
    ----------
    variant : str
        Raw variant name, e.g. ``'italian_veggie'``.

    Returns
    -------
    str
        ``'Vegetarian'``, ``'Meat'``, ``'Seafood'`` or, when no keyword
        matches, ``'Other'``.
    """
    # Order matters: earlier rules take precedence over later ones.
    keyword_rules = (
        (('veggie',), 'Vegetarian'),
        (('meat', 'BBQ'), 'Meat'),
        (('tuna',), 'Seafood'),
    )
    for keywords, segment in keyword_rules:
        if any(keyword in variant for keyword in keywords):
            return segment
    return 'Other'
  
# Apply segment_variant to every value of the 'variant' column. The result is
# written back to 'variant' itself (no new column is created), so re-running
# this cell would feed the already-segmented labels through the function again.
df['variant'] = df['variant'].apply(segment_variant)

In [None]:
# Bar chart of the number of rows in each variant segment.
fig_variant, ax_variant = plt.subplots(figsize=(10, 5))
df['variant'].value_counts().plot(kind='bar', ax=ax_variant)

## Exploratory Data Analysis

In [None]:
# Categorical variables whose mean price is plotted, one subplot each
cat_vars = ['company', 'topping', 'variant', 'size', 'extra_sauce', 'extra_cheese']
# Create a 2 x 3 grid of subplots and flatten it so it can be indexed with a single counter
fig, axs = plt.subplots(nrows = 2, ncols = 3, figsize = (20,10))
axs = axs.ravel()
# Draw a bar plot of the mean price for each categorical variable
for i, var in enumerate(cat_vars):
    sns.barplot(x=var, y='price_rupiah', data = df, ax = axs[i], estimator = np.mean)
    # Rotate the tick labels through tick_params rather than re-assigning the
    # label text with set_xticklabels(get_xticklabels()), which Matplotlib
    # discourages because the labels become detached from the tick positions.
    axs[i].tick_params(axis = 'x', labelrotation = 90)

# Adjust spacing between subplots
fig.tight_layout()
plt.show()


In [None]:
# Box plot of the 'diameter' column.
sns.boxplot(x = 'diameter', data = df)

In [None]:
# Violin plot of the 'diameter' column.
sns.violinplot(data=df, x='diameter')

In [None]:
# Scatter plot of price against diameter, with points coloured by company.
sns.scatterplot(data = df, x = 'diameter', y = 'price_rupiah', hue = 'company')

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
# Preview the last five rows of the current frame.
df.tail(5)

In [None]:
# Percentage of missing values per column; only columns that actually have
# missing values are shown, largest share first.
n_rows = df.shape[0]
null_counts = df.isnull().sum()
check_missing = null_counts * 100 / n_rows
check_missing.loc[check_missing > 0].sort_values(ascending=False)

# Label Encoding for Object datatype 

In [None]:
# Collect the object-dtype columns first, then print each column name
# together with the distinct values it holds.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    print(f"{col}: {df[col].unique() } ")

In [None]:
from sklearn import preprocessing 
# Fitted encoders are kept per column so the integer codes can be mapped back
# to the original categories later, e.g. with
# label_encoders['company'].inverse_transform(...).
label_encoders = {}
# Loop over each column in the DataFrame where dtype is 'object'
for col in df.select_dtypes(include = ['object']).columns:
    # One encoder per column: fit it on the column and replace the values
    # with their integer codes in a single step
    label_encoder = preprocessing.LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder
    # Print the column name & the unique encoded values
    print(f"{col}: {df[col].unique()}")

I will not remove the outliers because the dataset is very small.

In [None]:
# Correlation heatmap: compute the pairwise correlation matrix first, then
# draw it with each cell annotated using the '.2g' number format.
corr_matrix = df.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=True, fmt='.2g')

# *Train Test Split*

In [None]:
# Target: the price column. Features: every other column of the frame.
y = df['price_rupiah']
x = df.drop(columns='price_rupiah')

In [None]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# random_state = 0 is passed so the split is the same on every run.
from sklearn.model_selection import train_test_split
# NOTE(review): accuracy_score is a classification metric and is not used in
# any of the visible cells — candidate for removal.
from sklearn.metrics import accuracy_score
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

## Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Create the DecisionTreeRegressor to tune. The seed is fixed so that the
# search returns the same best parameters on every run (the tree draws
# features at random when max_features is 'sqrt' or 'log2').
dtree = DecisionTreeRegressor(random_state = 0)
# Define the hyperparameters to tune & their values
param_grid = {
    'max_depth': [2,4,6,8],
    'min_samples_split': [2,4,6,8],
    'min_samples_leaf': [1,2,3,4],
    # None = consider every feature at each split. This is what the old
    # 'auto' alias meant for regression trees; 'auto' was removed in
    # scikit-learn 1.3 and now raises an error.
    'max_features': [None, 'sqrt', 'log2']
}
# Create a GridSearchCV object: 5-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(dtree, param_grid, cv = 5, scoring = 'neg_mean_squared_error')
# Fit the GridSearchCV object to the training data
grid_search.fit(x_train, y_train)
# Print the best hyperparameters found
print(grid_search.best_params_)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit the final decision tree with the chosen hyperparameters.
# max_features = None considers every feature at each split, which is what the
# former 'auto' value meant for regression trees; 'auto' itself was removed in
# scikit-learn 1.3 and would raise an error there.
dtree = DecisionTreeRegressor(random_state=0, max_depth = 8, max_features=None, min_samples_leaf = 1, min_samples_split= 2)
dtree.fit(x_train, y_train)

In [None]:
import math
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error

# Predict on the held-out features with the fitted decision tree.
y_pred = dtree.predict(x_test)

# Error metrics of the predictions against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

# Report the metrics in the order MAE, MAPE, MSE, R2, RMSE.
for template, value in (
    ('MAE is {}', mae),
    ('MAPE is {}', mape),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(value))

In [None]:
# Pair every training feature with the importance reported by the fitted
# decision tree.
imp_df = pd.DataFrame({
    'Feature Name': x_train.columns,
    'Importance': dtree.feature_importances_
})
# Most important features first; plot at most the top ten.
fi = imp_df.sort_values(by = "Importance", ascending = False)
fi2 = fi.head(10)
plt.figure(figsize = (10,8))
sns.barplot(data = fi2, x = 'Importance', y = 'Feature Name')
# Title spelling corrected (was "Imporrtance Attribbutes ... Rgressor").
plt.title('Feature Importance of Attributes (Decision Tree Regressor)', fontsize = 18)
plt.xlabel('Importance', fontsize = 16)
plt.ylabel('Feature Name', fontsize = 16)
plt.show()

In [None]:
pip install shap

In [None]:
import shap 
# Build a tree explainer for the fitted decision tree, compute SHAP values for
# the test features and draw the summary plot.
explainer = shap.TreeExplainer(dtree)
shap_values = explainer.shap_values(x_test)
shap.summary_plot(shap_values, x_test)

In [None]:
# Rebuild the explainer through the generic shap.Explainer entry point, with
# x_test passed as its second argument, and explain the test features.
# Note: this rebinds the notebook-level names `explainer` and `shap_values`.
explainer = shap.Explainer(dtree, x_test)
shap_values = explainer(x_test)
# Waterfall plot for the first explained row (index 0).
shap.plots.waterfall(shap_values[0])

# **Random** **Forest** **Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Create the Random Forest Regressor to tune. The seed is fixed so that the
# search is reproducible: a forest is built from random bootstrap samples and
# random feature subsets, so unseeded runs can pick different "best" parameters.
rf =  RandomForestRegressor(random_state = 0)
# Define the hyperparameter grid
param_grid = {
    'max_depth' : [3,5,7,9],
    'min_samples_split' : [2, 5, 10],
    'min_samples_leaf' : [1, 2, 4],
    # None = consider every feature at each split. This is what the old
    # 'auto' alias meant for forest regressors; 'auto' was removed in
    # scikit-learn 1.3 and now raises an error.
    'max_features' : [None,'sqrt']
}
# Create a GridSearchCV object: 5-fold cross-validation, scored by R2
grid_search = GridSearchCV(rf, param_grid, cv = 5, scoring = 'r2')
# Fit the GridSearchCV object to the training data
grid_search.fit(x_train, y_train)
# Print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit the final random forest with the chosen hyperparameters.
# max_features = None considers every feature at each split, which is what the
# former 'auto' value meant for forest regressors; 'auto' itself was removed in
# scikit-learn 1.3 and would raise an error there.
rf = RandomForestRegressor(random_state = 0, max_depth=9, min_samples_split = 2, min_samples_leaf = 1, max_features = None)
rf.fit(x_train, y_train)

In [None]:
import math
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error

# Predict on the held-out features with the fitted random forest.
y_pred = rf.predict(x_test)

# Error metrics of the predictions against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

# Report the metrics in the order MAE, MAPE, MSE, R2, RMSE.
for template, value in (
    ('MAE is {}', mae),
    ('MAPE is {}', mape),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(value))

In [None]:
# Pair every training feature with the importance reported by the fitted
# random forest. (This previously read dtree.feature_importances_, so the
# figure titled "Random Forest" actually showed the decision tree's values.)
imp_df = pd.DataFrame({
    "Feature Name" : x_train.columns,
    "Importance" : rf.feature_importances_
})
# Most important features first; plot at most the top ten.
fi = imp_df.sort_values(by = "Importance", ascending = False)
fi2 = fi.head(10)
plt.figure(figsize = (10,8))
sns.barplot(data = fi2, x = 'Importance', y = 'Feature Name')
# Title spelling and grammar corrected (was "Each Attributes ... Reggressor").
plt.title('Feature Importance of Each Attribute (Random Forest Regressor)', fontsize = 18)
plt.xlabel('Importance', fontsize = 16)
plt.ylabel('Feature Name', fontsize = 16)
plt.show()

In [None]:
import shap 
# Build a tree explainer for the fitted random forest, compute SHAP values for
# the test features and draw the summary plot.
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(x_test)
shap.summary_plot(shap_values, x_test)

In [None]:
# Rebuild the explainer for the random forest, with x_test passed as its second
# argument. The variable was previously misspelled "expaliner", so the next
# line silently reused the explainer left over from the summary-plot cell.
explainer = shap.Explainer(rf, x_test)
# check_additivity is an option of the explain call, not of the constructor,
# so it is passed only here.
shap_values = explainer(x_test, check_additivity = False)
# Waterfall plot for the first explained row (index 0).
shap.plots.waterfall(shap_values[0])

# **Adaboost** **Regressor**

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
# Define the AdaBoostRegressor to tune. The seed is fixed so that the search
# returns the same best parameters on every run.
abr = AdaBoostRegressor(random_state = 0)
# Define hyperparameters & possible values
params  ={'n_estimators' : [50,100,150],
          'learning_rate' : [0.01, 0.1 ,1 ,10]
          }
# Perform GridSearchCV with 5-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(abr, param_grid = params, cv = 5, scoring = 'neg_mean_squared_error')
grid_search.fit(x_train, y_train)
# Print the best hyperparameters (the score itself is available as
# grid_search.best_score_)
print("Best hyperparameters:" ,grid_search.best_params_)

In [None]:
# Import the estimator this cell actually uses. The cell previously imported
# RandomForestRegressor and only worked because AdaBoostRegressor had already
# been imported by the grid-search cell.
from sklearn.ensemble import AdaBoostRegressor
# Fit the final AdaBoost model with the chosen hyperparameters.
abr = AdaBoostRegressor(random_state = 0, learning_rate=1, n_estimators = 150)
abr.fit(x_train, y_train)

In [None]:
import math
from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error

# Predict on the held-out features with the fitted AdaBoost model.
y_pred = abr.predict(x_test)

# Error metrics of the predictions against the true test targets.
mae = metrics.mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = metrics.r2_score(y_test, y_pred)

# Report the metrics in the order MAE, MAPE, MSE, R2, RMSE.
for template, value in (
    ('MAE is {}', mae),
    ('MAPE is {}', mape),
    ('MSE is {}', mse),
    ('R2 score is {}', r2),
    ('RMSE score is {}', rmse),
):
    print(template.format(value))