In [None]:
pip install pycaret

<center><h1 class="list-group-item list-group-item-success">🍕 Pizza Price Prediction 🍕</h1></center><br><br>
<center><img src = "https://media-cdn.tripadvisor.com/media/photo-s/10/aa/89/3a/pizza.jpg"></center>

### Context

<font size = 3.5 color = "blue">
This dataset is designed to help understand the factors that lead to an increase/decrease in pizza price.<br>
<br>Independent Features : <br>
<li>company </li>
<li>diameter</li>
<li>topping</li>
<li>variant</li>
<li>size</li>
<li>extra_sauce</li>
<li>extra_cheese</li><br>
 With these features you will predict the pizza price, as well as interpret the factors that affect it.
</font>





### Contents:
<font size = 3.5 color = "blue">
<li>Importing Packages</li>
<li>Importing Data</li>
<li>Analysing Data</li>
<li>Data Visualization</li>
<li>Essential Functions</li>
<li>Data Preprocessing</li>
<li>One Hot encoding</li>
<li>Filling NA Values</li>
<li>Data Upscaling</li>
<li>Training Models</li>
<li>Evaluation Metrics</li>

## Importing Packages
<font size = 3>Importing all the packages in the first cell is always a good practice. </font>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from pycaret.regression import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import catboost as cb
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

## Importing Data
Importing the dataset

In [None]:
# Load the pizza dataset from a relative path
# (the "../input/..." layout looks like a Kaggle input directory; adjust when running elsewhere).
df = pd.read_csv("../input/pizza-price-prediction/pizza_v1.csv")

## Analysing Data
Performing Data Exploration

In [None]:
# Render the loaded DataFrame as the cell output for a first look.
df

In [None]:
# (rows, columns) of the raw dataset.
df.shape

#### Here we have only a few data points for training and testing 😥

In [None]:
# Number of missing values in each column.
df.isna().sum()

#### No null values 😀 

In [None]:
# Data type of every column before any cleaning.
df.dtypes

#### Here most columns are of **object** datatype, so we need to apply some encoding techniques to convert them to numbers

## Data Visualization

In [None]:
# Pie chart of how many rows each company contributes.
# The labels are taken from the value_counts() index so that every wedge is paired with
# its own company: df['company'].unique() is ordered by first appearance, which is not
# guaranteed to match the count-sorted order of value_counts().
company_counts = df['company'].value_counts()
plt.figure(figsize=(10,8))
plt.pie(company_counts,autopct='%.1f',labels=company_counts.index)
plt.title('Companies Weigtage')
plt.show()

#### Here, each slice represents a different company. All companies contribute roughly equally to the dataset

In [None]:
# Box plot of pizza diameter for each topping; x tick labels are rotated by 90 degrees.
sns.boxplot(data=df, x="topping", y="diameter")
plt.xticks(rotation=90)
plt.show()

#### This chart shows the distribution of diameter for each topping
#### Chicken pizzas are larger in size compared to all other toppings<br>

In [None]:
# Histogram of rows per company, coloured by the value of the extra_sauce column.
sns.histplot(data=df, x="company",hue='extra_sauce')

In [None]:
# Per-company row counts (the plot below uses value_counts() directly, not this frame).
df_company=df.groupby('company').agg({'company':['count']})
# Density estimate of the per-company row counts.
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn's kdeplot.
sns.kdeplot(df['company'].value_counts(), color='g', fill=True)


<center><img src = "https://assets.rbl.ms/18339144/origin.jpg" width = 800></center><br>


# Essential Functions
##### These functions help us generate insights, encode the data and visualize the data, which will be helpful in Data Preprocessing

In [None]:
def give_insights(df,column):
    """Print basic insights for one column and plot its raw values.

    Meant to be called on a column before it is cleaned.

    Parameters
    ----------
    df : DataFrame that holds the column.
    column : name of the column to inspect.

    Prints the column dtype and its value counts, then draws the column with
    ``plt.plot`` on a new 10x5 figure titled with the column name and shows it.
    Returns None.
    """
    print("Data Type: ",df[column].dtypes)
    print("Value Counts:\n",df[column].value_counts())
    print("Plotting the distribution")
    plt.figure(figsize=(10,5))
    # Plot outside of print(): passing plt.plot(...) to print() wrote the list of
    # Line2D objects it returns into the cell output.
    plt.plot(df[column])
    plt.title(column)
    plt.show()

In [None]:
def one_hot_encoding(df,col):
    """One-hot encode ``col`` of ``df`` with category_encoders.

    The encoder is fitted on ``df`` itself with ``use_cat_names=True`` and the
    transformed DataFrame is returned.
    """
    encoder = ce.OneHotEncoder(cols=col, use_cat_names=True, return_df=True)
    return encoder.fit_transform(df)

In [None]:
def ordinal_encoding(df,col,mapping):
    """Ordinal-encode ``col`` of ``df`` using an explicit value-to-number ``mapping``.

    The mapping is wrapped in the ``[{'col': ..., 'mapping': ...}]`` structure that
    ``ce.OrdinalEncoder`` takes; the encoder is fitted on ``df`` and the transformed
    DataFrame is returned.
    """
    column_mapping = [{'col': col, 'mapping': mapping}]
    encoder = ce.OrdinalEncoder(cols=col, mapping=column_mapping, return_df=True)
    return encoder.fit_transform(df)

## Data Preprocessing


### Now taking **company** column into consideration

In [None]:
# Inspect the raw "company" column before encoding it.
give_insights(df,"company")

#### Here we see that five different companies are present in our data. Each contributes around 20-30 observations to our dataset.
#### We can also notice that they are denoted by letters, so we can one-hot encode them and convert them to boolean values.

In [None]:
def clean_company(df,column):
    """Clean the "company" column: one-hot encode ``column`` and return the new frame."""
    return one_hot_encoding(df, column)

In [None]:
# Apply the company cleaner; df is rebound to the one-hot encoded frame.
df = clean_company(df,"company")

In [None]:
# Spot-check 5 random rows after encoding "company".
df.sample(5)

#### Here new columns have been created for each company and the values are converted to boolean

### Now taking **price_rupiah** column into consideration

In [None]:
# Inspect the raw "price_rupiah" column (the prediction target) before cleaning it.
give_insights(df,"price_rupiah")

#### Here, in our problem statement, "price_rupiah" is the dependent variable and it is a continuous variable. It ranges from Rp 28,000 to Rp 248,000.
#### We can also notice that the values have the prefix Rp, which we have to clean. So we can apply some replace functions and clean them accordingly

In [None]:
def clean_price_rupiah(df,column):
    """Convert a price column such as ``"Rp235,000"`` to floats.

    The ``Rp`` prefix and the thousands separators (``,``) are removed and the
    result is cast to float. Values are stringified first, so running the function
    again on an already-cleaned (numeric) column is a no-op instead of an error.

    Parameters
    ----------
    df : DataFrame that holds the price column.
    column : name of the price column.

    Returns
    -------
    The same DataFrame object, with ``column`` overwritten in place by the
    float values.
    """
    df[column] = (
        df[column]
        .astype(str)
        .str.replace('Rp', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.strip()
        .astype(float)
    )
    return df

In [None]:
# Apply the price cleaner; "price_rupiah" becomes a float column.
df =  clean_price_rupiah(df,"price_rupiah")

In [None]:
# Spot-check 5 random rows after cleaning "price_rupiah".
df.sample(5)

#### Now the price column has been cleaned, as we removed the string characters from it

### Now taking **diameter** column into consideration

In [None]:
# Inspect the "diameter" column.
give_insights(df,"diameter")

#### Here, "diameter" is already in float datatype. It ranges from 8cm - 22cm. So nothing to clean in this column

### Now taking **topping** column into consideration

In [None]:
# Inspect the raw "topping" column before encoding it.
give_insights(df,"topping")

#### Here we see that 12 different toppings are available in our data. The most common toppings in our dataset are **chicken, mushrooms, mozzarella, smoked beef**
#### We can also notice that each topping is denoted as a string, so we can one-hot encode them and convert them to boolean values.

In [None]:
def clean_topping(df,column):
    """Clean the "topping" column: one-hot encode ``column`` and return the new frame."""
    return one_hot_encoding(df, column)

In [None]:
# Apply the topping cleaner (one-hot encodes the column); df is rebound to the result.
df = clean_topping(df,"topping")

In [None]:
# Spot-check 5 random rows after encoding "topping".
df.sample(5)

#### Here you can check that new columns have been created for each topping and the values are converted to boolean

### Now taking **variant** column into consideration

In [None]:
# Inspect the raw "variant" column before encoding it.
give_insights(df,"variant")

#### Here we see that 20 different pizza variants are available in our data. The most common variants in our dataset are **classic, crunchy, double_mix, new_york**
#### We can also notice that each variant is denoted as a string, so we can one-hot encode them and convert them to boolean values.

In [None]:
def clean_variant(df,column):
    """Clean the "variant" column: one-hot encode ``column`` and return the new frame."""
    return one_hot_encoding(df, column)

In [None]:
# Apply the variant cleaner (one-hot encodes the column); df is rebound to the result.
df = clean_variant(df,"variant")

In [None]:
# Spot-check 5 random rows after encoding "variant".
df.sample(5)

#### Here you can check that new columns have been created for each variant and the values are converted to boolean

### Now taking **size** column into consideration

In [None]:
# Inspect the raw "size" column before encoding it.
give_insights(df,"size")

#### Here we see that 6 different pizza sizes are available in our data. The most common size in our dataset is medium.
#### We can also notice that each size is denoted as a string, so we can apply ordinal encoding and convert them to ordered numerical values

In [None]:
def clean_size(df,column,encoding):
    """Clean the "size" column: ordinal-encode ``column`` with the ``encoding`` mapping."""
    return ordinal_encoding(df, column, encoding)

In [None]:
# Rank assigned to each size label, from smallest to largest.
# (The key "reguler" presumably matches the spelling used in the data; verify before changing it.)
encoding = {
    "small": 1,
    "medium": 2,
    "reguler": 3,
    "large": 4,
    "XL": 5,
    "jumbo": 6,
}

# Replace the "size" labels with their rank
df = clean_size(df, "size", encoding)

In [None]:
# Spot-check 5 random rows after encoding "size".
df.sample(5)

### Now taking "extra_sauce" and "extra_cheese" columns into consideration

In [None]:
# Inspect the raw "extra_sauce" column before encoding it.
give_insights(df,"extra_sauce")

In [None]:
# Inspect the raw "extra_cheese" column before encoding it.
give_insights(df,"extra_cheese")

#### Here, in both cases, the values are boolean (yes / no), and "yes" occurs more often in both columns
#### We can also notice that yes and no are denoted as strings, so we can apply ordinal encoding and convert them to numerical boolean values

In [None]:
def clean_yes_no(df,column,encoding):
    """Clean a yes/no column ("extra_sauce", "extra_cheese"): ordinal-encode ``column`` with ``encoding``."""
    return ordinal_encoding(df, column, encoding)
    

In [None]:
# Binary mapping: "yes" -> 1, "no" -> 0.
# Note: this rebinds `encoding`, which earlier held the size mapping.
encoding = {"yes":1,"no":0}

# Encode both yes/no columns with the same mapping
df = clean_yes_no(df,"extra_sauce",encoding)
df = clean_yes_no(df,"extra_cheese",encoding)

In [None]:
# Spot-check 5 random rows after encoding "extra_sauce" and "extra_cheese".
df.sample(5)

#### Here you can see that the values are converted to boolean (1 and 0)

In [None]:
# Data types after all cleaning steps, to check which columns are now numeric.
df.dtypes

<center><img src="https://media.tenor.com/images/aa37ff519d18dc4b51b8a55fb36e27e7/tenor.gif"></img></center><br>
<center><font size = 4 color = "red">Data Cleaning done successfully ✨</font></center>

## Train Test Split

In [None]:
# Target (dependent variable) is the price; every other column is a feature.
Y = df["price_rupiah"]
X = df.drop(columns="price_rupiah")

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=27,
)

## Data Scaling

In [None]:
# Separate min-max scalers for the features (X) and the target (Y)
X_scaler = MinMaxScaler()
Y_scaler = MinMaxScaler()

In [None]:
# Fit the feature scaler on the training split only, then reuse it for the test split.
X_train = X_scaler.fit_transform(X_train)
# Pass X_test as-is: when the scaler is fitted on a DataFrame, handing it a bare array
# makes scikit-learn warn about missing feature names. The scaled values are the same.
X_test = X_scaler.transform(X_test)

# Scale the target the same way; the scaler needs 2-D input, hence reshape(-1, 1).
Y_train = Y_scaler.fit_transform(np.array(Y_train).reshape(-1, 1))
Y_test = Y_scaler.transform(np.array(Y_test).reshape(-1,1))

## Model Selection

In [None]:
# Initialise the PyCaret regression experiment on the cleaned frame.
# data/target are passed by keyword: in PyCaret 3.x the second positional parameter of
# setup() is `data_func`, so a positional target would be misread there.
# session_id fixes the random seed so that the model comparison is reproducible.
experiment = setup(data=df, target="price_rupiah", normalize=True, session_id=27)

In [None]:
# Compare PyCaret's regressors on the experiment configured by setup();
# the returned value is shown as the cell output.
compare_models()

#### Here, we get the highest $R^2$ value (coefficient of determination) for CatBoostRegressor. So we can choose CatBoostRegressor and work on tuning its hyperparameters.
#### A higher r-squared indicates a better fit for the model 😁

## Model Building

In [None]:
# Wrap the scaled train / test arrays in CatBoost Pool objects (features + label).
train_dataset = cb.Pool(data=X_train, label=Y_train)
test_dataset = cb.Pool(data=X_test, label=Y_test)

In [None]:
# Create the CatBoost regressor with RMSE as its loss function
model = cb.CatBoostRegressor(loss_function="RMSE")

In [None]:
# Hyper-parameter grid explored by CatBoost's built-in grid search
grid = {
    'iterations': [100, 150, 200],
    'learning_rate': [0.03, 0.1],
    'depth': [2, 4, 6, 8],
    'l2_leaf_reg': [0.2, 0.5, 1, 3],
}
# Run the search on the training pool; the result is shown as the cell output.
model.grid_search(param_grid=grid, X=train_dataset)


### Predicting x_test with our trained model

In [None]:
# Predictions for the held-out (scaled) test features.
Y_pred = model.predict(X_test)

## Evaluation Metrics

In [None]:
# Test-set metrics.
# NOTE: Y_test holds min-max scaled prices (see the Data Scaling cell), so RMSE and MAE
# are expressed in scaled units rather than in rupiah.
rmse = (np.sqrt(mean_squared_error(Y_test, Y_pred)))
r2 = r2_score(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)

In [None]:
# Report the test-set metrics rounded to two decimals.
print("Testing performance")
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")
print(f"MAE: {mae:.2f}")

## Hurray!! We got a higher $R^2$ value by tuning hyperparameters in the CatBoost model.
<center><img src = "https://c.tenor.com/Nz_vlGMgXV0AAAAM/done-congrats.gif"></center>

### Thank You 🤗
### I hope you had a good time reading my notebook. Pls do support and comment! 😎