In [None]:
# importing all the necessary libraries to carry out data visualization
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the pizza sales workbook into the pizza_df DataFrame
pizza_df = pd.read_excel('/kaggle/input/pizza-sales/Data Model - Pizza Sales.xlsx')
# Preview the first 10 rows of the data
pizza_df.head(10)

In [None]:
# Print pandas' concise summary of pizza_df (its columns, non-null counts and dtypes)
pizza_df.info()

In [None]:
# Count the rows that are exact duplicates of an earlier row
pizza_df.duplicated().sum()

In [None]:
# Summary statistics of pizza_df as produced by describe():
# count, mean, std deviation, min, quartiles and max
summary_stats = pizza_df.describe()
print(summary_stats)

***Total Revenue of Pizza Sales***

In [None]:
# Total revenue: the sum of the total_price column over every row
total_revenue = (pizza_df['total_price']).sum()
print("Total Revenue: $", total_revenue)

***Average Order Value***

In [None]:
# Revenue of each order: total_price summed over the rows sharing an order_id
revenue_by_order = pizza_df.groupby("order_id")['total_price'].sum()
# Average order value is the mean of those per-order revenues
average_order_value = revenue_by_order.mean()
# Keep two decimal places for display
AOV = round(average_order_value, 2)
print("Average Order Value: $",AOV)

***Total Pizzas Sold***

In [None]:
# Total number of pizzas sold: the sum of the quantity column
total_pizza_sold = pizza_df['quantity'].sum()
print("Total Pizzas Sold: ", total_pizza_sold)

In [None]:
# Number of distinct orders. nunique() counts the distinct order_id values
# directly, instead of building a per-column count table for every order
# just to measure its length.
total_orders = pizza_df['order_id'].nunique()
print("Total Orders:", total_orders)

***Average Pizzas Per Order***

In [None]:
# Average pizzas per order: total quantity sold divided by total_orders
# (total_orders must already have been defined by an earlier cell)
avg_pizzas_per_order = pizza_df['quantity'].sum() / total_orders
# Round to two decimal places for display
AOP = round(avg_pizzas_per_order, 2)
print("Average Pizzas per Order:", AOP)

In [None]:
# Average unit price and total revenue for every pizza category, highest
# revenue first.
# A row's revenue is unit_price * quantity. It is computed once as a helper
# column so that each group simply sums it, rather than multiplying every
# group against the whole frame's quantity column and relying on index
# alignment (and NaN-skipping) to discard the rows of the other groups.
category_analysis = (
    pizza_df
    .assign(line_revenue=pizza_df['unit_price'] * pizza_df['quantity'])
    .groupby('pizza_category')
    .agg(
        average_unit_price=('unit_price', 'mean'),
        revenue_per_category=('line_revenue', 'sum'),
    )
    .sort_values(by='revenue_per_category', ascending=False)
)
print("Average Unit Price and Revenue by Category:\n", category_analysis)

In [None]:
# Bar chart: revenue of each pizza category
fig, ax = plt.subplots(figsize=(9, 5))
category_analysis['revenue_per_category'].plot.bar(
    ax=ax,
    color="lightblue",
    edgecolor="black",
    linewidth=1.2,
)
bold_12 = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel("Pizza Category", **bold_12)
ax.set_ylabel("Revenue", **bold_12)
ax.set_title("Revenue by Pizza Category", **bold_12)
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
import plotly.express as px
# Interactive Plotly bar chart of revenue per pizza category
fig = px.bar(
    category_analysis,
    x=category_analysis.index,    # pizza categories (the frame's index)
    y='revenue_per_category',     # revenue of each category
    text='revenue_per_category',  # revenue value used as each bar's text label
    title='Revenue by Pizza Category',
    labels={
        'x': 'Pizza Category',
        'revenue_per_category': 'Revenue'
    },
)

# Format the bar text with the '.2s' number format and draw it outside the bars
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')

# Layout tweaks, gathered in one place
layout_options = dict(
    xaxis_title="Pizza Category",
    yaxis_title="Revenue",
    title_x=0.5,               # centre the title
    template='plotly_white',   # clean white theme
    bargap=0.2,                # space between bars
    title_font=dict(size=20, color='black', family="Arial"),
    xaxis=dict(tickangle=-45),  # rotate x-axis labels for readability
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Average unit price and total revenue for every pizza size, highest
# revenue first.
# A row's revenue is unit_price * quantity. It is computed once as a helper
# column so that each group simply sums it, rather than multiplying every
# group against the whole frame's quantity column and relying on index
# alignment (and NaN-skipping) to discard the rows of the other groups.
size_analysis = (
    pizza_df
    .assign(line_revenue=pizza_df['unit_price'] * pizza_df['quantity'])
    .groupby('pizza_size')
    .agg(
        average_unit_price=('unit_price', 'mean'),
        revenue_per_size=('line_revenue', 'sum'),
    )
    .sort_values(by='revenue_per_size', ascending=False)
)
print("Average Unit Price and Revenue by Size:\n", size_analysis)

In [None]:
# Bar chart: revenue of each pizza size
fig, ax = plt.subplots(figsize=(9, 5))
size_analysis['revenue_per_size'].plot.bar(
    ax=ax,
    color='teal',
    edgecolor="black",
    linewidth=1.2,
)
bold_12 = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Pizza Sizes', **bold_12)
ax.set_ylabel('Revenue', **bold_12)
ax.set_title('Revenue by Pizza Size', **bold_12)
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Pizza names present in the data, with the number of rows recorded for each
pizza_df.pizza_name.value_counts()

In [None]:
# Number of rows for each distinct value of the pizza_ingredients column
pizza_df.pizza_ingredients.value_counts()

In [None]:
# Join every row's ingredient text into one space-separated string
text = ' '.join(pizza_df['pizza_ingredients'])

# NOTE: this sets matplotlib's global default figure size, so it also applies
# to every later figure that does not pass its own figsize.
plt.rcParams['figure.figsize'] = (12,12)

# Word cloud of the ingredient text (at most 121 words on a black background)
cloud_options = dict(
    background_color='black',
    colormap='gnuplot2_r',
    width=1200,
    height=1200,
    max_words=121,
)
wordcloud = WordCloud(**cloud_options).generate(text)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# average unit price & revenue of the top pizza
# Average unit price and total revenue of the five highest-revenue pizzas.
# A row's revenue is unit_price * quantity. It is computed once as a helper
# column so that each group simply sums it, rather than multiplying every
# group against the whole frame's quantity column and relying on index
# alignment (and NaN-skipping) to discard the rows of the other groups.
top_pizza_analysis = (
    pizza_df
    .assign(line_revenue=pizza_df['unit_price'] * pizza_df['quantity'])
    .groupby("pizza_name")
    .agg(
        average_unit_price=('unit_price', 'mean'),
        revenue_per_pizza=('line_revenue', 'sum'),
    )
    .nlargest(5, "revenue_per_pizza")
)
print("Average Unit Price & Revenue of Top 5 Pizzas:\n", top_pizza_analysis)

In [None]:
# Bar chart: revenue of the top pizzas held in top_pizza_analysis
fig, ax = plt.subplots(figsize=(10, 5))
top_pizza_analysis['revenue_per_pizza'].plot.bar(
    ax=ax,
    color='maroon',
    edgecolor="black",
    linewidth=1.2,
)
bold_12 = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Pizza Name', **bold_12)
ax.set_ylabel('Revenue', **bold_12)
ax.set_title('Revenue by Top Pizzas', **bold_12)
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Which day of the week has the highest number of orders?
# Weekday name of every row, derived from its order date
pizza_df['day_of_week'] = pd.to_datetime(pizza_df['order_date']).dt.strftime('%A')
# order_id repeats across rows (other cells group by it and count its unique
# values), so count distinct order ids per weekday. A plain row count would
# weight each order by the number of rows it occupies.
day_of_week_analysis = (
    pizza_df.groupby('day_of_week')['order_id']
    .nunique()
    .sort_values(ascending=False)
)
print("Highest Number of Orders by Day of Week:\n", day_of_week_analysis)

In [None]:
# Calendar year of each row's order date, then the number of rows per year
order_dates = pd.to_datetime(pizza_df['order_date'])
pizza_df['order_year'] = order_dates.dt.year
pizza_df['order_year'].value_counts()

In [None]:
# Bar chart: day_of_week_analysis, one bar per weekday
fig, ax = plt.subplots(figsize=(10, 4))
day_of_week_analysis.plot.bar(
    ax=ax,
    color='violet',
    edgecolor="black",
    linewidth=1.2,
)
bold_12 = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Day of Week', **bold_12)
ax.set_ylabel('Number of Orders', **bold_12)
ax.set_title('Orders by Day of Week', **bold_12)
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
import plotly.express as px
# Interactive Plotly pie chart of day_of_week_analysis
fig = px.pie(
    day_of_week_analysis,
    names=day_of_week_analysis.index,  # the days of the week
    values=day_of_week_analysis,       # the value plotted for each day
    title="Orders by Day of Week",
)

# Figure-level styling: title font, centred title, clean white theme
fig.update_layout(
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=20, color='black', family="Arial"),
)

# Slice styling: label, percentage and value are written on each slice
# (textinfo) and repeated in the hover box (hoverinfo)
slice_options = dict(
    textinfo='label+percent+value',
    hoverinfo='label+percent+value',
    textfont_size=14,  # font size of the text on the slices
)
fig.update_traces(**slice_options)

# Show the interactive pie chart
fig.show()

In [None]:
# At which times are the most orders placed?
# Show the five busiest times together with their number of orders.
# 'delivery_time' keeps the first five characters of order_time rendered as
# text (presumably HH:MM, confirm against the order_time format).
pizza_df['delivery_time'] = pizza_df['order_time'].astype('string').str.slice(0, 5)
# order_id repeats across rows, so count distinct order ids per time. A plain
# row count would weight each order by the number of rows it occupies.
delivery_time_analysis = pizza_df.groupby('delivery_time')['order_id'].nunique().nlargest(5)
print("Most Occurring Order Times:\n", delivery_time_analysis)

In [None]:
# Bar chart: delivery_time_analysis, one bar per peak order time
fig, ax = plt.subplots(figsize=(8, 5))
delivery_time_analysis.plot.bar(
    ax=ax,
    color='deeppink',
    edgecolor="black",
    linewidth=1.2,
)
bold_12 = {'fontsize': 12, 'fontweight': 'bold'}
ax.set_xlabel('Peak Time', **bold_12)
ax.set_ylabel('Number of Orders', **bold_12)
ax.set_title('Orders by Peak Time', **bold_12)
plt.xticks(rotation=45)
# Dashed horizontal guide lines only
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
# Disabled draft of an orders-per-hour line chart, kept commented out.
# NOTE: it groups by an 'Hour' column, which this notebook only creates in the
# cell that follows, so it would fail if re-enabled at this position.
#orders_per_hour = pizza_df.groupby('Hour')['order_id'].nunique()
# Create the line chart
#plt.plot(orders_per_hour.index, orders_per_hour.values)
#plt.xlabel('Hour of the Day')
#plt.ylabel('Number of Orders')
#plt.title('Peak Hour in a Day (24 hours format)',color ='blue')
#plt.tight_layout()
#plt.grid()
#plt.show()

In [None]:
# Store order_time as text, then split it on ':' into its three parts
order_time_text = pizza_df['order_time'].astype('string')
pizza_df['order_time'] = order_time_text
time_parts = order_time_text.str.split(":", expand=True)
pizza_df[['Hour', 'Minute', 'Second']] = time_parts

In [None]:
# Combine the date and time text into one timestamp per row
date_text = pizza_df['order_date'].astype(str)
time_text = pizza_df['order_time'].astype(str)
pizza_df['datetime'] = pd.to_datetime(date_text + ' ' + time_text)
# Weekday name of that timestamp
pizza_df['weekday'] = pizza_df['datetime'].dt.strftime('%A')

In [None]:
# Number of distinct orders placed in each hour of the day
order_hour = pizza_df['datetime'].dt.hour
hourly_orders = pizza_df.groupby(order_hour)['order_id'].nunique()

# One line of output per hour
for hour in hourly_orders.index:
    count = hourly_orders.loc[hour]
    print(f"Hour {hour}: {count} orders")

In [None]:
# Number of rows per order hour.
# 'Hour' holds text (it comes from splitting the order_time string), so
# seaborn would otherwise lay the bars out in order of first appearance.
# Pass an explicit numeric ordering so the x-axis runs from the earliest
# hour to the latest.
hour_order = sorted(pizza_df['Hour'].dropna().unique(), key=int)
sns.countplot(data=pizza_df, x="Hour", order=hour_order, palette="flare")
plt.xticks(rotation=45)
plt.xlabel("Hour",fontsize=10,color="red")
plt.ylabel("Frequency",fontsize=10,color="red")
plt.title("HOUR",color="black")
plt.show()

In [None]:
# Which month has the highest revenue?
# Month name of every row, used as the grouping key
order_month_name = pd.to_datetime(pizza_df['order_date']).dt.strftime('%B')
monthly_revenue = pizza_df.groupby(order_month_name)['total_price'].sum()
# Highest-revenue month first
month_revenue_analysis = monthly_revenue.sort_values(ascending=False)
print("Highest Revenue Months:\n", month_revenue_analysis)

In [None]:
# Line chart of revenue by month.
# month_revenue_analysis is sorted by revenue (highest first), so plotting it
# as-is would always draw a falling line. Put the months back into calendar
# order first, parsing the month names to obtain their month numbers.
month_numbers = pd.to_datetime(month_revenue_analysis.index, format='%B').month
calendar_order = np.argsort(np.asarray(month_numbers), kind='stable')
revenue_by_calendar_month = month_revenue_analysis.iloc[calendar_order]

revenue_by_calendar_month.plot(kind='line', color='orange',linewidth=1.2)
plt.xlabel('Month', fontsize=12, fontweight='bold')
plt.ylabel('Revenue', fontsize=12, fontweight='bold')
plt.title('Revenue by Month', fontsize=12, fontweight='bold')
# One tick per month, labelled with the month name
plt.xticks(ticks=range(len(revenue_by_calendar_month.index)),
           labels=revenue_by_calendar_month.index,
           rotation=45)
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Three pizza names that appear on the most rows, with their average unit price
stats_by_pizza_name = pizza_df.groupby('pizza_name').agg(
    order_count=pd.NamedAgg(column='order_id', aggfunc='count'),
    average_unit_price=pd.NamedAgg(column='unit_price', aggfunc='mean'),
)
most_ordered_pizza = stats_by_pizza_name.nlargest(3, 'order_count')
print("Most Ordered Pizzas:\n", most_ordered_pizza)

In [None]:
# Three pizza names that appear on the fewest rows, with their average unit price
least_ordered_pizza = pizza_df.groupby('pizza_name').agg(
    order_count=('order_id', 'count'),
    average_unit_price=('unit_price', 'mean')
).nsmallest(3, 'order_count')
# The label previously read "Most Ordered Pizzas", which contradicted the data
print("Least Ordered Pizzas:\n", least_ordered_pizza)

In [None]:
# Pizza size that appears on the most rows, with its average unit price
stats_by_size = pizza_df.groupby('pizza_size').agg(
    order_count=pd.NamedAgg(column='order_id', aggfunc='count'),
    average_unit_price=pd.NamedAgg(column='unit_price', aggfunc='mean'),
)
preferred_pizza_size = stats_by_size.nlargest(1, 'order_count')
print("Preferred Pizza Size:\n", preferred_pizza_size)

In [None]:
# Pizza category that appears on the most rows, with its average unit price
stats_by_category = pizza_df.groupby('pizza_category').agg(
    order_count=pd.NamedAgg(column='order_id', aggfunc='count'),
    average_unit_price=pd.NamedAgg(column='unit_price', aggfunc='mean'),
)
preferred_pizza_category = stats_by_category.nlargest(1, 'order_count')
print("Preferred Pizza Category:\n", preferred_pizza_category)

In [None]:
# Cheapest pizza: first name/price row after sorting by ascending unit_price
name_and_price = pizza_df[['pizza_name', 'unit_price']]
cheapest_pizza = name_and_price.sort_values(by='unit_price').iloc[0]
print("Lowest Priced Pizza:\n", cheapest_pizza)

In [None]:
# Most expensive pizza: first name/price row after sorting by descending unit_price
most_exp_pizza = pizza_df[['pizza_name', 'unit_price']].sort_values(by='unit_price', ascending=False).iloc[0]
# The label previously read "Lowest Priced Pizza", which contradicted the data
print("Highest Priced Pizza:\n", most_exp_pizza)

In [None]:
# Number of rows per pizza category
# NOTE(review): value_counts() counts rows and does not use the quantity
# column; confirm that a row count is the intended "Number of Pizzas".
pizzas_per_category = pizza_df['pizza_category'].value_counts()
print("Number of Pizzas per Category:\n", pizzas_per_category)

In [None]:
import matplotlib.cm as cm
# Ring-shaped pie chart of pizzas_per_category
color_map = plt.get_cmap('Set1')
# One colour per category, taken from the first entries of the colour map
slice_colors = color_map(range(len(pizzas_per_category)))

fig, ax = plt.subplots(figsize=(5, 6))
pizzas_per_category.plot.pie(
    ax=ax,
    autopct='%1.1f%%',             # percentage label on every slice
    colors=slice_colors,
    wedgeprops={'width': 0.3},     # wedge width below 1 leaves a hole in the middle
    textprops={'fontsize': 12},    # font size of the slice text
)

ax.set_title('Pizza Category Distribution', fontsize=14, fontweight='bold')
ax.axis('equal')  # equal aspect ratio keeps the pie circular

# Show the pie chart
plt.show()


In [None]:
# Number of rows per pizza size
# NOTE(review): value_counts() counts rows and does not use the quantity
# column; confirm that a row count is the intended "Number of Pizzas".
pizzas_per_size = pizza_df['pizza_size'].value_counts()
print("Number of Pizzas per Size:\n", pizzas_per_size)

In [None]:
# Ring-shaped pie chart of pizzas_per_size
color_map = plt.get_cmap('Set3')
# One colour per size, taken from the first entries of the colour map
slice_colors = color_map(range(len(pizzas_per_size)))

fig, ax = plt.subplots(figsize=(7, 5))
pizzas_per_size.plot.pie(
    ax=ax,
    autopct='%1.1f%%',             # percentage label on every slice
    colors=slice_colors,
    wedgeprops={'width': 0.3},     # wedge width below 1 leaves a hole in the middle
    textprops={'fontsize': 10},    # font size of the slice text
)

ax.set_title('Pizza Size Distribution', fontsize=14, fontweight='bold')
ax.axis('equal')  # equal aspect ratio keeps the pie circular

plt.show()

In [None]:
pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Read the workbook into a fresh frame for modelling
df = pd.read_excel('/kaggle/input/pizza-sales/Data Model - Pizza Sales.xlsx')

# Calendar features: day of week and month number of the order date
order_dates = pd.to_datetime(df['order_date'])
df['order_date'] = order_dates
df['order_dow'] = order_dates.dt.dayofweek
df['order_month'] = order_dates.dt.month

# Replace every categorical column with integer codes; the single encoder is
# re-fitted for each column in turn
le = LabelEncoder()
categorical_columns = ['pizza_id', 'pizza_size', 'pizza_category', 'pizza_name']
for column_name in categorical_columns:
    df[column_name] = le.fit_transform(df[column_name])

# Model inputs and the value to predict
# NOTE(review): order_id is an identifier, and quantity and unit_price may
# determine total_price directly; confirm these belong in the features.
target = 'total_price'
features = ['order_id', 'pizza_id', 'quantity', 'unit_price', 'pizza_size',
            'pizza_category', 'order_dow', 'order_month']
X, y = df[features], df[target]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit an XGBoost regressor on the training split
model = xgb.XGBRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse, mae, r2 = (
    np.sqrt(mse),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

for metric_label, metric_value in (
    ("Root Mean Squared Error", rmse),
    ("Mean Absolute Error", mae),
    ("R-squared Score", r2),
):
    print(f"{metric_label}: {metric_value}")

# Feature importance as horizontal bars, sorted ascending so the most
# important feature ends up at the top
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(len(sorted_idx)) + .5

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels([features[i] for i in sorted_idx])
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

# Predicted vs actual values; the dashed red diagonal marks perfect predictions
actual_range = [y_test.min(), y_test.max()]
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.5)
scatter_ax.plot(actual_range, actual_range, 'r--', lw=2)
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
scatter_ax.set_title('Actual vs Predicted Values')
scatter_fig.tight_layout()
plt.show()

# Residuals (actual minus predicted) against the predictions
residuals = y_test - y_pred
residual_fig, residual_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=residual_ax)
residual_ax.axhline(y=0, color='r', linestyle='--')
residual_ax.set_xlabel('Predicted')
residual_ax.set_ylabel('Residuals')
residual_ax.set_title('Residual Plot')
residual_fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the workbook into a fresh frame for modelling
df = pd.read_excel('/kaggle/input/pizza-sales/Data Model - Pizza Sales.xlsx')

# Calendar features: day of week and month number of the order date
order_dates = pd.to_datetime(df['order_date'])
df['order_date'] = order_dates
df['order_dow'] = order_dates.dt.dayofweek
df['order_month'] = order_dates.dt.month

# Replace every categorical column (order_time included) with integer codes;
# the single encoder is re-fitted for each column in turn
le = LabelEncoder()
categorical_columns = ['pizza_id', 'pizza_size', 'pizza_category', 'pizza_name', 'order_time']
for column_name in categorical_columns:
    df[column_name] = le.fit_transform(df[column_name])

# Model inputs and the value to predict
# NOTE(review): order_id is an identifier, and quantity and unit_price may
# determine total_price directly; confirm these belong in the features.
target = 'total_price'
features = ['order_id', 'pizza_id', 'quantity', 'unit_price', 'pizza_size',
            'pizza_category', 'order_dow', 'order_month', 'order_time']
X, y = df[features], df[target]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest regressor on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse, mae, r2 = (
    np.sqrt(mse),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

for metric_label, metric_value in (
    ("Root Mean Squared Error", rmse),
    ("Mean Absolute Error", mae),
    ("R-squared Score", r2),
):
    print(f"{metric_label}: {metric_value}")

# Feature importance as horizontal bars, sorted ascending so the most
# important feature ends up at the top
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(len(sorted_idx)) + .5

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels([features[i] for i in sorted_idx])
ax.set_title('Feature Importance')
fig.tight_layout()
plt.show()

# Predicted vs actual values; the dashed red diagonal marks perfect predictions
actual_range = [y_test.min(), y_test.max()]
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.5)
scatter_ax.plot(actual_range, actual_range, 'r--', lw=2)
scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
scatter_ax.set_title('Actual vs Predicted Values')
scatter_fig.tight_layout()
plt.show()

# Residuals (actual minus predicted) against the predictions
residuals = y_test - y_pred
residual_fig, residual_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals, ax=residual_ax)
residual_ax.axhline(y=0, color='r', linestyle='--')
residual_ax.set_xlabel('Predicted')
residual_ax.set_ylabel('Residuals')
residual_ax.set_title('Residual Plot')
residual_fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# Read the workbook into a fresh frame for modelling
df = pd.read_excel('/kaggle/input/pizza-sales/Data Model - Pizza Sales.xlsx')

# Calendar features: day of week and month number of the order date
order_dates = pd.to_datetime(df['order_date'])
df['order_date'] = order_dates
df['order_dow'] = order_dates.dt.dayofweek
df['order_month'] = order_dates.dt.month

# Replace every categorical column (order_time included) with integer codes;
# the single encoder is re-fitted for each column in turn
le = LabelEncoder()
categorical_columns = ['pizza_id', 'pizza_size', 'pizza_category', 'pizza_name', 'order_time']
for column_name in categorical_columns:
    df[column_name] = le.fit_transform(df[column_name])

# Model inputs and the value to predict
target = 'total_price'
features = ['order_id', 'pizza_id', 'quantity', 'unit_price', 'pizza_size',
            'pizza_category', 'order_dow', 'order_month', 'order_time']
X, y = df[features], df[target]

# Hold out 20% of the rows (fixed seed for reproducibility)
# NOTE(review): the rest of this cell evaluates on the full X, y and does not
# appear to use this split; confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The two regressors to compare, both with a fixed seed
xgb_model = xgb.XGBRegressor(random_state=42)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Function to evaluate model
def evaluate_model(model, X, y, X_eval=None, y_eval=None, cv=5):
    """Cross-validate a regressor, fit it on (X, y) and report error metrics.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
        It is fitted in place on ``(X, y)`` and is left fitted on return.
    X, y : training features and target.
    X_eval, y_eval : optional hold-out features and target.
        When both are given, ``rmse``/``mae``/``r2`` are measured on this
        hold-out set. When both are omitted (the default), they are measured
        on the training data ``(X, y)`` itself, i.e. they are in-sample
        figures and will normally look better than ``cv_rmse``.
    cv : number of cross-validation folds (default 5).

    Returns
    -------
    dict with keys ``cv_rmse`` (mean of the per-fold RMSE values), ``rmse``,
    ``mae`` and ``r2``.

    Raises
    ------
    ValueError
        If only one of ``X_eval`` / ``y_eval`` is supplied.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")

    # Cross-validated RMSE: the scorer returns negated MSE for every fold
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-cv_scores)

    # Fit on all of (X, y); callers rely on the model being fitted afterwards
    model.fit(X, y)

    # Without a hold-out set, fall back to scoring the training data
    if X_eval is None:
        X_eval, y_eval = X, y
    y_pred = model.predict(X_eval)

    # Calculate metrics
    mse = mean_squared_error(y_eval, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_eval, y_pred)
    r2 = r2_score(y_eval, y_pred)

    return {
        'cv_rmse': rmse_scores.mean(),
        'rmse': rmse,
        'mae': mae,
        'r2': r2
    }

# Evaluate both models on the full X, y; evaluate_model() also leaves each
# model fitted on that data
rf_results = evaluate_model(rf_model, X, y)
xgb_results = evaluate_model(xgb_model, X, y)

# Print results
print("Random Forest Results:")
for metric, value in rf_results.items():
    print(f"{metric}: {value}")

print("\nXGBoost Results:")
for metric, value in xgb_results.items():
    print(f"{metric}: {value}")

# Visualize results: one pair of bars (Random Forest, XGBoost) per metric
metrics = ['cv_rmse', 'rmse', 'mae', 'r2']
rf_values = [rf_results[m] for m in metrics]
xgb_values = [xgb_results[m] for m in metrics]

plt.figure(figsize=(12, 6))
x = range(len(metrics))
width = 0.35
plt.bar([i - width/2 for i in x], rf_values, width, label='Random Forest', alpha=0.8)
plt.bar([i + width/2 for i in x], xgb_values, width, label='XGBoost', alpha=0.8)
plt.ylabel('Score')
plt.title('Random Forest vs XGBoost Performance Comparison')
plt.xticks(x, metrics)
plt.legend()
plt.tight_layout()
plt.show()

# No second fit is needed here: evaluate_model() above already fitted
# rf_model and xgb_model on the entire X, y with the same fixed seeds, so
# fitting them again would only repeat the same training.

# Compare feature importance
rf_importance = rf_model.feature_importances_
xgb_importance = xgb_model.feature_importances_

plt.figure(figsize=(12, 6))
x = range(len(features))
width = 0.35
plt.bar([i - width/2 for i in x], rf_importance, width, label='Random Forest', alpha=0.8)
plt.bar([i + width/2 for i in x], xgb_importance, width, label='XGBoost', alpha=0.8)
plt.ylabel('Feature Importance')
plt.title('Random Forest vs XGBoost Feature Importance Comparison')
plt.xticks(x, features, rotation=45, ha='right')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Model inputs and the value to predict
target = 'total_price'
features = ['pizza_id', 'quantity', 'unit_price', 'pizza_size', 'pizza_category']

# Integer-encode the categorical inputs of df (the frame left by an earlier
# cell); the single encoder is re-fitted for each column in turn
label_encoder = LabelEncoder()
for encoded_column in ('pizza_id', 'pizza_size', 'pizza_category'):
    df[encoded_column] = label_encoder.fit_transform(df[encoded_column])

# Feature matrix and target vector
X, y = df[features], df[target]

# Hold out 30% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

XGBoost is a highly efficient gradient-boosting library that can handle large datasets with high accuracy.

- XGBRegressor: creates an instance of the XGBoost regressor for regression tasks (predicting continuous values like total_price).
- objective='reg:squarederror': Specifies the learning task as regression with squared error as the loss function.
- n_estimators=100: Defines the number of boosting rounds (or decision trees) to build. The higher the value, the more trees, which generally improves performance but increases training time.
- random_state=42: Ensures reproducibility of results by setting a random seed.

In [None]:
import xgboost as xgb

# Train XGBoost: the regressor learns from the feature set (X_train) how to
# predict the target variable (total_price)
xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_reg.fit(X_train, y_train)

# Predict total_price for the testing set; y_pred_xgb holds the predictions
y_pred_xgb = xgb_reg.predict(X_test)

# Mean Squared Error (MSE): the average squared difference between the actual
# (y_test) and predicted (y_pred_xgb) values. Lower is better.
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {xgb_mse}")