# Task 1 - Setup and EDA
In this section, you will: 
- Load the packages required for executing the code
- Load the data
- Prepare the data for further analysis
- Conduct EDA on the data

In [None]:
# Import 'numpy' and 'pandas' for working with numbers and dataframes
import numpy as np
import pandas as pd

# Import 'matplotlib.pyplot' and 'seaborn' for visualizations
from matplotlib import pyplot as plt
import seaborn as sns

# Import method for regression from 'statsmodels'
import statsmodels.formula.api as smf

# Import methods for regression diagnostic plots from 'statsmodels'
from statsmodels.api import ProbPlot, qqplot

In [None]:
# Read the data set into a data frame (the first CSV column becomes the row index) and preview it
# Note: Keep the CSV file in the same folder as this notebook, or replace the file name with its full path
df = pd.read_csv(
    'Buffet_Details.csv',
    index_col = 0,
)
df.head()

In [None]:
# Print the text file that describes the features of the data set
with open('Buffet_Details_Feature_Description.txt', 'r') as description_file:
    feature_description = description_file.read()
print(feature_description)

In [None]:
# Look at the specifics of the data frame using the '.info()' command
# (prints the index range, column names, non-null counts and data types)
df.info()

In [None]:
# Drop the 'Name' feature from the data set using the '.drop()' command
# Note: "errors = 'ignore'" keeps this cell safe to re-run after 'Name' has already been removed
df = df.drop(columns = ['Name'], errors = 'ignore')

In [None]:
# Convert the 'Cuisine' feature to the 'category' data type using the '.astype()' command
# Note: The assignment is idempotent, so re-running this cell leaves the column unchanged
df['Cuisine'] = df['Cuisine'].astype('category')

In [None]:
# Preview the first five rows of the data frame
df.head(5)

In [None]:
# Draw side-by-side histograms of 'Age' and 'Expenditure', each in its own color
plt.figure(figsize = (12, 4))

# Pair every feature with its color; 'enumerate' supplies the 1-based subplot position
panels = zip(['Age', 'Expenditure'], ['lightblue', 'lightgreen'])
for position, (feature, shade) in enumerate(panels, start = 1):
    plt.subplot(1, 2, position)
    sns.histplot(data = df, x = feature, color = shade)

plt.tight_layout();

In [None]:
# Draw side-by-side box plots of 'Age' and 'Expenditure', each in its own color
plt.figure(figsize = (12, 4))

features = ['Age', 'Expenditure']
shades = ['lightblue', 'lightgreen']
for slot in range(len(features)):
    # Subplot positions are 1-based, list indices are 0-based
    plt.subplot(1, 2, slot + 1)
    sns.boxplot(data = df, x = features[slot], color = shades[slot])

plt.tight_layout();

In [None]:
# Create a count plot for the variable 'Cuisine' (one bar per cuisine, height = number of rows)
plt.figure(figsize = (8, 4))
sns.countplot(x = 'Cuisine', data = df)
plt.tight_layout();

In [None]:
# Bar plot with 'Cuisine' on the X-axis and 'Expenditure' on the Y-axis, drawn without error bars
# NOTE(review): 'ci' is deprecated in newer seaborn releases (0.12+) in favour of 'errorbar = None';
# switch once the minimum seaborn version used for this notebook is confirmed
plt.figure(figsize = (8, 4))
sns.barplot(data = df, x = 'Cuisine', y = 'Expenditure', ci = None)
plt.tight_layout();

In [None]:
# Scatter plots of 'Expenditure' versus 'Age' in two side-by-side panels
plt.figure(figsize = (14, 5))

# Left panel: all points without any categorical division
plt.subplot(1, 2, 1)
sns.scatterplot(x = 'Age', y = 'Expenditure', data = df)

# Right panel: the same points colored by 'Cuisine'
plt.subplot(1, 2, 2)
sns.scatterplot(x = 'Age', y = 'Expenditure', hue = 'Cuisine', data = df);

In [None]:
# Pair plot of the data frame (the trailing ';' hides the returned grid object's repr)
sns.pairplot(data = df);

# Task 2 - Linear Regression Models
In this section, you will create linear regression models for the data and evaluate them.

### Model 1

In [None]:
# Create and train a linear regression model for the data using the 'smf.ols()' method and view its summary
# Note: The objective is to predict 'Expenditure' using 'Age'
# Note: In the formula, the response is on the left of '~' and the predictor on the right
lr_model_1 = smf.ols(formula = 'Expenditure ~ Age', data = df)
lr_model_1 = lr_model_1.fit()
print(lr_model_1.summary())

### Model 2

In [None]:
# Create and train a linear regression model for the data using the 'smf.ols()' method and view its summary
# Note: The objective is to predict 'Expenditure' using 'Cuisine'
# Note: Set the base category of 'Cuisine' to 'Indian' using the '.cat.set_categories()' command
# 'Indian' is moved to the front of the category order so that it becomes the reference (base) level;
# '.astype('category')' makes the cell work even if 'Cuisine' has not been converted yet
cuisine = df['Cuisine'].astype('category')
cuisine_levels = ['Indian'] + [level for level in cuisine.cat.categories if level != 'Indian']
df['Cuisine'] = cuisine.cat.set_categories(cuisine_levels)
lr_model_2 = smf.ols(formula = 'Expenditure ~ Cuisine', data = df)
lr_model_2 = lr_model_2.fit()
print(lr_model_2.summary())

### Model 3

In [None]:
# Create and train a linear regression model for the data using the 'smf.ols()' method and view its summary
# Note: The objective is to predict 'Expenditure' using 'Age' and 'Cuisine'
# Note: Set the base category of 'Cuisine' to 'Indian' using the '.cat.set_categories()' command
# 'Indian' is moved to the front of the category order so that it becomes the reference (base) level;
# '.astype('category')' makes the cell work even if 'Cuisine' has not been converted yet
cuisine = df['Cuisine'].astype('category')
cuisine_levels = ['Indian'] + [level for level in cuisine.cat.categories if level != 'Indian']
df['Cuisine'] = cuisine.cat.set_categories(cuisine_levels)
lr_model_3 = smf.ols(formula = 'Expenditure ~ Age + Cuisine', data = df)
lr_model_3 = lr_model_3.fit()
print(lr_model_3.summary())

# Task 3 - Diagnostic Plots
In this section, you will create and analyze diagnostic plots for *lr_model_3*.

In [None]:
# Scatter plot of the actual 'Expenditure' values against the values fitted by 'lr_model_3'
plt.figure(figsize = (6, 6))

fitted_expenditure = lr_model_3.fittedvalues
actual_expenditure = df['Expenditure']
sns.scatterplot(x = fitted_expenditure, y = actual_expenditure)

# Dashed red reference line y = x, anchored at the point (30, 30)
plt.axline((30, 30), slope = 1, color = 'r', linewidth = 1, linestyle = '--')

plt.xlabel('Fitted Values of Expenditure')
plt.ylabel('Actual Values of Expenditure');

In [None]:
# Scatter plot of the residuals of 'lr_model_3' against its fitted values of 'Expenditure'
plt.figure(figsize = (8, 4))
sns.scatterplot(x = lr_model_3.fittedvalues, y = lr_model_3.resid)

# Black zero-residual reference line across the full width of the axes
plt.axhline(0, color = 'k', linewidth = 1)

plt.xlabel('Fitted Values of Expenditure')
plt.ylabel('Residuals');

In [None]:
# Create a histogram of the residuals of 'lr_model_3'
# Note: The residuals come from the fitted model, not from a column of 'df',
# so they are passed to seaborn as a vector on their own (no 'data' argument)
plt.figure(figsize = (8, 4))
sns.histplot(x = lr_model_3.resid, color = 'lightgray')
plt.xlabel('Residual Value')
plt.ylabel('Frequency');

In [None]:
# Normal Q-Q plot of the internally studentized residuals of 'lr_model_3'
studentized_residuals = lr_model_3.get_influence().resid_studentized_internal
QQ = ProbPlot(studentized_residuals)

# line = '45' adds the 45-degree reference line to the plot
fig = QQ.qqplot(line = '45', alpha = 0.5, lw = 1)
fig.set_size_inches(5, 5)

# Label the axes object of the figure once instead of looking it up for every call
qq_axes = fig.gca()
qq_axes.set_title('Normal Q-Q')
qq_axes.set_xlabel('Theoretical Quantiles')
qq_axes.set_ylabel('Standardized Residuals');