In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the two CSV inputs used throughout the notebook (paths are relative to the notebook).
treats_path = '../data/treatscleaned_final.csv'
ingredient_counts_path = '../data/ingredientscount.csv'

treatsdf = pd.read_csv(treats_path)
ingcount = pd.read_csv(ingredient_counts_path)

In [None]:
# Preview the first rows of the treats table right after loading.
treatsdf.head()

In [None]:
# Preview the first rows of the table loaded from ingredientscount.csv.
ingcount.head()

In [None]:
# NOTE(review): this repeats the treatsdf preview shown two cells earlier; treatsdf has not
# been modified in between, so the output is the same.
treatsdf.head()

In [None]:
# Label every treat 'Yes' or 'No' for being limited ingredient.
# 'Limited Ingredient' has no regulatory meaning, so 8 is used as the cutoff: it is the maximum
# number of ingredients in our existing homemade dog treat recipes and also represents the 25th percentile.

# Vectorised comparison; .tolist() keeps limited_ing a plain Python list of 'Yes'/'No' strings.
limited_ing = np.where(treatsdf['ing_count'] <= 8, 'Yes', 'No').tolist()

In [None]:
# Attach the 'Yes'/'No' labels built in the previous cell as a new column on the treats table.
treatsdf['limited_ing'] = limited_ing

In [None]:
# Summary statistics for the treats table.
treatsdf.describe()

In [None]:
# Pairwise correlation between the numeric columns of the treats table.
# numeric_only=True is required because treatsdf now holds the string column 'limited_ing'
# ('Yes'/'No'); without it, pandas >= 2.0 tries to convert those strings to float and raises.
treatsdf.corr(numeric_only=True)

In [None]:
# Scatter of dog treat price (y) against rating (x), drawn on an explicit Axes.

fig, ax = plt.subplots()
sns.scatterplot(data = treatsdf, x = 'rating', y = 'price', ax = ax)
ax.set_xlabel("Rating")
ax.set_ylabel("Price")
ax.set_title("Do lower cost treats rate higher?")
plt.show()

The scatterplot of price against rating does not suggest that cost and rating are related, and the calculated correlation coefficient does not mathematically support a relationship either.

The research paper "Price Effects in Online Product Reviews: An Analytical Model and Empirical Analysis" appears to confirm that uni-dimensional rating systems are substantially biased by price, with ratings based more on 'perceived value' than on 'perceived quality'.

Li, Xinxin, and Lorin M. Hitt. “Price Effects in Online Product Reviews: An Analytical Model and Empirical Analysis.” MIS Quarterly, vol. 34, no. 4, 2010, pp. 809–31. JSTOR, https://doi.org/10.2307/25750706. Accessed 14 June 2023.

In [None]:
# Scatter of price (y) against ingredient count (x); marker size and colour both encode rating.

fig, ax = plt.subplots()
sns.scatterplot(data = treatsdf, x = 'ing_count', y = 'price', size = 'rating', hue = 'rating',
                palette = 'viridis', ax = ax)
ax.set_xlabel('Ingredient Count')
ax.set_ylabel('Price')
ax.set_title("Are limited ingredient treats more expensive?")
# dotted vertical line at 8, my cutoff for 'limited ingredient'
ax.axvline(x = 8, linestyle = ':', color = 'orange')
ax.legend(title = "Rating", bbox_to_anchor = (1.0, 1.0))
plt.show()

The most expensive treats are considered 'Limited Ingredient'. However, price does not appear to correlate directly with ingredient count: the calculated correlation coefficient is -0.17. The vertical line represents my cutoff for 'limited ingredient' treats.

In [None]:
# Cost per ounce for each treat: price divided element-wise by pack size.

cost_per_oz = treatsdf['price'].div(treatsdf['size'])

In [None]:
# Box plot of cost per ounce with outliers (fliers) shown and the mean marked.

fig, ax = plt.subplots()
sns.boxplot(x = cost_per_oz, showfliers = True, palette = 'viridis', showmeans = True, ax = ax)
ax.set_title('Dog Treat Price Per Ounce')
plt.show()

While I made an effort to convert all measures to ounces, a few products are sold by count rather than by weight. I believe this is what produces the outliers. After further investigation, weight data for these products was not available from the website.

In [None]:
# Same cost-per-ounce box plot with outliers (fliers) hidden for a cleaner presentation.

fig, ax = plt.subplots()
sns.boxplot(x = cost_per_oz, showfliers = False, palette = 'viridis', showmeans = True, ax = ax)
ax.set_title('Dog Treat Price Per Ounce')
plt.show()

In [None]:
#basic stats for cost per ounce across all treats.

cost_per_oz.describe()

In [None]:
#quick math pricing an 8 ounce product based on median and mean cost per ounce.
#the per-ounce figures are computed from cost_per_oz instead of being typed in from an earlier
#output, so the printed prices stay correct if the underlying data changes.

pack_oz = 8
all_median_price = cost_per_oz.median() * pack_oz
all_mean_price = cost_per_oz.mean() * pack_oz

print(f'The price of an {pack_oz} oz product based on median cost per ounce is {all_median_price:.2f}.')
print(f'The price of an {pack_oz} oz product based on mean cost per ounce is {all_mean_price:.2f}.')

With the number of outliers present, median cost may be a more accurate measure to consider for pricing of a new product.

In [None]:
#understanding common pack sizes: summary statistics of the 'size' column.

treatsdf['size'].describe()

The 50th percentile (median) product size is 8 ounces.

In [None]:
# Store the cost-per-ounce Series as a column so it is carried into tables derived from treatsdf.
treatsdf['cost_per_oz'] = cost_per_oz

In [None]:
# Split the treats into two tables around the 8-ingredient cutoff.

at_or_under_cutoff = treatsdf['ing_count'].le(8)
over_cutoff = treatsdf['ing_count'].gt(8)

limited_ingredient = treatsdf.loc[at_or_under_cutoff]
other = treatsdf.loc[over_cutoff]

In [None]:
#basic statistics about the limited_ingredient table (treats with 8 or fewer ingredients)

limited_ingredient.describe()

In [None]:
# Restrict to limited ingredient treats (8 or fewer ingredients) that have at least one review,
# to better compare average ratings.

limited_with_reviews = (treatsdf['ing_count'] <= 8) & (treatsdf['reviews'] > 0)
treatsdf.loc[limited_with_reviews, 'rating'].describe()

In [None]:
#Exploring cost per oz for limited ingredient treats.
#the per-ounce figures are computed from the limited_ingredient table instead of being typed in
#from an earlier output, so the printed prices stay correct if the underlying data changes.

pack_oz = 8
limited_median_price = limited_ingredient['cost_per_oz'].median() * pack_oz
limited_mean_price = limited_ingredient['cost_per_oz'].mean() * pack_oz

print(f'The price of an {pack_oz} oz product based on median cost per ounce for limited ingredient treats is {limited_median_price:.2f}.')
print(f'The price of an {pack_oz} oz product based on mean cost per ounce for limited ingredient treats is {limited_mean_price:.2f}.')

In [None]:
#Exploring cost per oz for all other treats: basic statistics about the other table
#(treats with more than 8 ingredients).

other.describe()

In [None]:
#filtering to non-limited ingredient recipes (more than 8 ingredients) with reviews to better compare rating.
#each comparison is built separately: & binds more tightly than >, so writing
#"ing_count > 8 & (reviews > 0)" would evaluate "8 & (reviews > 0)" first and apply the wrong filter.

over_cutoff = treatsdf['ing_count'] > 8
has_reviews = treatsdf['reviews'] > 0
treatsdf.loc[over_cutoff & has_reviews, 'rating'].describe()

In [None]:
#pricing an 8 ounce product from the median and mean cost per ounce of non-limited ingredient treats.
#the per-ounce figures are computed from the other table instead of being typed in from an
#earlier output, so the printed prices stay correct if the underlying data changes.

pack_oz = 8
other_median_price = other['cost_per_oz'].median() * pack_oz
other_mean_price = other['cost_per_oz'].mean() * pack_oz

print(f'The price of an {pack_oz} oz product based on median cost per ounce for non-limited ingredient treats is {other_median_price:.2f}.')
print()
print(f'The price of an {pack_oz} oz product based on mean cost per ounce for non-limited ingredient treats is {other_mean_price:.2f}.')