In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re

In [None]:
# Load the two input tables from the sibling data directory (paths are relative to the
# notebook's working directory).
treatsdf = pd.read_csv('../data/treatscleaned_final.csv')
ingcount = pd.read_csv('../data/ingredientscount.csv')

In [None]:
# Preview the first rows of the treats table.
treatsdf.head()

In [None]:
# Preview the first rows of the ingredient-count table.
ingcount.head()

In [None]:
# NOTE(review): this repeats the earlier treatsdf.head() preview; treatsdf has not been
# modified since then, so this cell is a candidate for removal.
treatsdf.head()

In [None]:
#building the 'Yes'/'No' labels that mark whether each treat is limited ingredient.
#'Limited Ingredient' has no regulatory meaning, so 8 is used as the cutoff: it is the maximum
#number of ingredients in our existing homemade dog treat recipes, and 8 is also the 25%ile.

limited_ing = ['Yes' if count <= 8 else 'No' for count in treatsdf['ing_count']]

In [None]:
# Attach the 'Yes'/'No' labels built above as a new column; the list has one entry per
# row of treatsdf['ing_count'], in row order.
treatsdf['limited_ing'] = limited_ing

In [None]:
# Summary statistics for the treats table.
treatsdf.describe()

In [None]:
# Pairwise correlations between the numeric columns only. treatsdf holds the string column
# 'limited_ing' ('Yes'/'No'), and pandas >= 2.0 raises on non-numeric columns unless
# numeric_only=True is passed explicitly.
treatsdf.corr(numeric_only=True)

In [None]:
#scatterplot of dog treat price (y) against rating (x).

ax = sns.scatterplot(data = treatsdf, x = 'rating', y = 'price')
ax.set_xlabel("Rating")
ax.set_ylabel("Price")
ax.set_title("Do lower cost treats rate higher?")
plt.show()

The scatterplot of price against rating does not imply that cost and rating are related, and the calculated correlation coefficient does not mathematically support such a relationship either.

The research paper "Price Effects in Online Product Reviews: An Analytical Model and Empirical Analysis" appears to confirm that uni-dimensional rating systems are substantially biased by price, with ratings based more on 'perceived value' than on 'perceived quality'.

Li, Xinxin, and Lorin M. Hitt. “Price Effects in Online Product Reviews: An Analytical Model and Empirical Analysis.” MIS Quarterly, vol. 34, no. 4, 2010, pp. 809–31. JSTOR, https://doi.org/10.2307/25750706. Accessed 14 June 2023.

In [None]:
#cost per ounce for each treat: price divided by package size

cost_per_oz = treatsdf['price'].div(treatsdf['size'])

#storing the result on the dataframe as the cost_per_oz column

treatsdf['cost_per_oz'] = cost_per_oz

In [None]:
#boxplot of the cost per ounce distribution, with outlier points (fliers) and the mean marker shown

ax = sns.boxplot(x = cost_per_oz, showfliers = True, palette = 'viridis', showmeans = True)
ax.set_title('Dog Treat Price Per Ounce')
plt.show()

While I made an effort to convert all measures to ounces, a few products are sold by count rather than by weight. I believe this is what produces the outliers. After further investigation, weight data for these products was not available from the website, so outliers will be removed wherever price is concerned moving forward.

In [None]:
#finding quartiles to calculate interquartile range to filter cost_per_oz outliers in upcoming visual.
#np.nanpercentile ignores missing cost_per_oz values; np.percentile would return NaN for both
#quartiles if even one value were missing, which would silently empty the filtered dataframe later.

q3, q1 = np.nanpercentile(treatsdf['cost_per_oz'], [75, 25])
print("3rd quartile is ", q3)
print("1st quartile is ", q1)

In [None]:
#interquartile range (IQR) of cost_per_oz: 3rd quartile minus 1st quartile

iqr = q3 - q1
print("IQR is ", iqr)

In [None]:
#upper outlier fence for cost_per_oz: 3rd quartile plus 1.5 times the IQR.
#only the high fence is computed here; no low fence is calculated.

high_outlier = q3 + (1.5 * iqr)

In [None]:
#keeping only the rows whose cost_per_oz is at or below the high outlier limit, for better visualization.

within_limit = treatsdf['cost_per_oz'].le(high_outlier)
ol_removed = treatsdf[within_limit]

In [None]:
#plotting ingredient count against cost per ounce, using the dataframe with cost_per_oz outliers removed.

#marker size range is scaled from the observed ratings. Series.min()/.max() skip missing ratings,
#whereas the built-in min()/max() give order-dependent results when a NaN is present.
minsize = ol_removed['rating'].min() * 15 + 20
maxsize = ol_removed['rating'].max() * 25

#x, y, size and hue are all taken from ol_removed so every plotted variable refers to the same rows
#(previously x came from the unfiltered treatsdf and relied on implicit index alignment).
sns.scatterplot(data = ol_removed, x = 'ing_count', y = 'cost_per_oz', size = 'rating', hue = 'rating',
                palette = 'viridis', sizes = (minsize, maxsize))
plt.xlabel('Ingredient Count')
plt.ylabel('Retail Cost Per Ounce')
plt.title("Are limited ingredient treats more expensive?")

#plotting a line at 8 or my cutoff for 'limited ingredient'
plt.axvline(x = 8, linestyle = ':', color = 'orange')

plt.legend(title = "Rating", bbox_to_anchor = (1.0, 1.0))

#labelling the y ticks as dollar amounts with cents; truncating to whole dollars would give
#fractional ticks (e.g. 0.5 and 1.5) duplicate, misleading labels.
tick_locations = plt.yticks()[0]
tick_labels = ['${:.2f}'.format(tick) for tick in tick_locations]
plt.yticks(tick_locations, tick_labels)

plt.show()

Limited ingredient treats tend to be a bit more expensive. Cost per ounce doesn't appear to correlate directly with ingredient count, however, with a calculated correlation coefficient of -.17. The vertical line represents my cutoff for 'limited ingredient' treats.

With outliers removed, correlation is reported at -.11, suggesting even less of a correlation between price and number of ingredients.

In [None]:
#boxplot of the cost per ounce distribution with the outlier points (fliers) hidden for a cleaner presentation.
#the full cost_per_oz series is still plotted; only the flier markers are suppressed.

ax = sns.boxplot(x = cost_per_oz, showfliers = False, palette = 'viridis', showmeans = True)
ax.set_title('Dog Treat Price Per Ounce')
plt.show()

In [None]:
#basic stats for cost per ounce, with outliers included.

cost_per_oz.describe()

In [None]:
#basic stats for cost per ounce, with high outliers removed.

ol_removed['cost_per_oz'].describe()

In [None]:
# Pairwise correlations between the numeric columns only, on the outlier-filtered rows.
# ol_removed is a subset of treatsdf and so carries the string column 'limited_ing';
# pandas >= 2.0 raises on non-numeric columns unless numeric_only=True is passed explicitly.
ol_removed.corr(numeric_only=True)

In [None]:
#summary statistics for the size column, to understand common pack sizes

treatsdf['size'].describe()

The 50th percentile (median) product size is 8 ounces.

In [None]:
#splitting the outlier-filtered data into two tables at the limited ingredient cutoff of 8

limited_ingredient = ol_removed.loc[ol_removed['ing_count'].le(8)]
other = ol_removed.loc[ol_removed['ing_count'].gt(8)]

In [None]:
#basic statistics for the limited_ingredient table (8 or fewer ingredients), which has outliers removed.

limited_ingredient.describe()

In [None]:
#rating summary for limited ingredient recipes (8 or fewer ingredients) that have at least one review,
#to better compare average ratings.

limited_with_reviews = treatsdf['ing_count'].le(8) & treatsdf['reviews'].gt(0)
treatsdf.loc[limited_with_reviews, 'rating'].describe()

In [None]:
#summary statistics (including cost per oz) for all other treats, i.e. more than 8 ingredients.
#Dataframe other has outliers removed.

other.describe()

In [None]:
#filtering to unlimited ingredient recipes (more than 8 ingredients) with reviews to better compare rating.
#each comparison is parenthesised on its own: & binds tighter than >, so without the parentheses the
#expression is parsed as ing_count > (8 & (reviews > 0)) and the intended filter is not applied.

treatsdf[(treatsdf['ing_count'] > 8) & (treatsdf['reviews'] > 0)]['rating'].describe()