In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Load data and structure

In [None]:
df_amz_ppd = pd.read_csv('./amz_uk_price_prediction_dataset.csv')

In [None]:
df_amz_ppd.head()

In [None]:
df_amz_ppd.dtypes

In [None]:
df_amz_ppd.shape

In [None]:
df_amz_ppd.nunique().sort_values(ascending=False)

In [None]:
df_amz_ppd['category'].unique()


In [None]:
df_amz_cat_count = df_amz_ppd['category'].value_counts()
df_amz_cat_count

## Part 1: Understanding Product Categories
Business Question: What are the most popular product categories on Amazon UK, and how do they compare in terms of listing frequency?

Frequency Tables:

- Generate a frequency table for the product category.
- Which are the top 5 most listed product categories?

Visualizations:

- Display the distribution of products across different categories using a bar chart. If you face problems understanding the chart, do it for a subset of top categories.
- For a subset of top categories, visualize their proportions using a pie chart. Does any category dominate the listings?

In [None]:
df_amz_ppd['asin'].unique()

In [None]:
df_amz_asin_count = df_amz_ppd[["asin", "title"]].value_counts()   #"category"
#df_amz_asin_count = df_amz_asin_count.sort_values(ascending=False)
df_amz_asin_count.head(25)

In [None]:
df_amz_cat_freq = pd.crosstab(index = df_amz_ppd["category"], columns = "abs_count")

df_amz_cat_freq = df_amz_cat_freq.reset_index()
df_amz_cat_freq = df_amz_cat_freq.sort_values(by = "abs_count", ascending=False)

df_amz_cat_freq["rel_freq_in%"] = df_amz_cat_freq["abs_count"] / df_amz_cat_freq["abs_count"].sum() *100

df_amz_cat_freq.head(5)

In [None]:
df_top10_abs_count = df_amz_cat_freq[["category", "abs_count"]].nlargest(10, "abs_count")
df_top10_abs_count

In [None]:
top10_rel_freq = df_amz_cat_freq[["category", "rel_freq_in%"]].nlargest(10, "rel_freq_in%")
top10_rel_freq

In [None]:
print(df_amz_cat_freq.head())
print(df_amz_cat_freq.columns)

In [None]:
sns.barplot(x=df_top10_abs_count["abs_count"], y=df_top10_abs_count["category"], palette="Set3")
plt.title("Top 10 Category according to listing")
plt.xlabel("absolut count")
plt.ylabel("category")
plt.show()

In [None]:
sns.barplot(x=top10_rel_freq["rel_freq_in%"], y=top10_rel_freq["category"], palette="Set3")
plt.title("Top 10 category according to listing in %")
plt.xlabel("count in %")
plt.ylabel("category")
plt.show()

In [None]:
plt.pie(
    top10_rel_freq["rel_freq_in%"],
    labels=top10_rel_freq["category"],
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("Set3")
)
plt.title("Top10 categories")
plt.show()

# it's just the top10 =100% - from the top10 Sports&Outdoors is dominating with 83%

## Part 2: Delving into Product Pricing
Business Question: How are products priced on Amazon UK, and are there specific price points or ranges that are more common?

Measures of Centrality:

Calculate the mean, median, and mode for the price of products.
What's the average price point of products listed? How does this compare with the most common price point (mode)?
Measures of Dispersion:

Determine the variance, standard deviation, range, and interquartile range for product price.
How varied are the product prices? Are there any indicators of a significant spread in prices?
Visualizations:

Is there a specific price range where most products fall? Plot a histogram to visualize the distribution of product prices. If its hard to read these diagrams, think why this is, and explain how it could be solved..
Are there products that are priced significantly higher than the rest? Use a box plot to showcase the spread and potential outliers in product pricing.

In [None]:
df_amz_ppd.describe().round(2)

In [None]:
count_price = df_amz_ppd["price"].count()
mean_price = df_amz_ppd["price"].mean()
median_price = df_amz_ppd["price"].median()
mode_price = df_amz_ppd["price"].mode()[0]

count_price, mean_price, median_price, mode_price

Interpretation:

The average is significantly different from the median and the mode. 50% of the products are cheaper than 19 units and 50% are more expensive than 19 units. 
Products with 10 units cost the most less than the median, but they deviate very strongly from the average with 89 units. 
This difference is most likely due to products that are significantly more expensive than 10 or 20 units.

In [None]:
# MODE , VARIANCE, STANDARD DEVIATION, RANGE, 
var_price = df_amz_ppd["price"].var()
std_price = df_amz_ppd["price"].std()
range_price = df_amz_ppd["price"].max()  - df_amz_ppd["price"].min()

var_price, std_price, range_price

Interpretation:

 Yes, there is a strong deviation from the median from around 1800%. A randomly selected product is 345 units away from the median (19 units). This high deviation is also visible in the extreme range of 100000 units

In [None]:
sns.histplot(df_amz_ppd['price'], kde=True, bins=5, color="salmon")

In [None]:
plt.hist(x=df_amz_ppd["price"], bins =30)
plt.show

In [None]:
sns.boxplot(df_amz_ppd['price'], color="lightblue")
plt.show

In [None]:
df_amz_ppd['price_quantile'] = pd.qcut(df_amz_ppd['price'], q=5, labels=['very low', 'low', 'medium', 'high', 'very high'])

df_amz_ppd.price_quantile.value_counts()

In [None]:
skewness_df_amz_ppd = df_amz_ppd["price"].skew()
kurtosis_df_amz_ppd = df_amz_ppd['price'].kurtosis()

skewness_df_amz_ppd, kurtosis_df_amz_ppd

In [None]:
sns.histplot(df_amz_ppd['price'], kde=True, bins=25, color="salmon") #bins=12, (zwischen kde und color)
#plt.ylim(0,100) #fürs reinzoomen
plt.show()

In [None]:
#custom binning
bins = [0, 4, 8, 12, 16, 20, 25, 50, 75, 100, 200, 500, 1000, df_amz_ppd["price"].max()]
labels = ["extremely cheap","very cheap", "cheap", "fairly cheap", "moderate", "slightly moderate", "neutral", "slightly expensive", "somewhat expensive", "fairly expensive", "expensive", "very expensive", "extremely expensive"]

df_amz_ppd["price_bin"] = pd.cut(df_amz_ppd["price"], bins = bins, labels = labels)

df_amz_ppd.head()

In [None]:
df_amz_ppd.price_bin.value_counts()

In [None]:
sns.histplot(df_amz_ppd['price_bin'], kde=True, bins=13, color="salmon")

In [None]:
sns.boxplot(df_amz_ppd['price_bin'], color="lightblue")
plt.show

## Part 3: Unpacking Product Ratings
Business Question: How do customers rate products on Amazon UK, and are there any patterns or tendencies in the ratings?

Measures of Centrality:

Calculate the mean, median, and mode for the rating of products.
How do customers generally rate products? Is there a common trend?
Measures of Dispersion:

Determine the variance, standard deviation, and interquartile range for product rating.
Are the ratings consistent, or is there a wide variation in customer feedback?
Shape of the Distribution:

Calculate the skewness and kurtosis for the rating column.
Are the ratings normally distributed, or do they lean towards higher or lower values?
Visualizations:

Plot a histogram to visualize the distribution of product ratings. Is there a specific rating that is more common?

In [None]:
count_stars = df_amz_ppd["stars"].count()
mean_stars = df_amz_ppd["stars"].mean()
median_stars = df_amz_ppd["stars"].median()
mode_stars = df_amz_ppd["stars"].mode()[0]

count_stars, mean_stars, median_stars, mode_stars

In [None]:
var_stars = df_amz_ppd["stars"].var()
std_stars = df_amz_ppd["stars"].std()
range_stars = df_amz_ppd["stars"].max()  - df_amz_ppd["stars"].min()

var_stars, std_stars, range_stars



In [None]:
skewness_df_amz_ppd = df_amz_ppd["stars"].skew()
kurtosis_df_amz_ppd = df_amz_ppd['stars'].kurtosis()

skewness_df_amz_ppd, kurtosis_df_amz_ppd

In [None]:
sns.histplot(df_amz_ppd['stars'], kde=True, bins=5, color="salmon")

The rating of 1 and 5 out of 5 stars are most common - basically there are the most ratings with 1 out of 5 stars

In [None]:
sns.boxplot(df_amz_ppd['stars'], color="lightblue")
plt.show