**Amazon UK Product Data: Categories, Prices and Ratings**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

In [None]:
# Relative path: the CSV is resolved against the notebook's working directory.
data = r"amz_uk_price_prediction_dataset.csv"
amz_data = pd.read_csv(filepath_or_buffer=data)

In [None]:
# List the column names of the loaded dataset.
amz_data.columns

In [None]:
# Preview the first rows of the dataset (head() shows five rows by default).
amz_data.head()

In [None]:
# Five categories with the most listings.
# count() only counts rows whose "asin" is not missing.
(
    amz_data
    .groupby(["category"])["asin"]
    .count()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Frequency table: one row per category, a single "frequency" column,
# ordered from the most to the least frequent category.
amz_cats = (
    pd.crosstab(index=amz_data["category"], columns="frequency")
    .sort_values("frequency", ascending=False)
)
amz_cats.head(5)

In [None]:
# Bar chart of the number of listings per category.
# amz_cats is sorted by descending frequency, so row 0 is the most frequent
# category; it is left out of the plot (explicit positional slice via .iloc).
cats_to_plot = amz_cats.iloc[1:]

fig, ax = plt.subplots(figsize=(30, 7))
sns.barplot(data=cats_to_plot, x=cats_to_plot.index, y="frequency", palette="mako", ax=ax)
# Rotate after plotting so the rotation applies to the category tick labels.
ax.tick_params(axis="x", labelrotation=90)
ax.set(
    xlabel="Category",
    ylabel="Number of listings",
    title="Listings per category (most frequent category excluded)",
);

In [None]:
# Column names again, as a reference for the price analysis that follows.
amz_data.columns

In [None]:
# Summary statistics of the price column, rounded to two decimals.
amz_data["price"].describe().round(2)

In [None]:
# Most frequent price; mode() returns a Series because several values can tie.
amz_data["price"].mode()

In [None]:
# Median price.
amz_data["price"].median()

**An average that is almost ten times as high as the mode signifies a right-skewed distribution.**

In [None]:
# Dispersion measures for the product price.
variance = amz_data["price"].var()        # sample variance (pandas default ddof=1)
st_deviation = amz_data["price"].std()    # sample standard deviation (ddof=1)
# Named price_range instead of `range` so the built-in range() is not
# shadowed for every cell that runs afterwards.
price_range = amz_data["price"].max() - amz_data["price"].min()
quantiles_price = amz_data["price"].quantile([0.25, 0.5, 0.75])


variance, st_deviation, price_range, quantiles_price

**With the standard deviation being roughly 345$, it appears that product prices vary greatly.**

In [None]:
# Mean price per category, highest first.
avg_prices = amz_data.groupby(["category"])["price"].mean().sort_values(ascending=False)

# Bar chart of the mean price per category.
fig, ax = plt.subplots(figsize=(30, 7))
sns.barplot(x=avg_prices.index, y=avg_prices.values, palette="mako", ax=ax)
# Rotate after plotting so the rotation applies to the category tick labels.
ax.tick_params(axis="x", labelrotation=90)
ax.set(xlabel="Category", ylabel="Mean price", title="Mean price per category");

In [None]:
# Combine mean price and listing frequency into one per-category table.
# Both frames are indexed by category, so the default index-on-index join
# lines them up; no explicit `on` key is needed.
avg_prices = pd.DataFrame(avg_prices)
combined_cats = avg_prices.join(amz_cats)

In [None]:
# Order the combined table from the most to the least frequent category.
combined_cats = combined_cats.sort_values("frequency", ascending=False)

In [None]:
# Preview the first rows of the combined per-category table.
combined_cats.head()

In [None]:
# Mean price vs. number of listings, one point per category.
# Row 0 of combined_cats (the most frequent category after the descending
# sort by frequency) is left out; .iloc makes the positional slice explicit.
ax = sns.scatterplot(data=combined_cats.iloc[1:], x="price", y="frequency", color="gray")
ax.set(
    xlabel="Mean price per category",
    ylabel="Number of listings",
    title="Category mean price vs. number of listings",
);

In [None]:
# Distribution of the per-category mean prices (row 0 of combined_cats is skipped).
# The values are category means, not individual product prices, so the axes
# are labelled explicitly instead of relying on the column name "price".
ax = sns.histplot(combined_cats["price"].iloc[1:], kde=True, bins=50, color="gray")
ax.set(
    xlabel="Mean price per category",
    ylabel="Number of categories",
    title="Distribution of category mean prices",
);

In [None]:
# Box plot of the per-category mean prices (row 0 of combined_cats is skipped).
# The values are category means, not individual product prices, so the axis
# is labelled explicitly instead of relying on the column name "price".
ax = sns.boxplot(x=combined_cats["price"].iloc[1:], color="gray")
ax.set(xlabel="Mean price per category", title="Spread of category mean prices");

In [None]:
# Preview the data again as a reference for the rating analysis that follows.
amz_data.head()

In [None]:
# Central tendency of the star ratings.
# mode() returns a Series because several values can tie.
stars = amz_data["stars"]
mean_rating, median_rating, mode_rating = stars.mean(), stars.median(), stars.mode()

mean_rating, median_rating, mode_rating

In [None]:
# Histogram of the star ratings without the rows whose rating is exactly 0
# (presumably products without any rating — TODO confirm).
rated_products = amz_data.loc[amz_data["stars"] != 0]
sns.histplot(data=rated_products, x="stars", color="gray")

In [None]:
# Dispersion measures for the star ratings.
variance_rating = amz_data["stars"].var()        # sample variance (pandas default ddof=1)
st_deviation_rating = amz_data["stars"].std()    # sample standard deviation (ddof=1)
# Range of the ratings (max - min); required by the summary tuple below.
range_rating = amz_data["stars"].max() - amz_data["stars"].min()
quantiles_price_rating = amz_data["stars"].quantile([0.25, 0.5, 0.75])


variance_rating, st_deviation_rating, range_rating, quantiles_price_rating

**Considering the wide gap between 0-star ratings and 5-star ratings, I would say the spread is wide.**

In [None]:
# Shape of the rating distribution.
# pandas reports excess kurtosis (Fisher's definition: a normal distribution gives 0).
rating_values = amz_data["stars"]
skewness_rating, kurtosis_rating = rating_values.skew(), rating_values.kurtosis()

skewness_rating, kurtosis_rating

**A very small but positive skewness points towards a slight right skew. Meanwhile, the negative excess kurtosis of roughly -2 
points towards a platykurtic distribution.**