In [None]:
# Run once if the installed seaborn is outdated (uncomment the next line):
# %pip install --upgrade seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from scipy.stats import chi2_contingency
from scipy.stats.contingency import association

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline 

In [None]:
# Path to the raw CSV, relative to the notebook's working directory.
data = r"amz_uk_price_prediction_dataset.csv"
# Load the whole file into a DataFrame used by every cell below.
amz_data = pd.read_csv(filepath_or_buffer=data)

In [None]:
# Quick look at the first three rows to confirm the load worked.
amz_data.head(n=3)

In [None]:
# Contingency table: one row per category, one column per isBestSeller value,
# cells hold the number of rows with that combination.
crosstab_results = pd.crosstab(
    index=amz_data["category"],
    columns=amz_data["isBestSeller"],
)

In [None]:
# Order categories by their count in the `True` (best-seller) column, largest first.
crosstab_results.sort_values(True, ascending=False)

In [None]:
# Categories in which best-seller rows outnumber non-best-seller rows.
crosstab_results.loc[crosstab_results[True].gt(crosstab_results[False])]

In [None]:
# Chi-square test of independence on the category x isBestSeller table.
# Only the statistic and the p-value are needed; the remaining results
# (degrees of freedom, expected frequencies) are discarded.
chi2_statistic, chi2_p_value = chi2_contingency(crosstab_results)[:2]
(chi2_statistic, chi2_p_value)

In [None]:
# Cramér's V for the same contingency table, shown to three decimals.
cramers_v = association(crosstab_results, method="cramer")
round(cramers_v, 3)

In [None]:
# Stacked bar chart of best-seller vs. non-best-seller counts per category.
# The size has to be passed to .plot() itself: calling plt.figure(figsize=...)
# afterwards opens a second, empty figure and leaves this chart at the default size.
crosstab_results.plot(kind="bar", stacked=True, figsize=(10, 6));

In [None]:
# First and third quartile of price, and the interquartile range between them.
low_q, high_q = amz_data["price"].quantile([0.25, 0.75])
IQR = high_q - low_q

In [None]:
# Tukey fences: anything more than 1.5 * IQR outside the quartiles is a potential outlier.
lower_fence = low_q - 1.5 * IQR
upper_fence = high_q + 1.5 * IQR
is_outlier = (amz_data["price"] > upper_fence) | (amz_data["price"] < lower_fence)

pot_outliers = amz_data[is_outlier]
# Filter with the row-wise boolean mask so outlier rows are really dropped.
# Indexing with `~amz_data.isin(pot_outliers)` is a cell-wise mask: it keeps
# every row and just overwrites the outlier rows with NaN (upcasting dtypes).
data_removed = amz_data[~is_outlier]

In [None]:
# Categories ranked by how many rows with a non-missing price they contain,
# largest first. Computed once and reused for both subsets.
category_sizes = (
    data_removed.groupby("category")["price"]
    .count()
    .sort_values(ascending=False)
)
# Rows belonging to the 20 (resp. 10) largest categories.
head20 = data_removed[data_removed["category"].isin(category_sizes.index[:20])]
head10 = data_removed[data_removed["category"].isin(category_sizes.index[:10])]

In [None]:
# Price distribution per category for the 20 largest categories
# (horizontal violins: price on x, category on y).
sns.violinplot(data=head20, x="price", y="category")

In [None]:
# head20.groupby(["category"])["price"].median().sort_values(ascending=False)

In [None]:
# Mean price per category among the 10 largest categories,
# kept as a one-column DataFrame indexed by category.
head10_means = head10.groupby("category")[["price"]].mean()

In [None]:
# Mean price per category, highest first.
# The category labels are moved from the index into a regular column so that
# x and y come from the SAME sorted frame; passing `head10_means.index`
# (unsorted) next to the sorted data would not keep labels and bar heights
# in the same order.
head10_means_sorted = (
    head10_means.sort_values(by="price", ascending=False).reset_index()
)
ax = sns.barplot(head10_means_sorted, x="category", y="price", palette="mako")
# Rotate the labels on the Axes that now holds the plot, instead of relying
# on pyplot's implicit current axes before anything is drawn.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Distribution of star ratings per category for the 10 largest categories.
ax = sns.boxplot(head10, x="category", y="stars")
# Apply the label rotation to the Axes seaborn returned, after the plot
# exists, rather than to pyplot's implicit current axes beforehand.
ax.tick_params(axis="x", labelrotation=90)

In [None]:
# Correlation between price and star rating (Series.corr's default method),
# shown to three decimals.
correlation = data_removed["price"].corr(other=data_removed["stars"])
round(correlation, 3)

**There is no relevant linear correlation between price and star rating.**

In [None]:
# Price against star rating, one point per row of the outlier-filtered data.
sns.scatterplot(data=data_removed, x="stars", y="price")

In [None]:
# Restrict to numeric columns, then compute their pairwise correlations.
data_numerical = data_removed.select_dtypes(include="number")
correlation_matrix = data_numerical.corr()

In [None]:
# Heatmap of the correlation matrix with each coefficient printed in its cell.
sns.heatmap(data=correlation_matrix, cmap="gray", annot=True)

In [None]:
# Q-Q plot of price against a normal distribution, with a standardized
# reference line (line="s").
# Missing prices are dropped first: NaNs would propagate into the sample
# quantiles and the fitted reference line.
# The trailing semicolon stops the returned Figure from being rendered a
# second time as the cell's output.
sm.qqplot(data_removed["price"].dropna(), line="s");