In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Read the Amazon UK listings CSV into the notebook-wide DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; a project-relative
# path would let the notebook run on other machines.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/rath_/Documents/GitHub/lab-eda-univariate/amz_uk_price_prediction_dataset.csv"
)

In [None]:
# Show the loaded DataFrame as this cell's output for a first look at the data.
df

In [None]:
# Frequency Tables. Generate a frequency table for the product category.

In [None]:
# Count listings per product category (value_counts orders them from most to
# least frequent) and expose the result as a two-column table.
category_frequency = (
    df['category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Frequency')
)

In [None]:
# Print a heading followed by the category frequency table on the next line.
print("Frequency Table for Categories:", category_frequency, sep="\n")

In [None]:
# Which are the top 5 most listed product categories?

In [None]:
# First five rows of the frequency table, i.e. the five most listed categories.
category_frequency.iloc[:5]

In [None]:
# Visualizations:
# Display the distribution of products across different categories using a bar chart. If you face problems understanding the chart,
# do it for a subset of top categories.
# For a subset of top categories, visualize their proportions using a pie chart. Does any category dominate the listings?

In [None]:
# Bar chart of the listing count for every category, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(category_frequency['Category'], category_frequency['Frequency'])
ax.set_title('Distribution of Products Across Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
# Draw the category tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
fig.tight_layout()
plt.show()

In [None]:
# Keep only the first five rows of the frequency table for the pie chart.
top_categories = category_frequency.iloc[:5]

In [None]:
# Pie chart showing each top category's share of the top-category listings.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    top_categories['Frequency'],
    labels=top_categories['Category'],
    autopct='%1.1f%%',   # one-decimal percentage label on every wedge
    startangle=140,
)
ax.set_title('Proportions of Top Categories')
# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')
plt.show()

In [None]:
# Delving into Product Pricing
# Business Question: How are products priced on Amazon UK, and are there specific price points or ranges that are more common?
# Measures of Centrality:
# Calculate the mean, median, and mode for the price of products.
# What's the average price point of products listed? How does this compare with the most common price point (mode)?

In [None]:
# Central tendency of the listing prices.
price = df['price']
mean_price = price.mean()
median_price = price.median()
# mode() returns every value tied for most frequent; keep the first one.
mode_price = price.mode().iloc[0]

# Report the three measures, one per line.
for label, value in (
    ("Mean price:", mean_price),
    ("Median price:", median_price),
    ("Mode price:", mode_price),
):
    print(label, value)

# Put the average next to the most common price point for comparison.
print("\nAverage price point:", mean_price)
print("Most common price point (mode):", mode_price)

In [None]:
# Measures of Dispersion:
# Determine the variance, standard deviation, range, and interquartile range for product price.
# How varied are the product prices? Are there any indicators of a significant spread in prices?

In [None]:
# Dispersion of the listing prices: variance, standard deviation, range and
# interquartile range. Everything is computed with pandas so that missing
# values are skipped consistently by every statistic.
price = df['price']
# ddof=0 keeps the population formulas that np.var / np.std apply by default,
# so the reported numbers do not change.
variance_price = price.var(ddof=0)
std_dev_price = price.std(ddof=0)
range_price = price.max() - price.min()
# Series.quantile ignores NaN, whereas np.percentile returns NaN for both
# quartiles as soon as a single price is missing.
q1, q3 = price.quantile([0.25, 0.75])
iqr_price = q3 - q1

In [None]:
# Report the price dispersion measures, one per line.
for label, value in (
    ("Variance of price:", variance_price),
    ("Standard deviation of price:", std_dev_price),
    ("Range of price:", range_price),
    ("Interquartile range of price:", iqr_price),
):
    print(label, value)

# Short verdict on variability, repeating the two spread figures behind it.
print("\nProduct prices vary significantly.")
for label, value in (
    ("The range of prices is:", range_price),
    ("The interquartile range of prices is:", iqr_price),
):
    print(label, value)

In [None]:
# Visualizations:
# Is there a specific price range where most products fall? Plot a histogram to visualize the distribution of product prices.
# If it's hard to read these diagrams, think about why that is, and explain how it could be solved.
# Are there products that are priced significantly higher than the rest? Use a box plot to showcase the 
# spread and potential outliers in product pricing.

In [None]:
# Histogram of product prices (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Product Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Horizontal box plot of product prices to show spread and potential outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['price'], color='lightgreen', ax=ax)
ax.set_title('Box Plot of Product Prices')
ax.set_xlabel('Price')
plt.show()

In [None]:
# If the diagrams are hard to read, it could be due to a large number of data points or skewed distributions. 
# You can improve readability by adjusting the number of bins in the histogram or using log scale for the axes. 
# Additionally, you can filter out extreme outliers to focus on the main distribution.

In [None]:
# 3: Unpacking Product Ratings
# Business Question: How do customers rate products on Amazon UK, and are there any patterns or tendencies in the ratings?
# Measures of Centrality: Calculate the mean, median, and mode for the rating of products.
# How do customers generally rate products? Is there a common trend?

In [None]:
# Central tendency of the product ratings.
stars = df['stars']
mean_stars = stars.mean()
median_stars = stars.median()
# mode() returns every value tied for most frequent; keep the first one.
mode_stars = stars.mode().iloc[0]

# Report the three measures, one per line.
for label, value in (
    ("Mean stars:", mean_stars),
    ("Median stars:", median_stars),
    ("Mode stars:", mode_stars),
):
    print(label, value)

# Short verdict on the rating trend, repeating the figures behind it.
print("\nCustomers generally rate products positively.")
for label, value in (
    ("The mean stars is:", mean_stars),
    ("The median stars is:", median_stars),
    ("The mode stars is:", mode_stars),
):
    print(label, value)

In [None]:
# List the DataFrame's column labels.
print(df.columns)

In [None]:
# Measures of Dispersion:
# Determine the variance, standard deviation, and interquartile range for product rating.
# Are the ratings consistent, or is there a wide variation in customer feedback?

In [None]:
# Dispersion of the product ratings: variance, standard deviation and
# interquartile range. Everything is computed with pandas so that missing
# values are skipped consistently by every statistic.
stars = df['stars']
# ddof=0 keeps the population formulas that np.var / np.std apply by default,
# so the reported numbers do not change.
variance_stars = stars.var(ddof=0)
std_dev_stars = stars.std(ddof=0)
# Series.quantile ignores NaN, whereas np.percentile returns NaN for both
# quartiles as soon as a single rating is missing.
q1, q3 = stars.quantile([0.25, 0.75])
iqr_stars = q3 - q1

# Print the calculated measures of dispersion
print("Variance of stars:", variance_stars)
print("Standard deviation of stars:", std_dev_stars)
print("Interquartile range of stars:", iqr_stars)

# Assess the consistency or variation in customer feedback
print("\nCustomer feedback for product ratings shows some variation.")
print("The variance of stars is:", variance_stars)
print("The standard deviation of stars is:", std_dev_stars)
print("The interquartile range of stars is:", iqr_stars)

In [None]:
# Shape of the Distribution:
# Calculate the skewness and kurtosis for the rating column.
# Are the ratings normally distributed, or do they lean towards higher or lower values?

In [None]:
# Shape of the rating distribution: skewness and kurtosis of the 'stars' column.
from scipy.stats import skew, kurtosis

# Drop missing ratings first. With scipy's default nan_policy='propagate', one
# NaN makes both statistics NaN, and NaN fails both the `< 0` and `> 0` tests
# below, so it would be reported as "symmetric" and "mesokurtic (normal)".
stars = df['stars'].dropna()
skewness = skew(stars)
# kurtosis() defaults to Fisher's definition (excess kurtosis, normal == 0),
# which is why the classification below compares against zero.
kurt = kurtosis(stars)

# Print the calculated skewness and kurtosis
print("Skewness of stars:", skewness)
print("Kurtosis of stars:", kurt)

# Classify the direction of the skew.
if skewness < 0:
    skewness_desc = "negatively skewed"
elif skewness > 0:
    skewness_desc = "positively skewed"
else:
    skewness_desc = "symmetric"

# Classify the tail weight relative to a normal distribution.
if kurt < 0:
    kurtosis_desc = "platykurtic"
elif kurt > 0:
    kurtosis_desc = "leptokurtic"
else:
    kurtosis_desc = "mesokurtic (normal)"

print("\nThe distribution of ratings is", skewness_desc, "and", kurtosis_desc)

In [None]:
# Visualizations:
# Plot a histogram to visualize the distribution of product ratings. Is there a specific rating that is more common?

In [None]:
# Histogram of product ratings (20 bins) drawn on an explicit Axes.
# pandas and matplotlib are already imported in the notebook's first cell, so
# the redundant re-imports that used to sit here were removed.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['stars'], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Product stars')
ax.set_xlabel('stars')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [1]:
# Display the DataFrame again.
# NOTE(review): the saved output of this cell is a NameError and its execution
# count is 1, so it was run on a fresh kernel before the data-loading cell.
# Re-run the notebook top to bottom (Restart & Run All) or remove this cell.
df

NameError: name 'df' is not defined