# Exploratory Data Analysis (EDA) on Bike Sales Dataset

In [None]:
import pandas as pd
# Load the sales data. The first CSV column becomes the index and
# `parse_dates=True` asks pandas to parse that index as dates.
df = pd.read_csv(
    "../../data/Sales.csv",
    index_col=0,
    parse_dates=True,
)

# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Select every row whose index falls in 2013 via partial-string indexing.
# Assumes the index was parsed into a DatetimeIndex by the load cell
# (index_col=0, parse_dates=True) - TODO confirm with df.index.dtype.
df.loc["2013"]

### Några saker att utforska
- explorera jämförande tal
- kategorier, subkategorier
- länder
- länder - profit
- tid på året
- graf på åren
- gruppera efter gender
- gruppera efter ålder
- produktkategori per år

In [None]:
# Show the column labels of the dataset.
df.columns

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [None]:
# Number of rows per product category, most frequent first.
df["Product_Category"].value_counts()

In [None]:
# Number of rows per country, most frequent first.
df["Country"].value_counts()

In [None]:
# Number of rows per age group, most frequent first.
df["Age_Group"].value_counts()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) using
# pandas' default column selection.
df.describe()

In [None]:
# Same statistics, transposed so each original column becomes a row; the
# "count" column is dropped to leave only the distribution measures.
df.describe().T.drop(columns="count")

In [None]:
# Count missing values per column (`isna` is the canonical name; `isnull`
# is an alias of it).
df.isna().sum()

In [None]:
# Total profit in Germany broken down by customer gender and age group,
# largest segment first. Built as one chain so `df_location` is bound once.
df_location = (
    df.loc[df["Country"] == "Germany"]
    .groupby(["Customer_Gender", "Age_Group"])["Profit"]
    .sum()
    .sort_values(ascending=False)
)
df_location

- explorera jämförande tal
- kategorier, subkategorier
- länder - profit
- tid på året
- graf på åren
- gruppera efter gender
- gruppera efter ålder
- produktkategori per år

### Länder och profit


In [None]:
# Average profit per row for each country, highest first.
(
    df.groupby("Country")[["Profit"]]
    .mean(numeric_only=True)
    .sort_values("Profit", ascending=False)
    .reset_index()
)


In [None]:
# Total profit for each country, highest first.
(
    df.groupby("Country")[["Profit"]]
    .sum(numeric_only=True)
    .sort_values("Profit", ascending=False)
    .reset_index()
)

In [75]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# import cell so a fresh "Restart & Run All" surfaces missing dependencies early.
import duckdb

# Aggregate profit per country in SQL: total, mean and median in one query.
# `FROM df` refers to the pandas DataFrame `df` loaded earlier in the notebook;
# DuckDB resolves it by variable name (replacement scan), so `df` must exist
# in the kernel before this cell runs.
# The query spells identifiers in a different case than the result columns
# (`country` / `total_profit` vs. `Country` / `Total_profit`); this relies on
# DuckDB matching identifiers case-insensitively. The trailing comma after the
# last SELECT item is likewise DuckDB-specific syntax.
# Rows are ordered by total profit, with average profit as the tie-breaker.
df_profit = duckdb.query(
    """
    SELECT 
        country,
        SUM(profit) AS Total_profit,
        MEAN(profit) AS Avg_profit,
        MEDIAN(profit) AS Median_profit,
    FROM df
    GROUP BY
        country
    ORDER BY
        total_profit DESC,
        avg_profit DESC
"""
).df()  # convert the DuckDB relation into a pandas DataFrame

df_profit

Unnamed: 0,Country,Total_profit,Avg_profit,Median_profit
0,United States,11073644.0,282.447687,87.0
1,Australia,6776030.0,283.089489,114.0
2,United Kingdom,4413853.0,324.071439,134.0
3,Canada,3717296.0,262.187615,83.0
4,Germany,3359995.0,302.756803,134.0
5,France,2880282.0,261.891435,99.0


In [None]:
import matplotlib.pyplot as plt

# Bar chart of total profit per country, built from the `df_profit` summary
# table computed in the DuckDB cell, and written to a PNG in the working
# directory.
fig, ax = plt.subplots(nrows=1, figsize=(12, 6), dpi=150)

ax.bar(df_profit["Country"], df_profit["Total_profit"])
ax.set_xlabel("Country")
ax.set_ylabel("Total profit")
ax.set_title("Total profit per country")

fig.savefig("profit_per_country.png")