In [None]:
# Import Libraries and Dependencies
import pandas as pd

### 1. Combine and Clean the Data
#### Import CSVs

In [None]:
# Load each year's athletic sales CSV into its own DataFrame.
# The paths are relative, so both files must sit in the working directory.
df_2020, df_2021 = (
    pd.read_csv(csv_name)
    for csv_name in ('athletic_sales_2020.csv', 'athletic_sales_2021.csv')
)

In [None]:
# Preview the first five rows of the 2020 sales data (head() defaults to n=5).
preview_2020 = df_2020.head()
print(preview_2020)

In [None]:
# Preview the first five rows of the 2021 sales data (head() defaults to n=5).
preview_2021 = df_2021.head()
print(preview_2021)


#### Check the data types of each DataFrame

In [None]:
# Inspect the dtype pandas assigned to each column of the 2020 sales data.
dtypes_2020 = df_2020.dtypes
print(dtypes_2020)

In [None]:
# Inspect the dtype pandas assigned to each column of the 2021 sales data.
dtypes_2021 = df_2021.dtypes
print(dtypes_2021)

#### Combine the sales data by rows.

In [None]:
# Stack the 2020 and 2021 sales DataFrames row-wise and reset the index.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# yearly frame keeps its own row labels, so labels repeat in the combined frame.
df_combined = pd.concat([df_2020, df_2021], axis=0, ignore_index=True)

# Non-null count per column, as a quick check on the size of the combined data.
print(df_combined.count())

In [None]:
# Flag, per column, whether the combined data contains any missing value.
null_check = df_combined.isnull().any()
print(null_check)

In [None]:
# Inspect the dtype of each column in the combined data before any conversion.
combined_dtypes = df_combined.dtypes
print(combined_dtypes)

In [1]:
# Parse the "invoice_date" column into datetimes and store it back on the frame.
# This cell needs the import cell to have run first: `pd` must already be
# defined, otherwise it raises NameError on a fresh kernel.
invoice_dates = pd.to_datetime(df_combined['invoice_date'])
df_combined['invoice_date'] = invoice_dates


NameError: name 'pd' is not defined

In [None]:
# Re-inspect the dtypes to confirm "invoice_date" is now a datetime column.
converted_dtypes = df_combined.dtypes
print(converted_dtypes)

### 2. Determine which Region Sold the Most Products

#### Using `groupby`

In [None]:
# Total the number of products sold for every (region, state, city) combination.
# reset_index(name=...) turns the grouped Series back into a DataFrame and
# names the summed column "Total_Products_Sold" in one step.
grouped_df = (
    df_combined
    .groupby(['region', 'state', 'city'])['units_sold']
    .sum()
    .reset_index(name='Total_Products_Sold')
)

# Keep the five locations with the largest totals.
top_5_df = grouped_df.sort_values('Total_Products_Sold', ascending=False).head(5)

print(top_5_df)
                 

#### Using `pivot_table`

In [None]:
# Total the number of products sold for every (region, state, city) combination,
# then rename the summed "units_sold" column to "Total_Products_Sold".
location_keys = ['region', 'state', 'city']
pivot_table = (
    df_combined
    .pivot_table(index=location_keys, values='units_sold', aggfunc='sum')
    .rename(columns={'units_sold': 'Total_Products_Sold'})
)

# Keep the five locations with the largest totals.
top_five_results = pivot_table.sort_values('Total_Products_Sold', ascending=False).head(5)
print(top_five_results)

### 3. Determine which Region had the Most Sales

#### Using `groupby`

In [None]:
# Total the sales for every (region, state, city) combination.
# The summed column is named "Total Sales" (with a space) so it matches the
# pivot_table version of this analysis and the retailer cells.
grouped_df = (
    df_combined
    .groupby(['region', 'state', 'city'])['total_sales']
    .sum()
    .reset_index(name='Total Sales')
)

# Keep the five locations with the largest totals and render them as plain
# text without the row index.
top_sales = (grouped_df.sort_values(by='Total Sales', ascending=False)
             .head(5)
             .to_string(index=False))
print(top_sales)

#### Using `pivot_table`

In [None]:
# Total the sales for every (region, state, city) combination and rename the
# summed "total_sales" column to "Total Sales".
my_table = (
    df_combined
    .pivot_table(index=['region', 'state', 'city'],
                 values='total_sales',
                 aggfunc='sum')
    .rename(columns={'total_sales': 'Total Sales'})
)

# Keep the five locations with the largest totals.
top_5_sales = my_table.sort_values(by='Total Sales', ascending=False).head(5)
print(top_5_sales)

### 4. Determine which Retailer had the Most Sales

#### Using `groupby`

In [None]:
# Total the sales for every (retailer, region, state, city) combination.
retailer_keys = ['retailer', 'region', 'state', 'city']
retailer_totals = df_combined.groupby(retailer_keys)['total_sales'].sum()

# Turn the grouped Series back into a DataFrame whose summed column is
# called "Total Sales".
retailer_group_df = retailer_totals.reset_index().rename(columns={'total_sales': 'Total Sales'})

# Keep the five largest totals and render them as plain text without the row index.
top_retailer_rows = retailer_group_df.sort_values('Total Sales', ascending=False).head(5)
top_retailer = top_retailer_rows.to_string(index=False)
print(top_retailer)


#### Using `pivot_table`

In [None]:
# Total the sales for every (retailer, region, state, city) combination and
# rename the summed "total_sales" column to "Total Sales".
retailer_table = (
    df_combined
    .pivot_table(index=['retailer', 'region', 'state', 'city'],
                 values='total_sales',
                 aggfunc='sum')
    .rename(columns={'total_sales': 'Total Sales'})
)

# Keep the five retailer/location combinations with the largest totals.
top_5_retailers = retailer_table.sort_values(by='Total Sales', ascending=False).head(5)
print(top_5_retailers)



### 5. Determine which Retailer Sold the Most Women's Athletic Footwear

In [None]:
# Keep only the rows whose product is exactly "Women's Athletic Footwear".
is_womens_footwear = df_combined['product'].eq("Women's Athletic Footwear")
womens_shoes_df = df_combined.loc[is_womens_footwear]


#### Using `groupby`

In [None]:
# Total the women's athletic footwear units sold for every
# (retailer, region, state, city) combination.
# The aggregation must run on the filtered frame (womens_shoes_df); grouping
# df_combined would count units for every product, not just women's footwear.
womens_shoe_df = womens_shoes_df.groupby(['retailer', 'region', 'state', 'city'])['units_sold'].sum()

# Turn the grouped Series back into a DataFrame and name the summed column
# "Womens_Footwear_Units_Sold".
womens_shoe_df_adj = womens_shoe_df.reset_index(name="Womens_Footwear_Units_Sold")

# Keep the five largest totals and render them as plain text without the row index.
top_womems_retailer = (womens_shoe_df_adj.sort_values(by='Womens_Footwear_Units_Sold', ascending=False)
                       .head(5)
                       .to_string(index=False))
print(top_womems_retailer)

#### Using `pivot_table`

In [None]:
# Total the women's athletic footwear units sold for every
# (retailer, region, state, city) combination.
# Pivot the filtered frame (womens_shoes_df) directly, so the totals cover
# women's footwear only and the cell gives the same result when re-run
# (it no longer rewrites another cell's variable).
womens_table = womens_shoes_df.pivot_table(index=['retailer', 'region', 'state', 'city'],
                                           values='units_sold',
                                           aggfunc='sum')

# Rename the "units_sold" column to "Womens_Footwear_Units_Sold".
womens_table = womens_table.rename(columns={'units_sold': 'Womens_Footwear_Units_Sold'})

# Keep the five retailer/location combinations with the largest totals.
top_5_w_table = womens_table.sort_values('Womens_Footwear_Units_Sold', ascending=False).head(5)

print(top_5_w_table)


### 6. Determine the Day with the Most Women's Athletic Footwear Sales

In [None]:
# Build a pivot table of women's athletic footwear sales, with "invoice_date"
# as the index and the summed "total_sales" as the values.
footwear_mask = df_combined['product'].eq("Women's Athletic Footwear")
womens_shoes_table_df = df_combined.loc[footwear_mask]

# Sum the sales per invoice date and rename the column to "Total Sales".
womens_pivot_table = (
    womens_shoes_table_df
    .pivot_table(index='invoice_date', values='total_sales', aggfunc='sum')
    .rename(columns={'total_sales': 'Total Sales'})
)

# Show the table.
print(womens_pivot_table)

In [None]:
# Re-bin the pivot table into calendar days and total the sales in each bin.
daily_sales = womens_pivot_table.resample('D').sum()

# Order the days from highest to lowest "Total Sales" (descending).
sorted_daily_sales = daily_sales.sort_values('Total Sales', ascending=False)
print(sorted_daily_sales)

# idxmax() returns, per column, the index label (date) of the largest value.
highest_sales_row = sorted_daily_sales.idxmax()

# Pick out the best date and look up the sales total recorded for it.
highest_sales_date = highest_sales_row.loc['Total Sales']
highest_sales_value = sorted_daily_sales.at[highest_sales_date, 'Total Sales']

print(f"The day with the highest sales for women's athletic footwear was {highest_sales_date.strftime('%Y-%m-%d')} with a total of 

### 7. Determine the Week with the Most Women's Athletic Footwear Sales

In [None]:
# Re-bin the pivot table into weekly bins and total the sales in each bin.
weekly_totals = womens_pivot_table.resample('W')
weekly_sales = weekly_totals.sum()

# Order the weeks from highest to lowest "Total Sales" (descending).
sorted_weekly_sales = weekly_sales.sort_values('Total Sales', ascending=False)
print(sorted_weekly_sales)


In [None]:
# Pull the best week from the weekly totals, which are sorted in descending
# order, so the first row holds the largest "Total Sales".
highest_sales_week = sorted_weekly_sales.index[0]
highest_sales_value = sorted_weekly_sales.iloc[0]['Total Sales']

# resample('W') uses Sunday-anchored weekly bins that are closed and labelled
# on the right, so the index label is the LAST day of the week. The week
# therefore runs from six days before the label up to the label itself.
week_end = highest_sales_week.strftime('%Y-%m-%d')
week_start = (highest_sales_week - pd.Timedelta(days=6)).strftime('%Y-%m-%d')

print(f"The week with the highest sales for women's athletic footwear was from {week_start} to {week_end}, with a total of ${hi