In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import entropy

# Product category in plural form (e.g. "Kühlschränke"); it is used both as the folder name and as the file prefix
product = "Lautsprecher"

# Location of the cleaned data for the chosen product
input_csv = f"{product}/{product}_clean.csv"
# input_csv = f"{product}/{product}_first_timepoint_clean.csv"  # uncomment when using only first timepoint

# Read 'Vendor' as string, parse 'Date' as datetime and use the first CSV column as the index
df = pd.read_csv(input_csv, index_col=0, parse_dates=['Date'], dtype={'Vendor': str})

display(df)

**1. Mean Coefficient of Variation among Vendors (excluding variation over time) of each model**


In [None]:
# 'coef_var_among_vendors' is read from the cleaned data (per the original note it holds one value per
# Model & Date combination). Averaging it per Model yields a single mean coefficient of variation per model.
avg_coef_var = (
    df.groupby('Model')['coef_var_among_vendors']
    .mean()
    .reset_index()
    .rename(columns={'coef_var_among_vendors': 'mean_coef_var'})
)

# Show the models from lowest to highest mean coefficient of variation
display(avg_coef_var.sort_values(by='mean_coef_var', ascending=True))

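Prices are rounded to the nearest multiple of 1% of each model's median price before the following metrics are computed. The rounding is not needed for mean_coef_var, which is calculated above.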

In [None]:
def round_prices_to_one_percent(sub_df):
    """Round a model's prices to the nearest multiple of 1% of its median price.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows belonging to one model. Must contain the columns 'Median' and
        'Price_w/o_shipping'. The first 'Median' value is used as the model's
        median price.

    Returns
    -------
    pandas.DataFrame
        A copy of ``sub_df`` whose 'Price_w/o_shipping' column is rounded.
        The input frame is left untouched, because mutating the group inside
        ``groupby(...).apply`` is not supported by pandas.
    """
    # Nothing to round (and no median to read) in an empty group
    if sub_df.empty:
        return sub_df.copy()

    model_median_price = sub_df['Median'].iloc[0]

    # The rounding step is 1% of the model's median price
    rounding_factor = model_median_price * 0.01

    # Fall back to a step of 1 when the step is unusable: a zero step would divide by zero,
    # a NaN/infinite step (missing median) would turn every price of the model into NaN
    if rounding_factor == 0 or not np.isfinite(rounding_factor):
        rounding_factor = 1

    # Work on a copy so the caller's data is never modified in place
    rounded = sub_df.copy()
    rounded['Price_w/o_shipping'] = (rounded['Price_w/o_shipping'] / rounding_factor).round() * rounding_factor

    return rounded

# Apply the rounding model by model; group_keys=False keeps 'Model' out of the resulting index
offers_by_model = df.groupby('Model', group_keys=False)
df = offers_by_model.apply(round_prices_to_one_percent)

display(df)

**2. Number of times vendors changed prices for a model**

In [None]:
# num_price_changes is only meaningful when the data covers at least 30 days;
# this flag is used at the end to decide whether the column is kept
observed_span = df['Date'].max() - df['Date'].min()
long_timeframe = observed_span.days >= 30

# For every Model / Vendor / Manufacturer combination, count how many different (rounded) prices occur in the dataset
distinct_prices = (
    df.groupby(['Model', 'Vendor', 'Manufacturer'])['Price_w/o_shipping']
    .nunique()
    .reset_index(name='Distinct_Price_Count')
)

# The number of price changes is approximated as "distinct prices minus one"
# NOTE(review): a price that returns to an earlier value (A -> B -> A) counts as one change here - confirm this is intended
price_changes = distinct_prices.assign(num_price_changes=distinct_prices['Distinct_Price_Count'] - 1)

# Average that number over all vendors of a model
price_changes = price_changes.groupby('Model')['num_price_changes'].mean().reset_index()

display(price_changes)

**3. Maximum share of offerings at the same price**

In [None]:
# Number of offers at each (rounded) price, per Model and Date
price_counts = (
    df.groupby(['Model', 'Date', 'Price_w/o_shipping'])
    .size()
    .reset_index(name='Count_of_prices')
)

# Already mark the lowest price of every Model & Date here; the next metric filters on this flag
lowest_price = price_counts.groupby(['Model', 'Date'])['Price_w/o_shipping'].transform('min')
price_counts['Is_Min_Price'] = price_counts['Price_w/o_shipping'].eq(lowest_price)
price_counts = price_counts[['Model', 'Date', 'Price_w/o_shipping', 'Count_of_prices', 'Is_Min_Price']]

# Total number of offers per Model and Date
offer_counts = df.groupby(['Model', 'Date']).size().reset_index(name='total_offers')

# Share of offers that sit at the most frequent price, per Model and Date
counts_per_day = price_counts.groupby(['Model', 'Date'])['Count_of_prices']
same_price_pct = (counts_per_day.max() / counts_per_day.sum()).reset_index(name='same_price_pct')

# Attach the total number of offers
same_price_pct = same_price_pct.merge(offer_counts, on=['Model', 'Date'])

# Attach the manufacturer of each model
manufacturer_model = df[['Model', 'Manufacturer']].drop_duplicates()
same_price_pct = same_price_pct.merge(manufacturer_model, on='Model')

# Average the share over all dates to get one value per model
same_price_pct = same_price_pct.groupby(by='Model')['same_price_pct'].mean(numeric_only=False).reset_index()

display(same_price_pct)

**4. Share of offerings at the cheapest price**


In [None]:
# Same idea as same_price_pct, but restricted to the rows holding the lowest price of each Model & Date.
# The count column is renamed to 'count_cheap_same_price', then manufacturer and total offers are attached.
cheap_same_price = (
    price_counts.loc[price_counts['Is_Min_Price'] == True]
    .rename(columns={'Count_of_prices': 'count_cheap_same_price'})
    .merge(manufacturer_model, on='Model')
    .merge(offer_counts, on=['Model', 'Date'])
)

# Share of all offers of a Model & Date that are at the cheapest price
offers_at_cheapest = cheap_same_price['count_cheap_same_price']
cheap_same_price['cheapest_same_price_pct'] = offers_at_cheapest / cheap_same_price['total_offers']

# Average the share over all dates to get one value per model
cheap_same_price = cheap_same_price.groupby(by='Model')['cheapest_same_price_pct'].mean(numeric_only=False).reset_index()

display(cheap_same_price)

**5. Entropy**

In [None]:
def calculate_entropy(prices, base=None):
    """Shannon entropy of the empirical distribution of the given prices.

    Every distinct value is treated as its own category, so the result only
    depends on how often each price occurs.

    Parameters
    ----------
    prices : array-like
        Price values; integers and floats are both accepted.
    base : float, optional
        Logarithm base passed on to ``scipy.stats.entropy``. The default
        (None) gives the natural logarithm.

    Returns
    -------
    float
        Entropy of the relative frequencies of the distinct prices.
    """
    values = np.asarray(prices)

    # np.unique counts occurrences of each distinct value and, unlike np.bincount,
    # is not restricted to non-negative integers
    _, counts = np.unique(values, return_counts=True)

    probabilities = counts / values.size
    return entropy(probabilities, base=base)

def _histogram_entropy(model_prices):
    """Base-2 entropy of a histogram of one model's prices, using as many bins as there are observations."""
    # NOTE(review): np.histogram builds equal-width bins, so distinct prices may share a bin -
    # confirm this is intended rather than one bin per unique price
    bin_counts, _ = np.histogram(model_prices, bins=len(model_prices))
    return entropy(bin_counts, base=2)

# One entropy value per model, computed over all of the model's (rounded) prices
entropy_df = (
    df.groupby('Model')['Price_w/o_shipping']
    .apply(_histogram_entropy)
    .reset_index(name='entropy')
)

display(entropy_df)

**Merging all metrics into one df**

In [None]:
# Combine the per-model metric tables by joining on 'Model'.
# Joining on the key (instead of concatenating side by side) guarantees that every metric ends up in the
# row of its own model, even if one of the tables is missing a model or lists the models in another order.
metric_tables = [avg_coef_var, price_changes, same_price_pct, cheap_same_price, entropy_df]

results = metric_tables[0]
for metric_table in metric_tables[1:]:
    # Outer join: a model missing from one table keeps its other metrics and gets NaN for the missing one
    results = pd.merge(results, metric_table, on='Model', how='outer')

# Attach the manufacturer of each model
results = pd.merge(results, manufacturer_model, on='Model')

# Drop the 'num_price_changes' column if the timeframe is not long enough
if not long_timeframe:
    results = results.drop(columns=['num_price_changes'])

display(results)

results.to_csv(f"{product}/{product}_results_per_model.csv", index=False)
#results.to_csv(f"{product}/{product}_first_timepoint_results.csv", index=False) # uncomment when analyzing only the first timepoint