## Load and preprocess data, then calculate the number of unique algorithmic and non-algorithmic sellers.


In [1]:
import pandas as pd
import numpy as np
# Load the list of sellers flagged as algorithmic and expose their names under
# 'sellerName', the column key the offer data is matched on later.
df_algorithmic_seller_names = (
    pd.read_csv('../Grunddatein/Zwischendatein/PA-Adopters.csv')
    .rename(columns={'AlgorithmicSellerNames': 'sellerName'})
)

# Boolean mask: True for every row that repeats an earlier row in all columns.
duplicate_rows = df_algorithmic_seller_names.duplicated()
print(f"Number of duplicate rows: {duplicate_rows.sum()}")

Number of duplicate rows: 0


In [2]:


# Load the cleaned offer data and parse both timestamp columns.
df = pd.read_csv('../Grunddatein/Zwischendatein/CleanedDataCompleteNoNulls.csv')
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")

# 'crawlTime' is parsed without an explicit format, so pandas has to infer it.
# NOTE(review): passing format=... would make this faster and stricter —
# confirm the layout against the CSV before adding it.
df['crawlTime'] = pd.to_datetime(df['crawlTime'])

# Keep only offers in new condition (German and English spelling).
# NOTE(review): the previous comment also mentioned excluding reactive crawls,
# but no such filter exists in this cell — confirm whether it is still needed.
# .copy() makes the result an independent frame, so the column added below is
# written to it directly instead of to a slice (avoids SettingWithCopyWarning).
df = df[df['condition'].isin(['Neu', 'New'])].copy()

# Distinct seller names among the remaining offers.
distinct_values_verkäufer = df['sellerName'].unique()
print(f"Es gibt insgesamt {len(distinct_values_verkäufer)} verschiedene Verkäufer")

# Flag every offer whose seller appears in the algorithmic-seller list.
df['is_algorithmic'] = df['sellerName'].isin(df_algorithmic_seller_names['sellerName'])

# Build each seller-name selection once and reuse it for counts and name lists.
algo_seller_series = df.loc[df['is_algorithmic'], 'sellerName']
non_algo_seller_series = df.loc[~df['is_algorithmic'], 'sellerName']

# Number of distinct sellers per group (nunique ignores missing names).
num_algo_sellers = algo_seller_series.nunique()
num_non_algo_sellers = non_algo_seller_series.nunique()

print(f"Es gibt insgesamt {num_non_algo_sellers} Non-Algo Verkäufer")
print(f"Es gibt insgesamt {num_algo_sellers} Algo Verkäufer")
# Sanity check: both groups together must account for every distinct seller.
print("Sie Summieren sich richtig auf", num_algo_sellers + num_non_algo_sellers == len(distinct_values_verkäufer))

# One-column DataFrames holding the distinct seller names of each group.
non_algo_seller_names = non_algo_seller_series.unique()
algo_seller_names = algo_seller_series.unique()

df_non_algo_names = pd.DataFrame(non_algo_seller_names, columns=['sellerName'])
df_algo_names = pd.DataFrame(algo_seller_names, columns=['sellerName'])


KeyboardInterrupt



In [None]:
# Print column names, dtypes, non-null counts and memory usage of df.
df.info()

In [None]:
# Offers of the two sellers whose assortments are compared.
# NOTE(review): the variable names (at_memory / skyline_media) do not match the
# seller names filtered on here; they are kept as-is because they are
# notebook-level names.
df_at_memory = df.loc[df['sellerName'].eq('Schuh-Helden')]
df_skyline_media = df.loc[df['sellerName'].eq('Schuh-Lounge24')]

# Distinct ASINs offered by each of the two sellers.
asins_at_memory = set(df_at_memory['asin'].unique())
asins_skyline_media = set(df_skyline_media['asin'].unique())

# ASINs that both sellers offer.
common_asins = asins_at_memory.intersection(asins_skyline_media)

# Show the overlap.
print(common_asins)


In [None]:
# Helper that pulls the hour component out of a datetime-like value.
def get_hour(datetime_val):
    """Return the ``hour`` attribute of a datetime-like value.

    Parameters
    ----------
    datetime_val
        Any object exposing an ``hour`` attribute, e.g. ``datetime.datetime``
        or ``pandas.Timestamp``.

    Returns
    -------
    The value of ``datetime_val.hour``.
    """
    hour_of_day = datetime_val.hour
    return hour_of_day

# Hour of day (0-23) of each offer timestamp. 'time' is already a datetime
# column, so the vectorized .dt accessor replaces a per-row Python call.
df['hourTime'] = df['time'].dt.hour

## Alle Plots

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Extract unique ASINs
unique_asins = df['asin'].unique()

# Draw one price-history chart per ASIN, with one line per seller.
# groupby hands over each ASIN's rows in a single pass instead of re-scanning
# the whole frame with a boolean mask per ASIN; sort=False keeps the ASINs in
# order of first appearance, i.e. the same order as unique_asins.
for asin, asin_rows in df.groupby('asin', sort=False):
    # Keep only the columns needed for the chart.
    asin_df = asin_rows[['time', 'price', 'sellerId', 'is_algorithmic']].copy()

    # Make sure 'time' is a datetime column (no-op if it already is).
    asin_df['time'] = pd.to_datetime(asin_df['time'])

    # Average price per timestamp and seller, so each (time, seller) pair
    # occurs once and the pivot below cannot hit duplicate entries.
    grouped_df = (
        asin_df
        .groupby(['time', 'sellerId', 'is_algorithmic'])['price']
        .mean()
        .reset_index()
    )

    # Tag algorithmic sellers in the legend label (vectorized, no row-wise apply).
    seller_labels = grouped_df['sellerId'].astype(str)
    grouped_df['sellerId'] = seller_labels.mask(
        grouped_df['is_algorithmic'], seller_labels + " (Algorithmic)"
    )

    # Wide layout: one row per timestamp, one column per seller.
    pivot_df = grouped_df.pivot(index='time', columns='sellerId', values='price')

    # Plot the data
    fig, ax = plt.subplots(figsize=(15, 7))
    pivot_df.plot(kind='line', ax=ax)
    ax.set_xlabel('Time')
    ax.set_ylabel('Price')
    ax.set_title(f'Price History Across Sellers for Asin: {asin}')
    fig.tight_layout()

    # Move the legend outside the plot on the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.show()
    # Release the figure; otherwise every ASIN leaves an open figure in memory.
    plt.close(fig)

    print("This is the ASIN", asin)


In [None]:
# Hand-picked ASIN groups used by the plotting cells.
# NOTE(review): the group names presumably describe the pricing pattern seen in
# the corresponding plots — confirm with the author.
normalSeller = [
    'B000MJR8UO',
]
now = [
    'B077T3QZ1H',
    'B0062VH4NM',
]
night = [
    'B001R4BR1O',
    'B00605N1G4',
]
auch_interesant = [
    'B001IL99I4',
    'B00DG89W0W',
]
asins_perfect_matching = [
    'B0062VH4NM',
    'B0797CV4TX',
    'B07RY6RDV7',
    'B077T3QZ1H',
    'B07ZHBJRWF',
    'B07WQPDC72',
]
asins_intresting = [
    'B099NPYRVF',
    'B00J7GVPY8',
    'B0196Q9PVS',
    'B000GISU1M',
    'B09R1QSHN9',
    'B07MTG5V14',
]

## Interessante ASINs

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

asins_perfect_matching = ['B0062VH4NM','B0797CV4TX','B07RY6RDV7','B077T3QZ1H','B07ZHBJRWF','B07WQPDC72',]
asins_intresting = ['B00J7GVPY8','B000GISU1M','B09R1QSHN9']

# Extract unique ASINs
unique_asins = df['asin'].unique()

# Seller IDs left out of the charts: 'amazon' plus a handful of specific IDs
# (a duplicated entry was removed; membership is unchanged).
exclude_sellers = ['amazon', 'AMTZEDNZE6EVF', 'A1WUUK7EBFTPLY', 'A2QA60OP8EX7O6', 'ABJ00Z4TWCDSX', 'ACMWIN42TGI9W']

# Rows of non-excluded sellers; computed once rather than once per ASIN.
is_kept_seller = ~df['sellerId'].isin(exclude_sellers)

# Plot the price history for each ASIN in the chosen list. `normalSeller` comes
# from the ASIN-list cell; substitute another list there to plot other groups.
for asin in normalSeller:
    # Offers for this ASIN from non-excluded sellers, reduced to the needed columns.
    asin_df = df.loc[(df['asin'] == asin) & is_kept_seller, ['time', 'price', 'sellerId', 'is_algorithmic']].copy()

    # Nothing to draw if the ASIN is absent or all of its sellers are excluded;
    # plotting an empty frame would raise.
    if asin_df.empty:
        print("No offers to plot for ASIN", asin)
        continue

    # Make sure 'time' is a datetime column (no-op if it already is).
    asin_df['time'] = pd.to_datetime(asin_df['time'])

    # Map real seller IDs to anonymised names in order of first appearance.
    # The 'amazon' branch only matters if 'amazon' is taken off exclude_sellers.
    unique_sellers = asin_df['sellerId'].unique()
    seller_mapping = {seller: f'seller-{i+1}' if seller != 'amazon' else 'amazon' for i, seller in enumerate(unique_sellers)}

    # Replace seller ids with anonymized names
    asin_df['sellerId'] = asin_df['sellerId'].map(seller_mapping)

    # Average price per timestamp and seller, so each (time, seller) pair
    # occurs once and the pivot below cannot hit duplicate entries.
    grouped_df = (
        asin_df
        .groupby(['time', 'sellerId', 'is_algorithmic'])['price']
        .mean()
        .reset_index()
    )

    # Tag algorithmic sellers in the legend label (vectorized, no row-wise apply).
    seller_labels = grouped_df['sellerId'].astype(str)
    grouped_df['sellerId'] = seller_labels.mask(
        grouped_df['is_algorithmic'], seller_labels + " (Algorithmic)"
    )

    # Wide layout: one row per timestamp, one column per seller.
    pivot_df = grouped_df.pivot(index='time', columns='sellerId', values='price')

    # Print anonymized seller names
    print("Seller IDs for ASIN ", asin, ": ", pivot_df.columns.tolist())

    # Plot the data
    fig, ax = plt.subplots(figsize=(15, 7))
    pivot_df.plot(kind='line', ax=ax)
    ax.set_xlabel('Time')
    ax.set_ylabel('Price')
    ax.set_title(f'Price History Across Sellers for Asin: {asin}')
    fig.tight_layout()

    # Move the legend outside the plot on the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # Save the plot to a PNG file in the working directory, including the
    # legend; an existing file with the same name is overwritten.
    fig.savefig(f'{asin}_plot.png', bbox_inches='tight')

    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

    print("This is the ASIN", asin)


In [None]:
# Preview the first five rows of df.
df.head()

In [None]:
# Offers of one seller for one ASIN.
asin_seller_df = df[(df['asin'] == "B00605N1G4") & (df['sellerId'] == "ABGAQ3TO9PA1P")]

# Mean price across all observations, regardless of the hour.
mean_price_all_day = asin_seller_df['price'].mean()
print(f"Mean price over the whole day: {mean_price_all_day}")

# Hour of day per observation; requires the 'hourTime' column to exist on df.
hour_of_day = asin_seller_df['hourTime']

# Mean price during the early morning (hours 3 and 4, i.e. 03:00 - 05:00).
# Vectorized comparisons replace the per-element lambda.
is_early_morning = (hour_of_day >= 3) & (hour_of_day < 5)
mean_price_early_morning = asin_seller_df.loc[is_early_morning, 'price'].mean()
print(f"Mean price during the early morning (03:00 - 05:00): {mean_price_early_morning}")

# Mean price during the day (hours 5 through 19, i.e. 05:00 - 20:00).
is_daytime = (hour_of_day >= 5) & (hour_of_day < 20)
mean_price_day = asin_seller_df.loc[is_daytime, 'price'].mean()
print(f"Mean price during the day (05:00 - 20:00): {mean_price_day}")


In [None]:
# Price whose longest uninterrupted run is measured, and the tolerance used
# when comparing against it. Prices are floats, so exact equality can miss
# values that differ only by representation error; half a cent is well below
# the smallest price step.
TARGET_PRICE = 60.85
PRICE_TOLERANCE = 0.005

# Offers of one seller for one ASIN.
asin_seller_df = df[(df['asin'] == "B00605N1G4") & (df['sellerId'] == "ABGAQ3TO9PA1P")]

# Sort the DataFrame by 'time' to ensure the data is in chronological order
asin_seller_df = asin_seller_df.sort_values('time')

# True where the observed price equals the target (within tolerance).
# Missing prices compare as False and therefore break a run.
at_target_price = (asin_seller_df['price'] - TARGET_PRICE).abs() < PRICE_TOLERANCE

# Length of the current run / the longest run so far, with the hours of day
# of the observations that make up each run.
consec_counter = 0
max_consec = 0
consec_hours = []
max_consec_hours = []

# Walk the observations in chronological order. Iterating two columns with zip
# avoids building a Series per row as iterrows does.
for is_target, hour in zip(at_target_price, asin_seller_df['hourTime']):
    if is_target:
        # Extend the current run.
        consec_counter += 1
        consec_hours.append(hour)
        # New record: remember its length and a snapshot of its hours.
        if consec_counter > max_consec:
            max_consec = consec_counter
            max_consec_hours = consec_hours.copy()  # copy, not a reference to the live list
    else:
        # Any other price ends the run.
        consec_counter = 0
        consec_hours = []

print(f"The maximum number of consecutive times the price was set to {TARGET_PRICE} is: {max_consec}")
print(f"This occurred at the following hours: {max_consec_hours}")


In [None]:
# Offers of one seller for one ASIN. .copy() makes this an independent frame,
# so the columns added below are not written to a slice of df
# (avoids SettingWithCopyWarning).
asin_seller_df = df[(df['asin'] == "B00605N1G4") & (df['sellerId'] == "ABGAQ3TO9PA1P")].copy()

# Make sure 'time' is a datetime column (no-op if it already is).
asin_seller_df['time'] = pd.to_datetime(asin_seller_df['time'])

# Hour of day of each observation.
asin_seller_df['hour'] = asin_seller_df['time'].dt.hour

# Distinct (hour, price) pairs, ordered by price.
price_hour_combinations = asin_seller_df[['hour', 'price']].drop_duplicates().sort_values('price')

# Number of observations per (hour, price) pair.
# NOTE(review): this table is computed but not displayed in this cell.
price_hour_counts = asin_seller_df.groupby(['hour', 'price']).size().reset_index(name='counts')

print(price_hour_combinations)


In [None]:
# Restrict the offers to a single product.
asin_df = df.loc[df['asin'].eq("B00605N1G4")]

# Lowest and highest observed price per seller for this product.
grouped_df = (
    asin_df
    .groupby('sellerId')['price']
    .agg(['min', 'max'])
    .reset_index()
)

print(grouped_df)


## Einstellbar, wie viele Seller pro Plot

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Extract unique ASINs
unique_asins = df['asin'].unique()

# Sellers to exclude
exclude_sellers = [""]

# Only ASINs with exactly this many seller lines in the chart are plotted.
# Adjust this value to select ASINs with a different number of sellers.
SELLERS_PER_PLOT = 4

# groupby hands over each ASIN's rows in a single pass instead of re-scanning
# the whole frame with a boolean mask per ASIN; sort=False keeps the ASINs in
# order of first appearance, i.e. the same order as unique_asins.
for asin, asin_rows in df.groupby('asin', sort=False):
    # Drop excluded sellers and keep only the columns needed for the chart.
    is_kept_seller = ~asin_rows['sellerId'].isin(exclude_sellers)
    asin_df = asin_rows.loc[is_kept_seller, ['time', 'price', 'sellerId', 'is_algorithmic']].copy()

    # Every seller of this ASIN was excluded: nothing to pivot or plot.
    if asin_df.empty:
        continue

    # Make sure 'time' is a datetime column (no-op if it already is).
    asin_df['time'] = pd.to_datetime(asin_df['time'])

    # Average price per timestamp and seller, so each (time, seller) pair
    # occurs once and the pivot below cannot hit duplicate entries.
    grouped_df = (
        asin_df
        .groupby(['time', 'sellerId', 'is_algorithmic'])['price']
        .mean()
        .reset_index()
    )

    # Tag algorithmic sellers in the legend label (vectorized, no row-wise apply).
    seller_labels = grouped_df['sellerId'].astype(str)
    grouped_df['sellerId'] = seller_labels.mask(
        grouped_df['is_algorithmic'], seller_labels + " (Algorithmic)"
    )

    # Wide layout: one row per timestamp, one column per seller.
    pivot_df = grouped_df.pivot(index='time', columns='sellerId', values='price')

    # Skip ASINs that do not have the requested number of sellers.
    if len(pivot_df.columns) != SELLERS_PER_PLOT:
        continue

    # Print seller ids
    print("Seller IDs for ASIN ", asin, ": ", pivot_df.columns.tolist())

    # Plot the data
    fig, ax = plt.subplots(figsize=(15, 7))
    pivot_df.plot(kind='line', ax=ax)
    ax.set_xlabel('Time')
    ax.set_ylabel('Price')
    ax.set_title(f'Price History Across Sellers for Asin: {asin}')
    fig.tight_layout()

    # Move the legend outside the plot on the right
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # Save the plot to a PNG file in the working directory, including the
    # legend; an existing file with the same name is overwritten.
    fig.savefig(f'{asin}_plot.png', bbox_inches='tight')

    plt.show()
    # Release the figure; otherwise every plotted ASIN leaves an open figure in memory.
    plt.close(fig)

    print("This is the ASIN", asin)

     

In [None]:
# Print column names, dtypes, non-null counts and memory usage of df.
df.info()

In [None]:
# Restrict the offers to a single product.
asin_df = df.loc[df['asin'].eq("B0062VH4NM")]

# Lowest and highest observed price per seller for this product.
grouped_df = (
    asin_df
    .groupby('sellerId')['price']
    .agg(['min', 'max'])
    .reset_index()
)

print(grouped_df)


In [None]:

# Restrict the offers to a single product.
asin_df = df.loc[df['asin'].eq("B077T3QZ1H")]

# Lowest and highest observed price per seller for this product.
grouped_df = (
    asin_df
    .groupby('sellerId')['price']
    .agg(['min', 'max'])
    .reset_index()
)

print(grouped_df)
