In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../Grunddatein/Zwischendatein/cleaned_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109215 entries, 0 to 109214
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   asin                  109215 non-null  object 
 1   time                  109215 non-null  object 
 2   id                    109215 non-null  float64
 3   price                 109215 non-null  float64
 4   currency              109215 non-null  object 
 5   crawlTime             109215 non-null  object 
 6   condition             109215 non-null  object 
 7   sellerName            109215 non-null  object 
 8   sellerId              109215 non-null  object 
 9   sellerbewertung       81457 non-null   float64
 10  seller_sterne         109215 non-null  float64
 11  lieferdatum           105389 non-null  object 
 12  lieferpreis           105389 non-null  float64
 13  lieferung_durch       109215 non-null  object 
 14  ranking               109215 non-null  float64
 15  

In [2]:
## Print Basic Information about Dataframe.
#TimeFrame
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
# Find the max and min values for the 'time' column
max_time = df['time'].max()
min_time = df['time'].min()

print("Max value for the 'time' column:", max_time)
print("Min value for the 'time' column:", min_time)

distinct_values_verkäufer = df['sellerName'].nunique()
print(f"Es gibt insgesamt {distinct_values_verkäufer} verschiedene Verkäufer")

Max value for the 'time' column: 2023-03-23 04:00:00
Min value for the 'time' column: 2023-03-22 16:00:00
Es gibt insgesamt 1838 verschiedene Verkäufer


## Generating List of asin Seller/Pairs and the Corresponding Price Changes/Observation Time

In [3]:
import pandas as pd
from datetime import timedelta

# Assuming your DataFrame is called df
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")

# First, sort the DataFrame by 'asin', 'sellerName', 'condition', 'time', and 'price'
df = df.sort_values(by=['asin', 'sellerName','condition', 'time','price'])

# Reset the index after sorting
df.reset_index(drop=True, inplace=True)

# Compute the price changes by checking if the price in the current row is different from the previous row,
# and if the sellerName is the same as in the previous row and if the condition is the same as in the previous row
# and if the time difference is exactly one hour and if there is no offer one hour earlier with the same condition and the same price from the same seller
df['price_change'] = (
    (df['price'] != df['price'].shift(1)) &
    (df['condition'] == df['condition'].shift(1)) &
    (df['sellerName'] == df['sellerName'].shift(1)) &
    (df['asin'] == df['asin'].shift(1)) &
    ((df['time'] - df['time'].shift(1)) == timedelta(hours=1)) &
    (~(
        ((df['condition'] == df['condition'].shift(1)) & (df['price'] == df['price'].shift(1)) & ((df['time'] - df['time'].shift(1)) == timedelta(hours=1)) & (df['sellerName'] == df['sellerName'].shift(1))) |
        ((df['condition'] == df['condition'].shift(2)) & (df['price'] == df['price'].shift(2)) & ((df['time'] - df['time'].shift(2)) == timedelta(hours=1)) & (df['sellerName'] == df['sellerName'].shift(2)))
    ))
).astype(int)

# Calculate the observation time for each seller-asin combination
observation_time = df.groupby(['asin', 'sellerName'])['time'].agg(['min', 'max']).reset_index()
observation_time['observation_time'] = observation_time['max'] - observation_time['min']

# Now, group by 'asin' and 'sellerName' and sum the 'price_change' column to get the number of price changes
price_changes = df.groupby(['asin', 'sellerName'])['price_change'].sum().reset_index()

# Merge the price_changes and observation_time DataFrames
result_df = pd.merge(price_changes, observation_time[['asin', 'sellerName', 'observation_time']], on=['asin', 'sellerName'])

# Convert the result to a list of tuples
result = list(result_df.itertuples(index=False, name=None))

# Print the result
for x in result:
    print(x)


('B00000JD6K', 'Kidia', 2, Timedelta('0 days 12:00:00'))
('B00000JD6K', 'STILE IMMAGINE DIGITAL HD', 0, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'Arlambo', 6, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'Cernovia', 0, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'Crystal4ever', 1, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'DASTRO ® --- Einfach. Genial. Günstig.', 0, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'GREINER GROSSHANDEL', 1, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'Kaleido.Shop GmbH', 0, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'Technikprimus', 0, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'amazon', 1, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'e-joker', 2, Timedelta('0 days 12:00:00'))
('B00005KIRS', 'inandout', 1, Timedelta('0 days 12:00:00'))
('B00008K61M', 'ABC-Schnäppchenmarkt GmbH', 0, Timedelta('0 days 12:00:00'))
('B00008K61M', 'ARDEBO', 0, Timedelta('0 days 12:00:00'))
('B00008K61M', 'CW-SHOPPING', 1, Timedelta('0 days 12:00:00'))
('B00008K61M'

('B004PVUL7C', 'amazon', 2, Timedelta('0 days 12:00:00'))
('B004PVUL7C', 'get goods', 0, Timedelta('0 days 12:00:00'))
('B004PVUL7C', 'profi-electro', 0, Timedelta('0 days 00:00:00'))
('B004PVUL7C', 'spiessxxl', 0, Timedelta('0 days 00:00:00'))
('B004QEZJWU', 'BWBahn', 0, Timedelta('0 days 00:00:00'))
('B004QEZJWU', 'Happy Home Italia', 0, Timedelta('0 days 00:00:00'))
('B004QEZJWU', 'Happy Home Srl', 1, Timedelta('0 days 12:00:00'))
('B004QEZJWU', 'Modellbau Fischer', 0, Timedelta('0 days 12:00:00'))
('B004QEZJWU', 'Railroad 2000', 0, Timedelta('0 days 12:00:00'))
('B004QEZJWU', 'SMDV', 0, Timedelta('0 days 00:00:00'))
('B004QEZJWU', 'amazon', 1, Timedelta('0 days 12:00:00'))
('B004QEZJWU', 'get goods', 0, Timedelta('0 days 12:00:00'))
('B004RUB648', 'MusicScout', 8, Timedelta('0 days 12:00:00'))
('B004RUB648', 'Musikhaus-Korn', 0, Timedelta('0 days 12:00:00'))
('B004RUB648', 'amazon', 0, Timedelta('0 days 12:00:00'))
('B004THFJ3S', 'Amazon Warehouse ', 0, Timedelta('0 days 12:00:00')

## Actual Calculation if a seller is algorithmic

In [4]:
# Convert the result list to a DataFrame
result_df = pd.DataFrame(result, columns=['asin', 'sellerName', 'num_price_changes', 'observation_time'])

# Calculate the observation time in hours
result_df['observation_time'] = result_df['observation_time'].apply(lambda x: x.total_seconds() / 3600)

# Calculate the price changes per hour
result_df['price_changes_per_hour'] = result_df['num_price_changes'] / result_df['observation_time']

# Replace infinite values with NaN
result_df['price_changes_per_hour'] = result_df['price_changes_per_hour'].replace([np.inf, -np.inf], np.nan)

# Calculate the median of non-infinite and non-missing values
median_price_changes = result_df['price_changes_per_hour'].dropna().median()

# Replace missing and infinite values with the median
result_df['price_changes_per_hour'] = result_df['price_changes_per_hour'].fillna(median_price_changes)

# Calculate the mean and standard deviation
mean_price_changes = result_df['price_changes_per_hour'].mean()
std_price_changes = result_df['price_changes_per_hour'].std()

print("Mean price changes per hour:", mean_price_changes)
print("Standard deviation of price changes per hour:", std_price_changes)

# Set a threshold for classifying sellers as algorithmic
# For example, we can set the threshold as mean + 2 * std, which will classify approximately 2.5% of sellers as algorithmic
threshold = mean_price_changes + 1 * std_price_changes
print("Threshold:", threshold)

# Add a column to the result DataFrame indicating whether the seller is algorithmic
result_df['is_algorithmic'] = result_df['price_changes_per_hour'] > threshold

# Sort the algorithmic seller pairs by price changes per hour
algorithmic_seller_pairs = result_df[result_df['is_algorithmic']].sort_values(by='price_changes_per_hour', ascending=False)

# Get the names of algorithmic sellers
algorithmic_seller_names = algorithmic_seller_pairs['sellerName'].unique()

# Filter the result DataFrame to include only non-algorithmic sellers
non_algorithmic_seller_pairs = result_df[~result_df['sellerName'].isin(algorithmic_seller_names)]

# Combine the algorithmic and non-algorithmic seller pairs
final_result = pd.concat([algorithmic_seller_pairs, non_algorithmic_seller_pairs], ignore_index=True)


Mean price changes per hour: 0.06806257014590347
Standard deviation of price changes per hour: 0.13292299627023416
Threshold: 0.20098556641613763


In [5]:
# Filter the DataFrame to include only the algorithmic sellers
algorithmic_sellers = final_result[final_result['is_algorithmic']]

# Get the unique seller names
unique_algorithmic_sellers = algorithmic_sellers['sellerName'].unique()

# Count the number of unique sellers by seller name
num_algorithmic_sellers = len(unique_algorithmic_sellers)

# Print the result
print("Number of unique algorithmic sellers:", num_algorithmic_sellers)
print("Unique algorithmic seller names:")
for seller_name in unique_algorithmic_sellers:
    print(seller_name)


Number of unique algorithmic sellers: 351
Unique algorithmic seller names:
Amazon Warehouse 
Stabilo-Fachmarkt
Happy Home Italia
trendmile
Spielzeugwelten (alle Preise inkl. gesetzlicher MwSt. und zzgl.Versandkosten - Impressum & Widerrufsbelehrung in der Verkäuferinfo)
Cstore.
Morelenet
BestArtikel GmbH
amazon
werkzeugbilligercom
Happy Home Srl
Solution 4YOU
Electronic_System
ACE-Deutschland
YESEATIS
Bauschmeisser
To B To C
Music and More Store
MusicScout
Home Automation EU
baumarktdiscount
GETIC
Stortle
REDIALING
Spiele-und-Abenteuer
LEICKE - Leipzig
e-joker
nox divendo
Computerhandlung
ZOOM ICI
ai-Trading24
BULUTTECH
Expert AGD
Smart7 Europe
Trinity ecom
myHobby24 | Ihr Freizeitspezialist für Deutschland
nrsolutions
Kommerzpunkt
LSE Commerce
HOH GmbH IHR FACHGESCHÄFT seit 1894 - Gratis Versand ab 29 € Deutschland
cnyolee
llccenysp
AS-Discount
Rad & Sportkönig
BOSSPRODUCTS
InstaSpares
CHENGCHUANGA
okluge
World Trader's
GetMarket
Toys for Fun GmbH
ivy-electro
nierlecom
Go De
computeru

In [6]:
# Filter the DataFrame to include only the non-algorithmic sellers
non_algorithmic_sellers = final_result[~final_result['is_algorithmic']]

# Get the unique seller names
unique_non_algorithmic_sellers = non_algorithmic_sellers['sellerName'].unique()

# Count the number of unique sellers by seller name
num_non_algorithmic_sellers = len(unique_non_algorithmic_sellers)
# Print the result
print("Number of unique non algorithmic sellers:", num_non_algorithmic_sellers)
# Print the result
print("Number of unique algorithmic sellers:", num_algorithmic_sellers,"of a total of ",distinct_values_verkäufer)

Number of unique non algorithmic sellers: 1487
Number of unique algorithmic sellers: 351 of a total of  1838


# Calucalte Algorithmic Sellers by Total Price Changes over All Products

In [7]:
import pandas as pd

# Assuming your DataFrame is called df

# Convert the 'time' column to a 'datetime' type
df['time'] = pd.to_datetime(df['time'])

# First, sort the DataFrame by 'asin', 'sellerName', and 'time'
df = df.sort_values(by=['asin', 'sellerName', 'time'])

# Reset the index after sorting
df.reset_index(drop=True, inplace=True)

# Compute the price changes by checking if the price in the current row is different from the previous row, and if the sellerName is the same as in the previous row
df['price_change'] = (df['price'] != df['price'].shift(1)) & (df['sellerName'] == df['sellerName'].shift(1)) & (df['asin'] == df['asin'].shift(1))

# Now, group by 'sellerName' and sum the 'price_change' column to get the number of price changes
seller_price_changes = df.groupby('sellerName')['price_change'].sum().reset_index()

# Calculate the number of different products for each seller
unique_products = df.groupby('sellerName')['asin'].nunique().reset_index()

# Merge the seller_price_changes and unique_products DataFrames
seller_summary = pd.merge(seller_price_changes, unique_products, on='sellerName')

# Sort the results by the number of price changes in descending order
seller_summary = seller_summary.sort_values(by='price_change', ascending=False)

# Convert the result to a list of tuples
result_by_seller = [(row.sellerName, row.price_change, row.asin) for row in seller_summary.itertuples(index=False)]

# Print the result
for x in result_by_seller:
    print(x)


('Amazon Warehouse ', 4680, 316)
('amazon', 1429, 590)
('Happy Home Srl', 1129, 120)
('computeruniverse', 352, 112)
('e-joker', 299, 110)
('GREINER GROSSHANDEL', 292, 32)
('Solution 4YOU', 266, 48)
('okluge', 250, 66)
('MusicScout', 231, 34)
('AS-Discount', 190, 123)
('Musikhaus Kirstein GmbH', 170, 30)
('Stortle', 169, 61)
('To B To C', 168, 14)
('TechPoint1111', 154, 29)
('ProComponentes', 146, 47)
('CW-SHOPPING', 139, 39)
('nierlecom', 133, 35)
('GETIC', 119, 6)
('ITCHECK24  --  alle Preise inkl. MwSt. - Widerrufsbelehrung + AGB unter Verkäuferhilfe', 112, 37)
('Casa Haushalt', 111, 35)
('Expert AGD', 107, 37)
('SIA Solution', 106, 26)
('MIOGA Warenhandel & Service GmbH', 100, 88)
('Masskas', 92, 29)
('inandout', 91, 56)
('nrsolutions', 87, 63)
('GetMarket', 82, 39)
('Happy Home Italia', 79, 66)
('nox divendo', 78, 24)
('tiendainformatica-de', 75, 28)
('Fexyshop', 71, 22)
('BIBCASA', 70, 4)
('Power & Handel Vertriebs-GmbH', 68, 25)
('Computerhandlung', 65, 37)
('Spiele-und-Abenteuer

## Calculation comes Now

In [8]:
import pandas as pd
import numpy as np

# Assuming result_by_seller is the list containing sellerName, price changes, and unique products

# Convert the list to a DataFrame
seller_df = pd.DataFrame(result_by_seller, columns=['sellerName', 'price_changes', 'unique_products'])

# Calculate the average number of price changes per product for each seller
seller_df['price_changes_per_product'] = seller_df['price_changes'] / seller_df['unique_products']

# Calculate the mean and standard deviation of the price changes per product
mean_changes_per_product = seller_df['price_changes_per_product'].mean()
std_changes_per_product = seller_df['price_changes_per_product'].std()

# Calculate the z-scores for each seller
seller_df['z_score'] = (seller_df['price_changes_per_product'] - mean_changes_per_product) / std_changes_per_product

# Define a manually adjustable threshold for the z-score
z_score_threshold = 1

# Classify sellers as algorithmic if their z-score is above the threshold
seller_df['is_algorithmic'] = seller_df['z_score'] > z_score_threshold

# Print the z-score threshold
print('Z-score Threshold:', z_score_threshold)

# Print the DataFrame with the algorithmic sellers
algorithmic_sellers_grouped = seller_df[seller_df['is_algorithmic']]


Z-score Threshold: 1


In [9]:
import pandas as pd
import numpy as np

# Assuming result_by_seller is the list containing sellerName, price changes, and unique products

# Convert the list to a DataFrame
seller_df = pd.DataFrame(result_by_seller, columns=['sellerName', 'price_changes', 'unique_products'])

# Calculate the percentiles for price_changes and unique_products
seller_df['price_changes_percentile'] = seller_df['price_changes'].rank(pct=True)
seller_df['unique_products_percentile'] = seller_df['unique_products'].rank(pct=True)

# Calculate the z-scores for the percentiles
seller_df['z_score_price_changes'] = (seller_df['price_changes_percentile'] - seller_df['price_changes_percentile'].mean()) / seller_df['price_changes_percentile'].std()
seller_df['z_score_unique_products'] = (seller_df['unique_products_percentile'] - seller_df['unique_products_percentile'].mean()) / seller_df['unique_products_percentile'].std()

# Define weights for the z-scores of price_changes and unique_products
weight_price_changes = 1.5
weight_unique_products = 0.5

# Combine the z-scores to create a combined score
seller_df['combined_z_score'] = (weight_price_changes * seller_df['z_score_price_changes']) + (weight_unique_products * seller_df['z_score_unique_products'])

# Define a manually adjustable threshold for the combined z-score
combined_z_score_threshold = 2.5  # Increase the threshold to reduce the number of algorithmic sellers

# Classify sellers as algorithmic if their combined z-score is above the threshold
seller_df['is_algorithmic'] = seller_df['combined_z_score'] > combined_z_score_threshold

# Print the combined z-score threshold
print('Combined Z-score Threshold:', combined_z_score_threshold)

# Print the DataFrame with the algorithmic sellers
algorithmic_sellers_grouped = seller_df[seller_df['is_algorithmic']]


Combined Z-score Threshold: 2.5


## Chen Method with simple numberic Treshhold

In [10]:
algo_seller_chen = []

for tup in result:
    number_of_price_changes = tup[2]
    sellerName = tup[1]
    if number_of_price_changes > 20:
        #print(tup)
        algo_seller_chen.append(sellerName)

# Remove duplicates by converting the list to a set and then back to a list
algo_seller_chen = list(set(algo_seller_chen))
# Transform the list into a DataFrame
df_algo_seller_chen = pd.DataFrame(algo_seller_chen, columns=['sellerName'])

print("Length of algo_seller_chen list (without duplicates):", len(df_algo_seller_chen))

Length of algo_seller_chen list (without duplicates): 0


## Comparing how many Algorithmic Sellers both Ways have in Common

In [11]:
import pandas as pd

# Remove duplicate sellerName entries from algorithmic_sellers
unique_algorithmic_sellers = algorithmic_sellers.drop_duplicates(subset='sellerName')

# Calculate the length of the unique_algorithmic_sellers DataFrame and the algorithmic_sellers_grouped DataFrame
len_unique_algorithmic_sellers = len(unique_algorithmic_sellers)
len_algorithmic_sellers_grouped = len(algorithmic_sellers_grouped)
len_algorithmic_seller_chen = len(df_algo_seller_chen)

# Merge the DataFrames using an inner join, keeping only the common rows
common_sellers_df = unique_algorithmic_sellers.merge(algorithmic_sellers_grouped, on='sellerName', how='inner')
common_sellers_chen_unique = unique_algorithmic_sellers.merge(df_algo_seller_chen, on='sellerName', how='inner')

# Calculate the percentage of common sellers
percentage_common_sellers = (len(common_sellers_df) / min(len_unique_algorithmic_sellers, len_algorithmic_sellers_grouped)) * 100
percentage_common_seller_chen_unique = (len(common_sellers_chen_unique) / min(len_unique_algorithmic_sellers, len_algorithmic_seller_chen)) * 100
percentage_on_all_sellers = (len(common_sellers_chen_unique) / distinct_values_verkäufer) *100
print(f"Total Number of Sellers:", distinct_values_verkäufer)


# Print the lengths of the DataFrames and the percentage of common sellers
print("1: Length of unique_algorithmic_sellers:", len_unique_algorithmic_sellers)
print("2: Length of algorithmic_sellers_grouped:", len_algorithmic_sellers_grouped)
print("3: Length of algortihmic_seller_chen:", len(algo_seller_chen))
print("Length of Sellers in Common between 1 and 2:", len(common_sellers_df))
print("Percentage of common sellers between 1 and 2:", percentage_common_sellers)
print("Percentage of common sellers 1 and 3:", percentage_common_seller_chen_unique)
print("Percentage on all Sellers: ", percentage_on_all_sellers )


# Print the common SellerNames as a list
common_sellers_list = common_sellers_chen_unique['sellerName'].tolist()
#print("\nCommon SellerNames:")
#print(common_sellers_list)

# Convert the list to a pandas DataFrame
df_algorithmic_seller_names = pd.DataFrame(common_sellers_list, columns=['AlgorithmicSellerNames'])
df_algorithmic_seller_names.to_csv('df_algorithmic_seller_names.csv', index=False)

ZeroDivisionError: division by zero

## Sellers only present in the first version or the second version

In [None]:
import pandas as pd

# Merge the DataFrames using an outer join, keeping the origin information in the _merge column
merged_df = algorithmic_sellers.merge(algorithmic_sellers_grouped, on='sellerName', how='outer', indicator=True)

# Filter the merged DataFrame to find sellers unique to each DataFrame
unique_sellers_algorithmic_sellers = merged_df[merged_df['_merge'] == 'left_only']
unique_sellers_algorithmic_sellers_grouped = merged_df[merged_df['_merge'] == 'right_only']

# Convert the filtered Series to lists without index
unique_sellers_list_algorithmic_sellers = unique_sellers_algorithmic_sellers['sellerName'].tolist()
unique_sellers_list_algorithmic_sellers_grouped = unique_sellers_algorithmic_sellers_grouped['sellerName'].tolist()

# Print the sellers unique to each DataFrame as lists without index
print("Sellers only present in algorithmic_sellers:")
#print(unique_sellers_list_algorithmic_sellers)
print(len(unique_sellers_list_algorithmic_sellers))

print("\nSellers only present in algorithmic_sellers_grouped:")
#print(unique_sellers_list_algorithmic_sellers_grouped)
print(len(unique_sellers_algorithmic_sellers_grouped))