## Replace Null Values with Mean Values

In [1]:
import pandas as pd
import numpy as np
# Load the cleaned intermediate dataset into the frame that the imputation cells modify in place.
df = pd.read_csv(filepath_or_buffer='../Grunddatein/Zwischendatein/CleanedDataComplete.csv')

### Seller Sterne

In [2]:
    # Flag rows whose seller name contains 'amazon' (case-insensitive substring match).
    is_amazon = df['sellerName'].str.contains('amazon', case=False)
    no_amazon_df = df[~is_amazon]

    # Mean star rating over the non-amazon rows only.
    average_sterne = no_amazon_df['seller_sterne'].mean()
    print(average_sterne)
    # Snap the mean to the nearest half star: double it, round to an integer, halve it again.
    rounded_sterne = round(average_sterne * 2) / 2
    print(rounded_sterne)

    # Impute missing ratings: non-amazon rows receive the snapped mean, amazon rows receive 5.0.
    # The null mask is taken once; the two fills target disjoint row sets, so neither affects the other.
    sterne_missing = df['seller_sterne'].isnull()
    df.loc[sterne_missing & ~is_amazon, 'seller_sterne'] = rounded_sterne
    df.loc[sterne_missing & is_amazon, 'seller_sterne'] = 5.0

    # Show the sorted distinct ratings that remain after the replacement.
    new_distinct_seller_sterne = np.sort(df['seller_sterne'].unique())
    print("Distinct values for seller_sterne after replacement:", [f"{value:.1f}" for value in new_distinct_seller_sterne])

4.3098022738892325
4.5
Distinct values for seller_sterne after replacement: ['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0']


### Seller Bewertung
**TODO:** Amazon braucht noch einen eigenen Ersatzwert für die Seller-Bewertung.

In [3]:
# Mean and standard deviation of 'sellerbewertung' over all rows, taken before any value is filled in.
rating_values = df['sellerbewertung']
mean_score = round(rating_values.mean(), 2)
sd_score = round(rating_values.std(), 2)

# First pass: fill missing values on rows whose seller name does not contain 'amazon'.
rating_missing = rating_values.isnull()
seller_is_amazon = df['sellerName'].str.contains('amazon', case=False)
df.loc[rating_missing & ~seller_is_amazon, 'sellerbewertung'] = mean_score

# Second pass: every NaN that is still left is filled with the same mean, then the column is
# rounded to whole numbers and cast to int.
# NOTE(review): this second fill also covers the amazon rows skipped by the first pass, so they
# end up with the global mean as well — confirm whether amazon should get its own replacement value.
df['sellerbewertung'] = df['sellerbewertung'].fillna(mean_score).round().astype(int)

In [4]:
# Preview the first five rows of the frame in its current state.
df.head(n=5)

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive,Fulfillment_type,date_diff
0,341683.0,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,4211.0,5.0,Samstag. 25. März,0.0,Amazon,0.0,True,13.0,f,FBA,3.0
1,341684.0,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,28.13,5.0,29. - 31. März,9.9,STILE IMMAGINE DIGITAL HD,0.0,True,2.0,f,FBM,8.0
2,341685.0,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,5.0,,0.09,Amazon,0.0,True,8.0,f,FBA,4.11
3,341686.0,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,36542.7,4.5,27. - 28. März,2.99,Musikhaus Kirstein GmbH,0.0,True,3.0,f,FBM,5.0
4,341687.0,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,5.0,Samstag. 25. März,0.0,Amazon,0.0,True,23.0,f,FBA,3.0


### FBA Delivery Price and Delivery Duration

In [5]:
# Restrict to offers whose fulfillment type is 'FBA'.
is_fba = df['Fulfillment_type'] == 'FBA'
sub_df = df[is_fba]

# Mean and standard deviation (rounded to two decimals) of price and date_diff within that subset,
# computed before any value is overwritten.
fba_price = sub_df['lieferpreis']
fba_date_diff = sub_df['date_diff']
mean_price = round(fba_price.mean(), 2)
standard_deviation = round(fba_price.std(), 2)
mean_delivery_time = round(fba_date_diff.mean(), 2)
delivery_time_sd = round(fba_date_diff.std(), 2)

print(f"Mean Delivery Price: {mean_price}")
print(f"Standard Deviation: {standard_deviation}")

# FBA rows with a missing 'date_diff' get both columns set to the FBA means.
# NOTE(review): 'lieferpreis' is overwritten based on 'date_diff' being null, not on 'lieferpreis'
# itself being null — confirm this is intended and does not replace known prices.
fba_needs_fill = df['date_diff'].isnull() & is_fba
df.loc[fba_needs_fill, 'lieferpreis'] = mean_price
df.loc[fba_needs_fill, 'date_diff'] = mean_delivery_time

Mean Delivery Price: 0.08
Standard Deviation: 1.04


### FBM Delivery Price and Delivery Duration

In [6]:
# Restrict to offers whose fulfillment type is 'FBM'.
is_fbm = df['Fulfillment_type'] == 'FBM'
sub_df = df[is_fbm]

# Mean and standard deviation (rounded to two decimals) of price and date_diff within that subset,
# computed before any value is overwritten.
fbm_price = sub_df['lieferpreis']
fbm_date_diff = sub_df['date_diff']
mean_price = round(fbm_price.mean(), 2)
standard_deviation = round(fbm_price.std(), 2)
mean_delivery_time = round(fbm_date_diff.mean(), 2)
delivery_time_sd = round(fbm_date_diff.std(), 2)

print(f"Mean Delivery Price: {mean_price}")
print(f"Standard Deviation: {standard_deviation}")

# FBM rows with a missing 'date_diff' get both columns set to the FBM means.
# NOTE(review): 'lieferpreis' is overwritten based on 'date_diff' being null, not on 'lieferpreis'
# itself being null — confirm this is intended and does not replace known prices.
fbm_needs_fill = df['date_diff'].isnull() & is_fbm
df.loc[fbm_needs_fill, 'lieferpreis'] = mean_price
df.loc[fbm_needs_fill, 'date_diff'] = mean_delivery_time

Mean Delivery Price: 9.79
Standard Deviation: 13.56


In [7]:
# Write the imputed frame to its own CSV file; the row index is not written out.
df.to_csv(path_or_buf='../Grunddatein/Zwischendatein/CleanedDataCompleteNoNulls.csv', index=False)

In [8]:
# Tally how many missing values each column of the frame still holds.
null_values_count_per_column = df.isna().sum()

# Report the header line and the per-column counts, one below the other.
print("Number of null values per column in df:", null_values_count_per_column, sep="\n")

Number of null values per column in df:
id                        0
asin                      0
price                     0
currency                  0
time                      0
crawlTime                 0
condition                 0
sellerName                0
sellerId                  0
sellerbewertung     1585854
seller_sterne             0
lieferdatum          208608
lieferpreis               0
lieferung_durch           0
ranking                   0
buyBoxWinner              0
numberOfSellers           0
trigByReactive            0
Fulfillment_type          0
date_diff                 0
dtype: int64
