In [49]:
import pandas as pd
import numpy as np


df = pd.read_csv('../Grunddatein/OffersData.csv')
name_for_save_file = '../Grunddatein/Zwischendatein/CleanedDataComplete.csv'

asins2 = pd.read_csv('../Grunddatein/ASINS.csv')
skip_cell = True

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6603268 entries, 0 to 6603267
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   asin             object 
 2   price            float64
 3   currency         object 
 4   time             object 
 5   crawlTime        object 
 6   condition        object 
 7   sellerName       object 
 8   sellerId         object 
 9   sellerbewertung  object 
 10  seller_sterne    object 
 11  lieferdatum      object 
 12  lieferpreis      object 
 13  lieferung_durch  object 
 14  ranking          int64  
 15  buyBoxWinner     object 
 16  numberOfSellers  int64  
 17  trigByReactive   object 
dtypes: float64(1), int64(3), object(14)
memory usage: 906.8+ MB


In [51]:
print("Length of the DataFrame:", df.shape[0])
num_true = (df['buyBoxWinner'] == True).sum()
print(f"The Number of Offers winning the BuyBox is {num_true}")
distinct_asins = df['asin'].unique()
asins_complete = asins2['id'].unique()
print("Number of distinct asins:", len(distinct_asins))
set_distinct_asins = set(distinct_asins)
set_asins_complete = set(asins_complete)

asins_not_in_both = set_distinct_asins.symmetric_difference(set_asins_complete)
print("ASINs not present in both:", asins_not_in_both)

Length of the DataFrame: 6603268
The Number of Offers winning the BuyBox is 0
Number of distinct asins: 995
ASINs not present in both: {'B0B2RRZLY6', 'B09MY48N93', 'B07H4FRQ4T', 'B09L6247P8', 'B09W8GB7KY'}


## Delete ASIN with Missing Values and Add Missing Values to the two Offers

In [55]:
df = df[~(df['asin'] == 'B0743BR42Y')]
row_index = df[df['id'] == 1637198].index[0]
# Update the 'price' and 'currency' for the row with 'id' equal to 1637198 with the price from the hour before
df.loc[row_index, 'price'] = 152.44
df.loc[row_index, 'currency'] = '€'
# Find the index of the row with 'id' equal to 2486948
row_index = df[df['id'] == 2486948].index[0]
# Update the 'price' and 'currency' for the row with 'id' equal to 2486948 with the price from the hour before
df.loc[row_index, 'price'] = 185.32
df.loc[row_index, 'currency'] = '€'



## Transform BuyBoxWinner Column to Boolean Format

In [56]:
df['buyBoxWinner'] = df['buyBoxWinner'].apply(lambda x: True if x == 't' else False)

## Transform Time Column to Datetime Format

In [57]:
# convert the 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'], format="%Y-%m-%d-%H")
df.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,(4211 Bewertungen) 100 % positiv in...,5 von 5,"Samstag, 25. März",GRATIS,Amazon,0,False,13,f
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,(29 Bewertungen) 97 % positiv über ...,5 von 5,29. - 31. März,"9,90 €",STILE IMMAGINE DIGITAL HD,0,False,2,f
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,False,8,f
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,(38466 Bewertungen) 95 % positiv in...,4.5 von 5,27. - 28. März,"2,99 €",Musikhaus Kirstein GmbH,0,False,3,f
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,"Samstag, 25. März",GRATIS,Amazon,0,False,23,f


## Transforming/Adding Missing Rows/Times
Missing Rows are "Replaced" by the rows for the same asin one hour before
Search For Asins with an Hour Gap

In [58]:
if not skip_cell:
    # create a MultiIndex using columns "asin", "time", and "id"
    df.set_index(['asin', 'time'], inplace=True)

    # sort the DataFrame by the MultiIndex
    df.sort_index(inplace=True)

    # Initialize an empty list called asins_and_gaps
    asins_and_gaps = []

    def check_one_hour_gap(group):
        group = group.reset_index()  # reset index before indexing with missing_gap Series
        time_diff = group['time'].diff()
        one_hour_gap = pd.Timedelta(hours=1)
        missing_gap = time_diff > one_hour_gap

        if missing_gap.any():
            gap_start_time = group['time'][missing_gap].iloc[0] - one_hour_gap
            missing_hour = gap_start_time.strftime("%Y-%m-%d %H")
            asin = group["asin"].iloc[0]
            print(f'Missing one-hour gap in ASIN {asin} at {missing_hour}')
            print("-------------------------")
            print("\n")

            # Add the ASIN and the missing hour to the asins_and_gaps list
            asins_and_gaps.append((asin, missing_hour))

    # apply custom function to each group
    df.groupby('asin').apply(check_one_hour_gap)

    #print("ASINs and gaps:")
    #print(asins_and_gaps)
    #asins_and_gaps
    distinct_asins = {tup[0] for tup in asins_and_gaps}
    print(f"For {len(distinct_asins)} there is at least one time Gap")
    print(f"In Total there are {len(asins_and_gaps)} time Gaps")

### Find Rows from one hour before and insert them into missing_data_df

In [59]:
if not skip_cell:
    # Assuming you already have the DataFrame df and the asins_and_gaps list

    # Initialize an empty DataFrame called missing_data_df
    missing_data_df = pd.DataFrame()

    for asin, gap_time_str in asins_and_gaps:
        # Convert the gap time string to a datetime object
        gap_time = pd.to_datetime(gap_time_str, format='%Y-%m-%d %H')

        matching_rows = pd.DataFrame()
        gap_found = 0

        # Search for rows up to 3 hours earlier
        for hours in range(1, 4):
            time_earlier = gap_time - pd.Timedelta(hours=hours)
            matching_rows = df.loc[(df.index.get_level_values('asin') == asin) & (df.index.get_level_values('time') == time_earlier)]

            if not matching_rows.empty:
                gap_found = -hours
                break

        # If matching rows are found
        if not matching_rows.empty:
            # Make a copy of matching_rows before modifying it
            matching_rows = matching_rows.copy()

            # Update the time column value for the copied rows to the appropriate gap
            matching_rows.reset_index(inplace=True)
            matching_rows[matching_rows.columns[matching_rows.columns.get_loc('time')]] = matching_rows['time'] - pd.Timedelta(hours=gap_found)

            # Add a new column named "time_gap" to indicate the gap at which the rows were found
            matching_rows['time_gap'] = gap_found

            # Append the matching rows to the missing_data_df DataFrame
            missing_data_df = pd.concat([missing_data_df, matching_rows])

    # Reset the index of the missing_data_df DataFrame
    missing_data_df.reset_index(drop=True, inplace=True)
    #missing_data_df.info()
    distinct_values = missing_data_df['asin'].nunique()
    distinct_values
    print(f"For {distinct_values} there could be found rows from 1/2/3 hourls earlier to replace the time gap")

## Concat Missing Dataframe to Normal Dataframe

In [60]:
if not skip_cell:
    df['time_gap'] = 0
    df['copyed_cause_missing'] = False
    df = df.reset_index()
    missing_data_df['copyed_cause_missing'] = True
    old_size = df.shape[0]
    df = pd.concat([df, missing_data_df])
    new_size = df.shape[0]
    del missing_data_df
    print(f"The Old Size was {old_size} and the New Size is {new_size}. This Means {new_size-old_size} have been added")

In [61]:
df.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,(4211 Bewertungen) 100 % positiv in...,5 von 5,"Samstag, 25. März",GRATIS,Amazon,0,False,13,f
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,(29 Bewertungen) 97 % positiv über ...,5 von 5,29. - 31. März,"9,90 €",STILE IMMAGINE DIGITAL HD,0,False,2,f
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,False,8,f
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,(38466 Bewertungen) 95 % positiv in...,4.5 von 5,27. - 28. März,"2,99 €",Musikhaus Kirstein GmbH,0,False,3,f
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,"Samstag, 25. März",GRATIS,Amazon,0,False,23,f


In [62]:
# Count the null values in each column of the DataFrame
null_values_count_per_column = df.isnull().sum()

# Print the number of null values for each column
print("Number of null values per column in df:")
print(null_values_count_per_column)

Number of null values per column in df:
id                       0
asin                     0
price                    0
currency                 0
time                     0
crawlTime                0
condition                0
sellerName               0
sellerId                 0
sellerbewertung    1587251
seller_sterne      1628234
lieferdatum         208608
lieferpreis         208608
lieferung_durch          0
ranking                  0
buyBoxWinner             0
numberOfSellers          0
trigByReactive           0
dtype: int64


## Transforming Dataframe Seller Sterne

In [63]:
# Get distinct values for the 'seller_sterne' column
distinct_seller_sterne = df['seller_sterne'].unique()

# Print the distinct values
print("Distinct values for seller_sterne:", distinct_seller_sterne)

Distinct values for seller_sterne: ['5 von 5' nan '4.5 von 5' '4 von 5' '3.5 von 5' '3 von 5' '2 von 5'
 '1 von 5' '2.5 von 5' '1.5 von 5']


In [64]:
# Get distinct values for the 'seller_sterne' column
distinct_seller_sterne = df['seller_sterne'].unique()
print("Distinct values for seller_sterne:", distinct_seller_sterne)

# Define a lambda function to convert the string to a float or return None for null values
str_to_float_or_none = lambda x: float(x.split()[0].replace(",", ".")) if isinstance(x, str) and len(x.split()) > 0 else None

# Apply the lambda function to the 'seller_sterne' column to replace the string values with their corresponding float values
df['seller_sterne'] = df['seller_sterne'].apply(str_to_float_or_none)

# Check the distinct values for 'seller_sterne' after the replacement
new_distinct_seller_sterne = df['seller_sterne'].unique()
new_distinct_seller_sterne = np.sort(new_distinct_seller_sterne)
print("Distinct values for seller_sterne after replacement:", [f"{value:.1f}" for value in new_distinct_seller_sterne])

Distinct values for seller_sterne: ['5 von 5' nan '4.5 von 5' '4 von 5' '3.5 von 5' '3 von 5' '2 von 5'
 '1 von 5' '2.5 von 5' '1.5 von 5']
Distinct values for seller_sterne after replacement: ['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0', 'nan']


In [66]:
# Count the null values in each column of the DataFrame
null_values_count_per_column = df.isnull().sum()

# Print the number of null values for each column
print("Number of null values per column in df:")
print(null_values_count_per_column)

Number of null values per column in df:
id                       0
asin                     0
price                    0
currency                 0
time                     0
crawlTime                0
condition                0
sellerName               0
sellerId                 0
sellerbewertung    1587251
seller_sterne      1628234
lieferdatum         208608
lieferpreis         208608
lieferung_durch          0
ranking                  0
buyBoxWinner             0
numberOfSellers          0
trigByReactive           0
dtype: int64


## Transformin Dataframe Lieferpreis


In [67]:
df['lieferpreis'] = df['lieferpreis'].replace(['GRATIS', 'FREE'], 0.0)
##Replace Euro Signs
df['lieferpreis'] = df['lieferpreis'].replace('€', '', regex=True)
df = df.replace(',', '.', regex=True)
df['lieferpreis'] = df['lieferpreis'].astype(float, errors='ignore')
num_missing = df['lieferpreis'].isna().sum()
print(f"The column 'lieferpreis' has {num_missing} missing values.")
##Pls be aware that the number doesnt explicitly match tabe a.2. as some rows/one asin was deleted

The column 'lieferpreis' has 208608 missing values.


## Transform Dataframe Sellerbewertung


In [68]:
# Split values into two columns based on first space
df[['vorderer_Teil', 'hinterer_Teil']] = df['sellerbewertung'].str.split(')', 1, expand=True)
df['hinterer_Teil'] = df['hinterer_Teil'].str.replace('%.*', '', regex=True)
# Replace NaN values in vorderer_Teil column with 0
df['vorderer_Teil'] = df['vorderer_Teil'].fillna(0)
df['vorderer_Teil'] = df['vorderer_Teil'].str.extract('(\d+)').astype(float)
df['hinterer_Teil'] = df['hinterer_Teil'].astype(float) / 100.0
df['sellerbewertung_ausgerechnet'] = df['vorderer_Teil'] * df['hinterer_Teil']
df['sellerbewertung'] = df['sellerbewertung_ausgerechnet']

# Drop columns that were created in the process
df = df.drop(['vorderer_Teil', 'hinterer_Teil', 'sellerbewertung_ausgerechnet'], axis=1)

  df[['vorderer_Teil', 'hinterer_Teil']] = df['sellerbewertung'].str.split(')', 1, expand=True)


In [70]:
# Count the null values in each column of the DataFrame
null_values_count_per_column = df.isnull().sum()

# Print the number of null values for each column
print("Number of null values per column in df:")
print(null_values_count_per_column)

Number of null values per column in df:
id                       0
asin                     0
price                    0
currency                 0
time                     0
crawlTime                0
condition                0
sellerName               0
sellerId                 0
sellerbewertung    1628234
seller_sterne      1628234
lieferdatum         208608
lieferpreis         208608
lieferung_durch          0
ranking                  0
buyBoxWinner             0
numberOfSellers          0
trigByReactive           0
dtype: int64


### Tramsform Column lieferung_durch

In [17]:
df['Fulfillment_type'] = ['FBA' if isinstance(x, str) and 'amazon' in x.lower() else 'FBM' for x in df['lieferung_durch']]

## Transform Dataframe Lieferdauer

In [18]:
import pandas as pd
dropped_rows_cause_lieferdatum_null = df[df['lieferdatum'].isna()]

def replace_month_names(date_string):
    month_map = {
        "Januar": "January",
        "Februar": "February",
        "März": "March",
        "April": "April",
        "Mai": "May",
        "Juni": "June",
        "Juli": "July",
        "August": "August",
        "September": "September",
        "Oktober": "October",
        "November": "November",
        "Dezember": "December"
    }
    for german, english in month_map.items():
        date_string = date_string.replace(german, english)
    return date_string

def split_dates(date_str):
    if '-' in date_str:
        date_parts = date_str.split('-')
        start_date_str = date_parts[0].strip()
        end_date_str = date_parts[1].strip()
    else:
        start_date_str = date_str.strip()
        end_date_str = None
    return start_date_str, end_date_str

df = df.dropna(subset=['lieferdatum'])
df['start_date'], df['end_date'] = zip(*df['lieferdatum'].apply(split_dates))


In [19]:
import re

# Define a function to extract the month name from a date string
def extract_month(date_string):
    match = re.search(r'\b(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\b', date_string)
    if match:
        return match.group(1)
    else:
        return None

# Iterate over each row in the dataframe and update the 'start_date' and 'end_date' columns as needed
for i, row in df.iterrows():
    # Case 1: If end_date is null, skip this row
    if pd.isnull(row['end_date']):
        continue
    
    # Case 2: If both columns contain a month name, skip this row
    if extract_month(row['start_date']) and extract_month(row['end_date']):
        continue
    
    # Case 3: If start_date doesn't contain a month name, append the month name from end_date to start_date
    if not extract_month(row['start_date']):
        month_name = extract_month(row['end_date'])
        if month_name:
            new_start_date = row['start_date'] + ' ' + month_name
            df.at[i, 'start_date'] = new_start_date
    
    # Case 4: If end_date doesn't contain a month name, append the month name from start_date to end_date
    if not extract_month(row['end_date']):
        month_name = extract_month(row['start_date'])
        if month_name:
            new_end_date = row['end_date'] + ' ' + month_name
            df.at[i, 'end_date'] = new_end_date
    
    # If none of the above conditions are met, do nothing and move on to the next row

# The 'start_date' and 'end_date' columns in the original dataframe have been updated in-place


In [20]:
df['start_date'] = df['start_date'].str.extract(r'(\d{1,2}\.\s[a-zA-ZäöüÄÖÜ]+)')
# extract the day and month from the start_date column
df['start_date'] = df['start_date'].str.extract(r'(\d+\.\s\w+)') # get the day and month
df['start_date_day'] = df['start_date'].str.extract(r'(\d+)').fillna(1).astype(int) # extract the day and convert to int, replace NaN with 1
df['start_day_month'] = df['start_date'].str.extract(r'([a-zA-ZäöüÄÖÜ]+)').fillna('Jan').astype(str) # extract the month and replace NaN with 'Jan'

# extract the day and month from the end_date column
df['end_date'] = df['end_date'].str.extract(r'(\d+\.\s\w+)')
df['end_date_day'] = df['end_date'].str.extract(r'(\d+)').fillna(1).astype(int)
df['end_day_month'] = df['end_date'].str.extract(r'([a-zA-ZäöüÄÖÜ]+)').fillna('Jan').astype(str)

# set end_date_day and end_day_month to NaN if end_date is null
df.loc[df['end_date'].isna(), ['end_date_day', 'end_day_month']] = np.nan

In [21]:
# Define a dictionary to map month names to integers
month_map = {
    'Januar': 1,
    'Jan':1,
    'Februar': 2,
    'März': 3,
    'April': 4,
    'Mai': 5,
    'Juni': 6,
    'Juli': 7,
    'August': 8,
    'September': 9,
    'Oktober': 10,
    'November': 11,
    'Dezember': 12
}

# Parse start_day_month column to integers
df['start_day_month'] = df['start_day_month'].map(month_map)

# Parse end_day_month column to integers
df['end_day_month'] = df['end_day_month'].fillna('').map(month_map)
df['end_day_month'] = df['end_day_month'].astype('Int64')
df['end_date_day'] = df['end_date_day'].astype('Int64')
# Convert start date information to datetime
df['start_date_datetime'] = pd.to_datetime(df['start_date_day'].astype(str) + '-' + df['start_day_month'].astype(str) + '-2023', format='%d-%m-%Y', errors='coerce')

# Convert end date information to datetime, if not null
df['end_date_datetime'] = pd.to_datetime(df['end_date_day'].astype(str).fillna('').astype(str) + '-' + df['end_day_month'].astype(str).fillna('').astype(str) + '-2023', format='%d-%m-%Y', errors='coerce')

In [22]:
df['final_delivery_datetime'] = None
import math
# assuming your dataframe is called df
for index, row in df.iterrows():
    if pd.notna(row['start_date_datetime']) and pd.isna(row['end_date_datetime']):
        df.at[index, 'final_delivery_datetime'] = row['start_date_datetime']
    elif pd.notna(row['start_date_datetime']) and pd.notna(row['end_date_datetime']):
        # case 2: both columns have values
        start_date = pd.to_datetime(row['start_date_datetime'])
        end_date = pd.to_datetime(row['end_date_datetime'])
        if end_date < start_date:
            start_date, end_date = end_date, start_date  # swap the two dates
        time_diff = math.floor((end_date - start_date) / pd.Timedelta(hours=2))
        delivery_time = start_date + pd.Timedelta(hours=time_diff)
        df.at[index, 'final_delivery_datetime'] = delivery_time

df['date_diff'] = df.apply(lambda row: (row['final_delivery_datetime'] - pd.to_datetime(row['time'])).days if pd.notna(row['time']) else None, axis=1)

In [26]:
# create a new dataframe with only the rows where date_diff is less than zero
df_second_time = df[df['date_diff'] < 0].copy()

# if you want to reset the index of the new dataframe
df_second_time.reset_index(drop=True, inplace=True)

In [27]:
negative_values_count = (df['date_diff'] < 0).sum()
print(f"There are {negative_values_count} negative values in the 'date_diff' column.")
negative_date_diff_rows = df[df['date_diff'] < 0]
selected_columns = negative_date_diff_rows[['id','lieferdatum','final_delivery_datetime','end_date_datetime','start_date_datetime', 'start_date_day','start_day_month','start_date','end_date']]
selected_columns
df['second_time'] = df['date_diff'] < 0


There are 62657 negative values in the 'date_diff' column.


## Second Lieferdatum for English Values

In [28]:
import pandas as pd

def replace_month_names(date_string):
    month_map = {
        "Januar": "January",
        "Februar": "February",
        "März": "March",
        "April": "April",
        "Mai": "May",
        "Juni": "June",
        "Juli": "July",
        "August": "August",
        "September": "September",
        "Oktober": "October",
        "November": "November",
        "Dezember": "December"
    }
    for german, english in month_map.items():
        date_string = date_string.replace(german, english)
    return date_string

def split_dates(date_str):
    if '-' in date_str:
        date_parts = date_str.split('-')
        start_date_str = date_parts[0].strip()
        end_date_str = date_parts[1].strip()
    else:
        start_date_str = date_str.strip()
        end_date_str = None
    return start_date_str, end_date_str

# Drop NaN values in the 'lieferdatum' column for the new DataFrame
df_second_time = df_second_time.dropna(subset=['lieferdatum'])

# Apply the split_dates function to the new DataFrame
df_second_time['start_date'], df_second_time['end_date'] = zip(*df_second_time['lieferdatum'].apply(split_dates))

In [29]:
import re

# Define a function to extract the month name from a date string
def extract_month(date_string):
    match = re.search(r'\b(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\b', date_string)
    if match:
        return match.group(1)
    else:
        return None

# Iterate over each row in the dataframe and update the 'start_date' and 'end_date' columns as needed
for i, row in df_second_time.iterrows():
    # Case 1: If end_date is null, skip this row
    if pd.isnull(row['end_date']):
        continue
    
    # Case 2: If both columns contain a month name, skip this row
    if extract_month(row['start_date']) and extract_month(row['end_date']):
        continue
    
    # Case 3: If start_date doesn't contain a month name, append the month name from end_date to start_date
    if not extract_month(row['start_date']):
        month_name = extract_month(row['end_date'])
        if month_name:
            new_start_date = row['start_date'] + ' ' + month_name
            df_second_time.at[i, 'start_date'] = new_start_date
    
    # Case 4: If end_date doesn't contain a month name, append the month name from start_date to end_date
    if not extract_month(row['end_date']):
        month_name = extract_month(row['start_date'])
        if month_name:
            new_end_date = row['end_date'] + ' ' + month_name
            df_second_time.at[i, 'end_date'] = new_end_date
    

In [30]:
# extract the day and month from the start_date column
df['start_date'] = df['start_date'].str.extract(r'(\d{1,2}\.\s[a-zA-ZäöüÄÖÜ]+)')
import re

def extract_day(date_str):
    if date_str is None or not isinstance(date_str, str):
        return None

    day = re.search(r'\d+', date_str)
    return int(day.group()) if day else None

def extract_month(date_str):
    if date_str is None or not isinstance(date_str, str):
        return None

    month_pattern = r'(January|February|March|April|May|June|July|August|September|October|November|December|Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)'
    month = re.search(month_pattern, date_str)
    return month.group() if month else None

df_second_time['start_date1'] = df_second_time['start_date'].apply(extract_day)
df_second_time['start_date2'] = df_second_time['start_date'].apply(extract_month)
df_second_time['start_date'] = df_second_time.apply(lambda x: f"{x['start_date1']} {x['start_date2']}" if x['start_date1'] is not None and x['start_date2'] is not None else None, axis=1)
df_second_time = df_second_time.drop(['start_date1', 'start_date2'], axis=1)

In [31]:
# extract the day and month from the start_date column
df_second_time['start_date'] = df_second_time['start_date'].str.extract(r'(\d+\.?\s\w+)')
df_second_time['start_date_day'] = df_second_time['start_date'].str.extract(r'(\d+)').fillna(1).astype(int)
df_second_time['start_day_month'] = df_second_time['start_date'].str.extract(r'([a-zA-ZäöüÄÖÜ]+)').fillna('Jan').astype(str)

# extract the day and month from the end_date column
df_second_time['end_date'] = df_second_time['end_date'].str.extract(r'(\d+\.?\s\w+)')
df_second_time['end_date_day'] = df_second_time['end_date'].str.extract(r'(\d+)').fillna(1).astype(int)
df_second_time['end_day_month'] = df_second_time['end_date'].str.extract(r'([a-zA-ZäöüÄÖÜ]+)').fillna('Jan').astype(str)

# set end_date_day and end_day_month to NaN if end_date is null
df_second_time.loc[df_second_time['end_date'].isna(), ['end_date_day', 'end_day_month']] = np.nan

In [32]:
# Define a dictionary to map month names to integers
month_map = {
    'Januar': 1,
    'Jan':1,
    'Februar': 2,
    'März': 3,
    'March':3,
    'April': 4,
    'Mai': 5,
    'May':5,
    'Juni': 6,
    'Juli': 7,
    'August': 8,
    'September': 9,
    'Oktober': 10,
    'November': 11,
    'Dezember': 12
}

# Parse start_day_month column to integers
df_second_time['start_day_month'] = df_second_time['start_day_month'].map(month_map)

# Parse end_day_month column to integers
df_second_time['end_day_month'] = df_second_time['end_day_month'].fillna('').map(month_map)
df_second_time['end_day_month'] = df_second_time['end_day_month'].astype('Int64')
df_second_time['end_date_day'] = df_second_time['end_date_day'].astype('Int64')
# Convert start date information to datetime
df_second_time['start_date_datetime'] = pd.to_datetime(df_second_time['start_date_day'].astype(str) + '-' + df_second_time['start_day_month'].astype(str) + '-2023', format='%d-%m-%Y', errors='coerce')

# Convert end date information to datetime, if not null
df_second_time['end_date_datetime'] = pd.to_datetime(df_second_time['end_date_day'].astype(str).fillna('').astype(str) + '-' + df_second_time['end_day_month'].astype(str).fillna('').astype(str) + '-2023', format='%d-%m-%Y', errors='coerce')

In [39]:
df_second_time['final_delivery_datetime'] = None
import math
# assuming your dataframe is called df
for index, row in df_second_time.iterrows():
    if pd.notna(row['start_date_datetime']) and pd.isna(row['end_date_datetime']):
        df_second_time.at[index, 'final_delivery_datetime'] = row['start_date_datetime']
    elif pd.notna(row['start_date_datetime']) and pd.notna(row['end_date_datetime']):
        # case 2: both columns have values
        start_date = pd.to_datetime(row['start_date_datetime'])
        end_date = pd.to_datetime(row['end_date_datetime'])
        if end_date < start_date:
            start_date, end_date = end_date, start_date  # swap the two dates
        time_diff = math.floor((end_date - start_date) / pd.Timedelta(hours=2))
        delivery_time = start_date + pd.Timedelta(hours=time_diff)
        df_second_time.at[index, 'final_delivery_datetime'] = delivery_time

df_second_time['date_diff'] = df_second_time.apply(lambda row: (row['final_delivery_datetime'] - pd.to_datetime(row['time'])).days if pd.notna(row['time']) and row['final_delivery_datetime'] is not None else None, axis=1)

In [40]:
df.update(df_second_time)

In [41]:
negative_values_count = (df['date_diff'] < 0).sum()
print(f"There are {negative_values_count} negative values in the 'date_diff' column.")
negative_date_diff_rows = df_second_time[df_second_time['date_diff'] < 0]
selected_columns = negative_date_diff_rows[['id','lieferdatum','final_delivery_datetime','end_date_datetime','start_date_datetime', 'start_date_day','start_day_month','start_date','end_date']]
selected_columns
df = df.append(dropped_rows_cause_lieferdatum_null, ignore_index=True)

There are 62657 negative values in the 'date_diff' column.


  df = df.append(dropped_rows_cause_lieferdatum_null, ignore_index=True)
  df = df.append(dropped_rows_cause_lieferdatum_null, ignore_index=True)


In [42]:
columns_to_drop = [
    "start_date",
    "end_date",
    "start_date_day",
    "start_day_month",
    "end_date_day",
    "end_day_month",
    "start_date_datetime",
    "end_date_datetime",
    "final_delivery_datetime",
    "second_time"
]

df = df.drop(columns=columns_to_drop, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591111 entries, 0 to 6591110
Data columns (total 20 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                float64       
 1   asin              object        
 2   price             float64       
 3   currency          object        
 4   time              datetime64[ns]
 5   crawlTime         object        
 6   condition         object        
 7   sellerName        object        
 8   sellerId          object        
 9   sellerbewertung   float64       
 10  seller_sterne     float64       
 11  lieferdatum       object        
 12  lieferpreis       float64       
 13  lieferung_durch   object        
 14  ranking           float64       
 15  buyBoxWinner      object        
 16  numberOfSellers   float64       
 17  trigByReactive    object        
 18  Fulfillment_type  object        
 19  date_diff         float64       
dtypes: datetime64[ns](1), float64(8), object(11)
m

## Transform Time Column Again

In [43]:
df['time'] = pd.to_datetime(df['time'], format="%Y-%m-%d %H:%M:%S")
## Set Vales for Date_diff <0 to NAN. 
## They are gonna replcaed later on in 1.2
df.loc[df['date_diff'] < 0, 'date_diff'] = np.nan

## Transform Price

In [44]:
# Prices over 1000 € are misinterpreted due to a wrongly placed . sign. Transforming it.
# All Prices below 2 Euros where checked if Products where existing, for which this price is real. All Products under question are actually priced over 1000€

# Select rows where the 'price' column is between 1 and 2
selected_rows = df[(df['price'] >= 1) & (df['price'] <= 2)]

# Print the number of selected rows
print(f"Number of rows with price between 1 and 2: {len(selected_rows)}")

# Multiply the price by 1000 for rows where the price is between 1 and 2
selected_rows.loc[(selected_rows['price'] >= 1) & (selected_rows['price'] <= 2), 'price'] *= 1000

# Update the original DataFrame with the updated values
df.update(selected_rows)

# Print the result
print("Selected rows with updated price:")
selected_rows.head()

# Print distinct ASINs related to the selected rows
distinct_asins = selected_rows['asin'].unique()
print("\nDistinct ASINs related to the selected rows:")
distinct_asins


Number of rows with price between 1 and 2: 12119


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows.loc[(selected_rows['price'] >= 1) & (selected_rows['price'] <= 2), 'price'] *= 1000


Selected rows with updated price:

Distinct ASINs related to the selected rows:


array(['B07PWXN3RG', 'B07KSBWFZS', 'B07HRW7DBC', 'B07B2WQPR3',
       'B0BJN4T6NJ', 'B01FE7K184', 'B08WPV9RM7', 'B08N5R7XXZ',
       'B08N5S64Z2', 'B08N5S9B8X', 'B09V4478YQ', 'B08WJBVM5G'],
      dtype=object)

## Save Data to File

In [45]:
df.to_csv(name_for_save_file, index=False)
#Relevant für RF: asin, price,(time),sellerid, sellerbewertung, seller_sterne, lieferpreis, Fulfillment_type, date_diff, ranking, numberOfSellers

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591111 entries, 0 to 6591110
Data columns (total 20 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                float64       
 1   asin              object        
 2   price             float64       
 3   currency          object        
 4   time              datetime64[ns]
 5   crawlTime         object        
 6   condition         object        
 7   sellerName        object        
 8   sellerId          object        
 9   sellerbewertung   float64       
 10  seller_sterne     float64       
 11  lieferdatum       object        
 12  lieferpreis       float64       
 13  lieferung_durch   object        
 14  ranking           float64       
 15  buyBoxWinner      object        
 16  numberOfSellers   float64       
 17  trigByReactive    object        
 18  Fulfillment_type  object        
 19  date_diff         float64       
dtypes: datetime64[ns](1), float64(8), object(11)
m

In [46]:
# Count the null values in each column of the DataFrame
null_values_count_per_column = df.isnull().sum()

# Print the number of null values for each column
print("Number of null values per column in df:")
print(null_values_count_per_column)

Number of null values per column in df:
id                        0
asin                      0
price                     0
currency                  0
time                      0
crawlTime                 0
condition                 0
sellerName                0
sellerId                  0
sellerbewertung     1617113
seller_sterne       1617113
lieferdatum          208608
lieferpreis          208608
lieferung_durch           0
ranking                   0
buyBoxWinner              0
numberOfSellers           0
trigByReactive            0
Fulfillment_type          0
date_diff            271265
dtype: int64


In [47]:
#Was erwarte ich: date diff null values, sellberwertung, lieferdatum, seller_sterne, lieferpreis. Und date_diff sollte höher sein als lieferdatum