In [1]:
import pandas as pd
import numpy as np


df = pd.read_csv('../Grunddatein/OffersData.csv')
name_for_save_file = '../Grunddatein/Zwischendatein/CleanedDataComplete.csv'

asins2 = pd.read_csv('../Grunddatein/ASINS.csv')
skip_cell = True

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6603268 entries, 0 to 6603267
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   id               int64  
 1   asin             object 
 2   price            float64
 3   currency         object 
 4   time             object 
 5   crawlTime        object 
 6   condition        object 
 7   sellerName       object 
 8   sellerId         object 
 9   sellerbewertung  object 
 10  seller_sterne    object 
 11  lieferdatum      object 
 12  lieferpreis      object 
 13  lieferung_durch  object 
 14  ranking          int64  
 15  buyBoxWinner     object 
 16  numberOfSellers  int64  
 17  trigByReactive   object 
dtypes: float64(1), int64(3), object(14)
memory usage: 906.8+ MB


In [3]:
print("Length of the DataFrame:", df.shape[0])
num_true = (df['buyBoxWinner'] == True).sum()
print(f"The Number of Offers winning the BuyBox is {num_true}")
distinct_asins = df['asin'].unique()
asins_complete = asins2['id'].unique()
print("Number of distinct asins:", len(distinct_asins))
set_distinct_asins = set(distinct_asins)
set_asins_complete = set(asins_complete)

asins_not_in_both = set_distinct_asins.symmetric_difference(set_asins_complete)
print("ASINs not present in both:", asins_not_in_both)

Length of the DataFrame: 6603268
The Number of Offers winning the BuyBox is 0
Number of distinct asins: 995
ASINs not present in both: {'B09MY48N93', 'B0B2RRZLY6', 'B09L6247P8', 'B09W8GB7KY', 'B07H4FRQ4T'}


## Delete ASIN with Missing Values and
## Add Missing Values to the two Offers

In [4]:
df = df[~(df['asin'] == 'B0743BR42Y')]
row_index = df[df['id'] == 1637198].index[0]
# Update the 'price' and 'currency' for the row with 'id' equal to 1637198 with the price from the hour before
df.loc[row_index, 'price'] = 152.44
df.loc[row_index, 'currency'] = '€'
# Find the index of the row with 'id' equal to 2486948
row_index = df[df['id'] == 2486948].index[0]
# Update the 'price' and 'currency' for the row with 'id' equal to 2486948 with the price from the hour before
df.loc[row_index, 'price'] = 185.32
df.loc[row_index, 'currency'] = '€'



## Transform BuyBoxWinner Column to Boolean Format

In [None]:
df['buyBoxWinner'] = df['buyBoxWinner'].apply(lambda x: True if x == 't' else False)

## Transform Time Column to Datetime Format

In [5]:
# convert the 'time' column to datetime format
df['time'] = pd.to_datetime(df['time'], format="%Y-%m-%d-%H")
df.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,(4211 Bewertungen) 100 % positiv in...,5 von 5,"Samstag, 25. März",GRATIS,Amazon,0,t,13,f
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,(29 Bewertungen) 97 % positiv über ...,5 von 5,29. - 31. März,"9,90 €",STILE IMMAGINE DIGITAL HD,0,t,2,f
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,t,8,f
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,(38466 Bewertungen) 95 % positiv in...,4.5 von 5,27. - 28. März,"2,99 €",Musikhaus Kirstein GmbH,0,t,3,f
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,"Samstag, 25. März",GRATIS,Amazon,0,t,23,f


## Transforming/Adding Missing Rows/Times
Missing Rows are "Replaced" by the rows for the same asin one hour before
Search For Asins with an Hour Gap

In [6]:
if not skip_cell:
    # create a MultiIndex using columns "asin", "time", and "id"
    df.set_index(['asin', 'time'], inplace=True)

    # sort the DataFrame by the MultiIndex
    df.sort_index(inplace=True)

    # Initialize an empty list called asins_and_gaps
    asins_and_gaps = []

    def check_one_hour_gap(group):
        group = group.reset_index()  # reset index before indexing with missing_gap Series
        time_diff = group['time'].diff()
        one_hour_gap = pd.Timedelta(hours=1)
        missing_gap = time_diff > one_hour_gap

        if missing_gap.any():
            gap_start_time = group['time'][missing_gap].iloc[0] - one_hour_gap
            missing_hour = gap_start_time.strftime("%Y-%m-%d %H")
            asin = group["asin"].iloc[0]
            print(f'Missing one-hour gap in ASIN {asin} at {missing_hour}')
            print("-------------------------")
            print("\n")

            # Add the ASIN and the missing hour to the asins_and_gaps list
            asins_and_gaps.append((asin, missing_hour))

    # apply custom function to each group
    df.groupby('asin').apply(check_one_hour_gap)

    #print("ASINs and gaps:")
    #print(asins_and_gaps)
    #asins_and_gaps
    distinct_asins = {tup[0] for tup in asins_and_gaps}
    print(f"For {len(distinct_asins)} there is at least one time Gap")
    print(f"In Total there are {len(asins_and_gaps)} time Gaps")

### Find Rows from one hour before and insert them into missing_data_df

In [7]:
if not skip_cell:
    # Assuming you already have the DataFrame df and the asins_and_gaps list

    # Initialize an empty DataFrame called missing_data_df
    missing_data_df = pd.DataFrame()

    for asin, gap_time_str in asins_and_gaps:
        # Convert the gap time string to a datetime object
        gap_time = pd.to_datetime(gap_time_str, format='%Y-%m-%d %H')

        matching_rows = pd.DataFrame()
        gap_found = 0

        # Search for rows up to 3 hours earlier
        for hours in range(1, 4):
            time_earlier = gap_time - pd.Timedelta(hours=hours)
            matching_rows = df.loc[(df.index.get_level_values('asin') == asin) & (df.index.get_level_values('time') == time_earlier)]

            if not matching_rows.empty:
                gap_found = -hours
                break

        # If matching rows are found
        if not matching_rows.empty:
            # Make a copy of matching_rows before modifying it
            matching_rows = matching_rows.copy()

            # Update the time column value for the copied rows to the appropriate gap
            matching_rows.reset_index(inplace=True)
            matching_rows[matching_rows.columns[matching_rows.columns.get_loc('time')]] = matching_rows['time'] - pd.Timedelta(hours=gap_found)

            # Add a new column named "time_gap" to indicate the gap at which the rows were found
            matching_rows['time_gap'] = gap_found

            # Append the matching rows to the missing_data_df DataFrame
            missing_data_df = pd.concat([missing_data_df, matching_rows])

    # Reset the index of the missing_data_df DataFrame
    missing_data_df.reset_index(drop=True, inplace=True)
    #missing_data_df.info()
    distinct_values = missing_data_df['asin'].nunique()
    distinct_values
    print(f"For {distinct_values} there could be found rows from 1/2/3 hourls earlier to replace the time gap")

## Concat Missing Dataframe to Normal Dataframe

In [8]:
if not skip_cell:
    df['time_gap'] = 0
    df['copyed_cause_missing'] = False
    df = df.reset_index()
    missing_data_df['copyed_cause_missing'] = True
    old_size = df.shape[0]
    df = pd.concat([df, missing_data_df])
    new_size = df.shape[0]
    del missing_data_df
    print(f"The Old Size was {old_size} and the New Size is {new_size}. This Means {new_size-old_size} have been added")

In [9]:
df.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,(4211 Bewertungen) 100 % positiv in...,5 von 5,"Samstag, 25. März",GRATIS,Amazon,0,t,13,f
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,(29 Bewertungen) 97 % positiv über ...,5 von 5,29. - 31. März,"9,90 €",STILE IMMAGINE DIGITAL HD,0,t,2,f
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,t,8,f
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,(38466 Bewertungen) 95 % positiv in...,4.5 von 5,27. - 28. März,"2,99 €",Musikhaus Kirstein GmbH,0,t,3,f
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,"Samstag, 25. März",GRATIS,Amazon,0,t,23,f


## Transforming Dataframe Seller Sterne

In [12]:
# Get distinct values for the 'seller_sterne' column
distinct_seller_sterne = df['seller_sterne'].unique()

# Print the distinct values
print("Distinct values for seller_sterne:", distinct_seller_sterne)

Distinct values for seller_sterne: ['5 von 5' nan '4.5 von 5' '4 von 5' '3.5 von 5' '3 von 5' '2 von 5'
 '1 von 5' '2.5 von 5' '1.5 von 5']


In [28]:

    # Get distinct values for the 'seller_sterne' column
    distinct_seller_sterne = df['seller_sterne'].unique()
    print("Distinct values for seller_sterne:", distinct_seller_sterne)

    # Define a lambda function to convert the string to a float or return None for null values
    str_to_float_or_none = lambda x: float(x.split()[0].replace(",", ".")) if isinstance(x, str) and len(x.split()) > 0 else None

    # Apply the lambda function to the 'seller_sterne' column to replace the string values with their corresponding float values
    df['seller_sterne'] = df['seller_sterne'].apply(str_to_float_or_none)

    # Check the distinct values for 'seller_sterne' after the replacement
    new_distinct_seller_sterne = df['seller_sterne'].unique()
    new_distinct_seller_sterne = np.sort(new_distinct_seller_sterne)
    print("Distinct values for seller_sterne after replacement:", [f"{value:.1f}" for value in new_distinct_seller_sterne])

Distinct values for seller_sterne: ['5 von 5' nan '4.5 von 5' '4 von 5' '3.5 von 5' '3 von 5' '2 von 5'
 '1 von 5' '2.5 von 5' '1.5 von 5']
4.308178099114687
4.5
Distinct values for seller_sterne after replacement: ['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0', 'nan']


## Transformin Dataframe Lieferpreis


In [14]:
df['lieferpreis'] = df['lieferpreis'].replace(['GRATIS', 'FREE'], 0.0)
##Replace Euro Signs
df['lieferpreis'] = df['lieferpreis'].replace('€', '', regex=True)
df = df.replace(',', '.', regex=True)
df['lieferpreis'] = df['lieferpreis'].astype(float, errors='ignore')
num_missing = df['lieferpreis'].isna().sum()
print(f"The column 'lieferpreis' has {num_missing} missing values.")


The column 'lieferpreis' has 208608 missing values.


## Transform Dataframe Sellerbewertung


In [15]:
# Split values into two columns based on first space
df[['vorderer_Teil', 'hinterer_Teil']] = df['sellerbewertung'].str.split(')', 1, expand=True)
df['hinterer_Teil'] = df['hinterer_Teil'].str.replace('%.*', '', regex=True)
# Replace NaN values in vorderer_Teil column with 0
df['vorderer_Teil'] = df['vorderer_Teil'].fillna(0)
df['vorderer_Teil'] = df['vorderer_Teil'].str.extract('(\d+)').astype(float)
df['hinterer_Teil'] = df['hinterer_Teil'].astype(float) / 100.0
df['sellerbewertung_ausgerechnet'] = df['vorderer_Teil'] * df['hinterer_Teil']
df['sellerbewertung'] = df['sellerbewertung_ausgerechnet']

# Drop columns that were created in the process
df = df.drop(['vorderer_Teil', 'hinterer_Teil', 'sellerbewertung_ausgerechnet'], axis=1)

  df[['vorderer_Teil', 'hinterer_Teil']] = df['sellerbewertung'].str.split(')', 1, expand=True)


In [16]:
df.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,4211.0,5 von 5,Samstag. 25. März,0.0,Amazon,0,t,13,f
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,28.13,5 von 5,29. - 31. März,9.9,STILE IMMAGINE DIGITAL HD,0,t,2,f
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,t,8,f
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,36542.7,4.5 von 5,27. - 28. März,2.99,Musikhaus Kirstein GmbH,0,t,3,f
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,Samstag. 25. März,0.0,Amazon,0,t,23,f


In [17]:
if not skip_cell:
    # Find rows where 'lieferung_durch' is a float
    float_rows = df[df['lieferung_durch'].apply(lambda x: isinstance(x, float))]

    # Print the rows where 'lieferung_durch' is a float
    len(float_rows)
    float_rows

### Tramsform Column lieferung_durch

In [18]:
df['Fulfillment_type'] = ['FBA' if isinstance(x, str) and 'amazon' in x.lower() else 'FBM' for x in df['lieferung_durch']]

## Transform Dataframe Lieferdauer

In [19]:
df['month'] = -1

# loop over each row in the dataframe
for i, row in df.iterrows():
    # check if the 'lieferdatum' value contains the word "April"
    if 'April' in str(row['lieferdatum']):
        # if it does, set the 'month' value to 4
        df.at[i, 'month'] = 4
    # if it does not, check if the 'lieferdatum' value contains the word "März"
    elif 'März' in str(row['lieferdatum']):
        # if it does, set the 'month' value to 3
        df.at[i, 'month'] = 3

In [20]:
import re
df['days'] = [[] for _ in range(len(df.index))]

# loop over each row in the dataframe
for i, row in df.iterrows():
    # extract all the numeric values from the 'lieferdatum' value using regular expressions
    nums = re.findall(r'\d+', str(row['lieferdatum']))
    # add the numeric values as a list to the 'days' column for this row
    df.at[i, 'days'] = [int(num) for num in nums]

In [21]:
# calculate the average value of each non-empty array in the 'days' column
avg_days = df['days'].apply(lambda x: sum(x)/len(x) if x else np.nan)

# replace the 'days' column with the average values
df['days'] = avg_days.astype(float)
#df['lieferdatum']

In [22]:
df['datetime'] = pd.to_datetime(df['time'].str[:10], format='%Y-%m-%d')
# create a new column called 'lieferdatum_datetime' and initialize it to NaN
df['lieferdatum_datetime'] = np.nan

# loop over each row in the dataframe
for i, row in df.iterrows():
    # get the 'month' and 'days' values for this row
    month = row['month']
    days = row['days']
    
    # skip this row if the 'month' or 'days' value is NaN or -1
    if np.isnan(days) or month == -1:
        continue
    
    # create a datetime object for this row and extract only the date portion
    date_str = f"2023-{int(month):02d}-{int(days):02d}"
    datetime_obj = pd.to_datetime(date_str, format='%Y-%m-%d')
    date_only = datetime_obj.date()
    
    # save the date object to the 'lieferdatum_datetime' column for this row
    df.at[i, 'lieferdatum_datetime'] = date_only
df.head()

# convert 'lieferdatum_datetime' column to a Pandas Timestamp object
df['lieferdatum_datetime'] = pd.to_datetime(df['lieferdatum_datetime'])
df['date_diff'] = np.where(df['lieferdatum_datetime'].isna(), np.nan, (df['lieferdatum_datetime'] - df['datetime']).dt.days)
df = df.drop(['days', 'month', 'datetime','lieferdatum_datetime'], axis=1)
df.head()


Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive,Fulfillment_type,date_diff
0,341683,B09SBXZV9V,141.55,€,2023-03-22 16:00:00,2023-03-22 16:54:19.521639,Neu,belli-shop,AOZ9PW800A1WK,4211.0,5 von 5,Samstag. 25. März,0.0,Amazon,0,t,13,f,FBA,3.0
1,341684,B0000C72GD,79.9,€,2023-03-22 16:00:00,2023-03-22 16:54:19.522425,Neu,STILE IMMAGINE DIGITAL HD,A16E8RFMSALSSB,28.13,5 von 5,29. - 31. März,9.9,STILE IMMAGINE DIGITAL HD,0,t,2,f,FBM,8.0
2,341685,B0001GRVJQ,55.31,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523125,Neu,amazon,amazon,,,,,Amazon,0,t,8,f,FBA,
3,341686,B0002CZU1U,273.28,€,2023-03-22 16:00:00,2023-03-22 16:54:19.523736,Neu,Musikhaus Kirstein GmbH,A2LUZCVBLA57KT,36542.7,4.5 von 5,27. - 28. März,2.99,Musikhaus Kirstein GmbH,0,t,3,f,FBM,5.0
4,341687,B0002HOS7M,75.62,€,2023-03-22 16:00:00,2023-03-22 16:54:19.524467,Neu,amazon,amazon,,,Samstag. 25. März,0.0,Amazon,0,t,23,f,FBA,3.0


## Transform Price

In [23]:
# Select rows where the 'price' column is between 1 and 2
selected_rows = df[(df['price'] >= 1) & (df['price'] <= 2)]
selected_rows.head()

Unnamed: 0,id,asin,price,currency,time,crawlTime,condition,sellerName,sellerId,sellerbewertung,seller_sterne,lieferdatum,lieferpreis,lieferung_durch,ranking,buyBoxWinner,numberOfSellers,trigByReactive,Fulfillment_type,date_diff
264,341947,B08WPV9RM7,1.063,€,2023-03-22 16:00:00,2023-03-22 16:54:19.659015,Neu,Stortle,A5LE00RCDFR6J,328.96,3.5 von 5,31. März - 6. April,3.1,Stortle,10,f,50,f,FBM,27.0
790,342473,B01FE7K184,1.118,€,2023-03-22 16:00:00,2023-03-22 16:54:19.836242,Neu,GetMarket,AKAKN0OST1C61,59.22,3.5 von 5,28. März - 3. April,9.99,GetMarket,1,f,3,f,FBM,24.0
791,342474,B01FE7K184,1.242,€,2023-03-22 16:00:00,2023-03-22 16:54:19.836543,Neu,Solution 4YOU,A3UY995ZKNNQ2M,84.46,3 von 5,4. - 14. April,0.0,Solution 4YOU,2,f,3,f,FBM,18.0
792,342475,B01FE7K184,1.326,€,2023-03-22 16:00:00,2023-03-22 16:54:19.836840,Neu,E-Mistero,A1T7OOZ029MPBM,48.4,4.5 von 5,4. - 12. April,78.57,E-Mistero,3,f,3,f,FBM,17.0
1228,342911,B08WPV9RM7,1.253,€,2023-03-22 16:00:00,2023-03-22 16:54:28.776958,Neu,To B To C,A26BPTIVQURTF4,62.31,3.5 von 5,31. März - 5. April,0.0,To B To C,19,f,50,f,FBM,27.0


In [24]:
#Prices over 1000 € are misinterpreted due to a wrongly placed . sign. Transforming it.
## All Prices below 2 Euros where checked if Products where existing, for which this price is real. All Products under question are actully priced over 1000€

# Select rows where the 'price' column is between 1 and 2
selected_rows = df[(df['price'] >= 1) & (df['price'] <= 2)]

# Multiply the price by 1000 for rows where the price is between 1 and 2
selected_rows.loc[(selected_rows['price'] >= 1) & (selected_rows['price'] <= 2), 'price'] *= 1000

# Update the original DataFrame with the updated values
df.update(selected_rows)

# Print the result
#selected_rows.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows.loc[(selected_rows['price'] >= 1) & (selected_rows['price'] <= 2), 'price'] *= 1000


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591111 entries, 0 to 6591110
Data columns (total 20 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id                float64
 1   asin              object 
 2   price             float64
 3   currency          object 
 4   time              object 
 5   crawlTime         object 
 6   condition         object 
 7   sellerName        object 
 8   sellerId          object 
 9   sellerbewertung   float64
 10  seller_sterne     object 
 11  lieferdatum       object 
 12  lieferpreis       float64
 13  lieferung_durch   object 
 14  ranking           float64
 15  buyBoxWinner      object 
 16  numberOfSellers   float64
 17  trigByReactive    object 
 18  Fulfillment_type  object 
 19  date_diff         float64
dtypes: float64(7), object(13)
memory usage: 1005.7+ MB


## Save Data to File

In [26]:
df.to_csv(name_for_save_file, index=False)
#Relevant für RF: asin, price,(time),sellerid, sellerbewertung, seller_sterne, lieferpreis, Fulfillment_type, date_diff, ranking, numberOfSellers

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591111 entries, 0 to 6591110
Data columns (total 20 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id                float64
 1   asin              object 
 2   price             float64
 3   currency          object 
 4   time              object 
 5   crawlTime         object 
 6   condition         object 
 7   sellerName        object 
 8   sellerId          object 
 9   sellerbewertung   float64
 10  seller_sterne     object 
 11  lieferdatum       object 
 12  lieferpreis       float64
 13  lieferung_durch   object 
 14  ranking           float64
 15  buyBoxWinner      object 
 16  numberOfSellers   float64
 17  trigByReactive    object 
 18  Fulfillment_type  object 
 19  date_diff         float64
dtypes: float64(7), object(13)
memory usage: 1005.7+ MB
