In [1]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import normaltest, chi2_contingency, mannwhitneyu, ttest_ind, kstest
import statsmodels.api as sm
import os
import missingno as msno

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_colwidth', None)

In [2]:
# List of date columns for each Olist dataset
# This dictionary maps each dataset filename to a list of columns that should be parsed as dates.
date_cols = {
    'olist_orders_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
    'olist_order_items_dataset.csv': [
        'shipping_limit_date',
    ],
    'olist_order_reviews_dataset.csv': [
        'review_creation_date',
        'review_answer_timestamp',
    ],
    # The following datasets have NO date columns:
    # 'olist_customers_dataset.csv'
    # 'olist_geolocation_dataset.csv'
    # 'olist_order_payments_dataset.csv'
    # 'olist_products_dataset.csv'
    # 'olist_sellers_dataset.csv'
    # 'product_category_name_translation.csv'
    'master_olist_dataset.csv': [
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
        'shipping_limit_date',
        'review_creation_date',
        'review_answer_timestamp',
    ]
}

def read_olist_csv(path):
    """
    Reads an Olist CSV and parses dates for the correct columns.
    Args:
        path (str): Path to the CSV file.
    Returns:
        pd.DataFrame: Loaded dataframe with date columns parsed as datetime.
    """
    # Extract just the filename, e.g., 'olist_orders_dataset.csv'
    filename = os.path.basename(path)
    # Get the correct date columns for this file, or an empty list
    parse_dates = date_cols.get(filename, [])
    # Read the CSV, parsing the specified date columns (if any)
    return pd.read_csv(path, parse_dates=parse_dates)

Load original (raw) datasets:

In [3]:
original_df_orders           = read_olist_csv('../data/original_data/olist_orders_dataset.csv')
original_df_customers        = read_olist_csv('../data/original_data/olist_customers_dataset.csv')
original_df_order_items      = read_olist_csv('../data/original_data/olist_order_items_dataset.csv')
original_df_order_payments   = read_olist_csv('../data/original_data/olist_order_payments_dataset.csv')
original_df_reviews          = read_olist_csv('../data/original_data/olist_order_reviews_dataset.csv')
original_df_products         = read_olist_csv('../data/original_data/olist_products_dataset.csv')
original_df_prod_cat_tr      = read_olist_csv('../data/original_data/product_category_name_translation.csv')
original_df_sellers          = read_olist_csv('../data/original_data/olist_sellers_dataset.csv')
original_df_geolocation      = read_olist_csv('../data/original_data/olist_geolocation_dataset.csv')

Load cleaned datasets

In [4]:
cleaned_delivered_df_orders           = read_olist_csv('../data/cleaned_data/olist_orders_dataset.csv')
cleaned_df_customers        = read_olist_csv('../data/cleaned_data/olist_customers_dataset.csv')
cleaned_delivered_df_order_items      = read_olist_csv('../data/cleaned_data/olist_order_items_dataset.csv')
cleaned_delivered_df_order_payments   = read_olist_csv('../data/cleaned_data/olist_order_payments_dataset.csv')
cleaned_delivered_df_reviews          = read_olist_csv('../data/cleaned_data/olist_order_reviews_dataset.csv')
cleaned_df_products         = read_olist_csv('../data/cleaned_data/olist_products_dataset.csv')
cleaned_df_prod_cat_tr      = read_olist_csv('../data/cleaned_data/product_category_name_translation.csv')
cleaned_df_sellers          = read_olist_csv('../data/cleaned_data/olist_sellers_dataset.csv')
cleaned_df_geolocation      = read_olist_csv('../data/cleaned_data/olist_geolocation_dataset.csv')

In [5]:
df_meta = pd.read_csv("../data/cleaned_data/olist_orders_dataset.csv", nrows=0)
print(df_meta.columns.tolist())

['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']


Load master dataset

In [6]:
master_olist_dataset = read_olist_csv('../data/cleaned_data/master_olist_dataset.csv')

In [7]:
master_olist_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115093 entries, 0 to 115092
Data columns (total 41 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       115093 non-null  object        
 1   customer_id                    115093 non-null  object        
 2   order_status                   115093 non-null  object        
 3   order_purchase_timestamp       115093 non-null  datetime64[ns]
 4   order_approved_at              115093 non-null  datetime64[ns]
 5   order_delivered_carrier_date   115093 non-null  datetime64[ns]
 6   order_delivered_customer_date  115093 non-null  datetime64[ns]
 7   order_estimated_delivery_date  115093 non-null  datetime64[ns]
 8   customer_unique_id             115093 non-null  object        
 9   customer_zip_code_prefix       115093 non-null  int64         
 10  customer_city                  115093 non-null  object        
 11  

In [8]:
master_olist_dataset.describe()

Unnamed: 0,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_zip_code_prefix,order_item_id,shipping_limit_date,price,freight_value,...,product_weight_g,product_length_cm,product_height_cm,product_width_cm,seller_zip_code_prefix,payment_sequential,payment_installments,payment_value,review_score,review_answer_timestamp
count,115093,115093,115093,115093,115093,115093.0,115093.0,115093,115093.0,115093.0,...,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,115093.0,113721.0,113721
mean,2017-12-31 11:30:04.998384128,2017-12-31 21:56:23.543899392,2018-01-03 18:23:03.572310784,2018-01-12 23:24:51.927893248,2018-01-24 07:25:19.567653888,35057.944697,1.196554,2018-01-07 02:08:09.010356992,120.060541,19.988354,...,2107.770021,30.242352,16.587812,23.062628,24451.798702,1.091248,2.938737,171.93889,4.0825,2018-01-16 08:06:48.084285440
min,2016-10-03 09:44:50,2016-10-04 09:43:32,2016-10-08 10:34:01,2016-10-11 13:46:32,2016-10-27 00:00:00,1004.0,1.0,2016-10-08 10:34:01,0.85,0.0,...,0.0,7.0,2.0,6.0,1001.0,1.0,0.0,0.0,1.0,2016-10-16 03:20:17
25%,2017-09-12 22:11:35,2017-09-13 13:33:17,2017-09-15 17:13:26,2017-09-24 13:14:56,2017-10-04 00:00:00,11250.0,1.0,2017-09-20 02:43:52,39.9,13.08,...,300.0,18.0,8.0,15.0,6429.0,1.0,1.0,60.85,4.0,2017-09-28 00:37:34
50%,2018-01-18 22:23:16,2018-01-19 10:32:57,2018-01-23 17:49:54,2018-02-01 12:46:41,2018-02-15 00:00:00,24310.0,1.0,2018-01-25 17:00:30,74.9,16.28,...,700.0,25.0,13.0,20.0,13660.0,1.0,2.0,108.12,5.0,2018-02-05 00:55:20
75%,2018-05-04 03:59:20,2018-05-04 14:53:32,2018-05-07 15:28:00,2018-05-14 22:56:29,2018-05-25 00:00:00,58701.0,1.0,2018-05-10 09:50:40,133.0,21.17,...,1800.0,38.0,20.0,30.0,27930.0,1.0,4.0,188.93,5.0,2018-05-20 11:36:07
max,2018-08-29 15:00:37,2018-08-29 15:10:26,2018-09-11 19:48:28,2018-10-17 13:22:46,2018-10-25 00:00:00,99980.0,21.0,2020-04-09 22:35:08,6735.0,409.68,...,40425.0,105.0,105.0,118.0,99730.0,26.0,24.0,13664.08,5.0,2018-10-29 12:27:35
std,,,,,,29844.633374,0.700871,,183.015633,15.734193,...,3775.318176,16.131124,13.426735,11.731936,27584.181337,0.687378,2.774541,266.193435,1.346116,


In [9]:
# 4. Core delay & flag features
master_olist_dataset['is_late']    = master_olist_dataset['order_delivered_customer_date'] > master_olist_dataset['order_estimated_delivery_date']
master_olist_dataset['late_days']    = (master_olist_dataset['order_delivered_customer_date'] - master_olist_dataset['order_estimated_delivery_date']).dt.days
master_olist_dataset['dispatch_delay'] = (master_olist_dataset['order_delivered_carrier_date'] - master_olist_dataset['shipping_limit_date']).dt.days
master_olist_dataset['is_late_dispatch'] = (master_olist_dataset['dispatch_delay'] > 0)
master_olist_dataset['dispatch_time'] = (master_olist_dataset['order_delivered_carrier_date'] - master_olist_dataset['order_approved_at']).dt.days
master_olist_dataset['transit_time']  = (master_olist_dataset['order_delivered_customer_date']  - master_olist_dataset['order_delivered_carrier_date']).dt.days

# 5. Review bucket
master_olist_dataset['review_bucket'] = master_olist_dataset['review_score'].apply(lambda x: 'Good (4-5)' if x > 3 else 'Bad (1-3)')

In [10]:
master_olist_dataset.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,review_answer_timestamp,has_review,product_category_name_english,is_late,late_days,dispatch_delay,is_late_dispatch,dispatch_time,transit_time,review_bucket
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,...,2017-10-12 03:43:48,True,housewares,False,-8,-2,False,2,6,Good (4-5)
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,...,2017-10-12 03:43:48,True,housewares,False,-8,-2,False,2,6,Good (4-5)
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,7c396fd4830fd04220f754e42b4e5bff,3149,...,2017-10-12 03:43:48,True,housewares,False,-8,-2,False,2,6,Good (4-5)
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,af07308b275d755c9edb36a90c618231,47813,...,2018-08-08 18:37:50,True,perfumery,False,-6,-4,False,0,12,Good (4-5)
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,3a653a41f6f9fc3d2a113cf8398680e8,75265,...,2018-08-22 19:07:58,True,auto,False,-18,-5,False,0,9,Good (4-5)


In [11]:
# 6. Customer repeat & first-order flags
master_olist_dataset = master_olist_dataset.sort_values(['customer_unique_id','order_purchase_timestamp'])
master_olist_dataset['order_rank']        = master_olist_dataset.groupby('customer_unique_id')['order_purchase_timestamp'] \
                              .rank(method='first')
master_olist_dataset['first_order_flag']  = master_olist_dataset['order_rank'] == 1

order_counts = master_olist_dataset.groupby('customer_unique_id')['order_id'].nunique()
master_olist_dataset['customer_repeat_flag'] = master_olist_dataset['customer_unique_id'].map(order_counts > 1)

master_olist_dataset.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,is_late,late_days,dispatch_delay,is_late_dispatch,dispatch_time,transit_time,review_bucket,order_rank,first_order_flag,customer_repeat_flag
61019,e22acc9c116caa3f2b7121bbb380d08e,fadbb3709178fc513abc1b2670aa1ad2,delivered,2018-05-10 10:56:27,2018-05-10 11:11:18,2018-05-12 08:18:00,2018-05-16 20:48:37,2018-05-21,0000366f3b9a7992bf8c76cfdf3221e2,7787,...,False,-5,-4,False,1,4,Good (4-5),1.0,True,False
85359,3594e05a005ac4d06a72673270ef9ec9,4cb282e167ae9234755102258dd52ee8,delivered,2018-05-07 11:11:27,2018-05-07 18:25:44,2018-05-09 12:18:00,2018-05-10 18:02:42,2018-05-15,0000b849f77a49e4a4ce2b2a4ca5be3f,6053,...,False,-5,-3,False,1,1,Good (4-5),1.0,True,False
30624,b33ec3b699337181488304f362a6b734,9b3932a6253894a02c1df9d19004239f,delivered,2017-03-10 21:05:03,2017-03-10 21:05:03,2017-03-13 12:58:30,2017-04-05 14:38:47,2017-04-07,0000f46a3911fa3c0805444483337064,88115,...,False,-2,-3,False,2,23,Bad (1-3),1.0,True,False
113968,41272756ecddd9a9ed0180413cc22fb6,914991f0c02ef0843c0e7010c819d642,delivered,2017-10-12 20:29:41,2017-10-12 20:49:17,2017-10-13 20:08:19,2017-11-01 21:23:05,2017-11-13,0000f6ccb0745a6a4b88665a16c9f078,66812,...,False,-12,-6,False,0,19,Good (4-5),1.0,True,False
48104,d957021f1127559cd947b62533f484f7,47227568b10f5f58a524a75507e6992c,delivered,2017-11-14 19:45:42,2017-11-14 20:06:52,2017-11-16 19:52:10,2017-11-27 23:08:56,2017-12-05,0004aac84e0df4da2b147fca70cf8255,18040,...,False,-8,-7,False,1,11,Good (4-5),1.0,True,False


In [13]:
# 7. Retention delta (Δ repeat-rate between on-time vs. late first orders)
firsts = master_olist_dataset[master_olist_dataset['first_order_flag']]
ret = firsts.groupby('is_late').agg(
    total_customers=('customer_unique_id','nunique'),
    repeat_customers=('customer_repeat_flag', 'sum')
).reset_index()
ret['repeat_rate'] = ret['repeat_customers']/ret['total_customers']

rr_on_time = ret.loc[ret['is_late']==False, 'repeat_rate'].iloc[0]
rr_late    = ret.loc[ret['is_late']==True,  'repeat_rate'].iloc[0]
retention_delta = rr_on_time - rr_late

print(f"Retention on time first orders: {rr_on_time*100:.4f}%")
print(f"Retention late first orders: {rr_late*100:.4f}%")
print(f"Retention Δ (on-time vs late first orders): {retention_delta*100:.4f}%")

Retention on time first orders: 3.0413%
Retention late first orders: 2.5132%
Retention Δ (on-time vs late first orders): 0.5282%


In [None]:
# 8. Impact model: lost revenue from late first orders
late_first_count = ret.loc[ret['is_late']==True, 'total_customers'].iloc[0]
avg_order_value  = master_olist_dataset['payment_value'].mean()
lost_revenue     = retention_delta * late_first_count * avg_order_value

print(f"Estimated lost revenue from late first orders: {lost_revenue:,.2f}")

# 9. Seller- and state-level aggregates
seller_agg = master_olist_dataset.groupby('seller_id').agg(
    total_orders   = ('order_id','nunique'),
    late_orders    = ('is_late','sum'),
    late_rate      = ('is_late','mean'),
    avg_late_days  = ('late_days','mean'),
    avg_review_score = ('review_score', 'mean'),
    avg_payment_value = ('payment_value', 'mean'),
    late_dispatch_count = ('is_late_dispatch', 'sum'),
    late_dispatch_rate = ('is_late_dispatch', 'mean'),
    avg_dispatch_time = ('dispatch_time', 'mean'),
    avg_dispatch_delay = ('dispatch_delay', 'mean'),
).reset_index()

state_agg = master_olist_dataset.groupby('customer_state').agg(
    total_orders   = ('order_id','nunique'),
    late_orders    = ('is_late','sum'),
    late_rate      = ('is_late','mean'),
    avg_late_days  = ('late_days','mean'),
    avg_review_score = ('review_score', 'mean'),
    avg_payment_value = ('payment_value', 'mean'),
    late_dispatch_count = ('is_late_dispatch', 'sum'),
    late_dispatch_rate = ('is_late_dispatch', 'mean'),
    avg_dispatch_time = ('dispatch_time', 'mean'),
    avg_dispatch_delay = ('dispatch_delay', 'mean'),
).reset_index()

Estimated lost revenue from late first orders: 6,901.66


In [16]:
master_olist_dataset.shape

(115093, 51)

In [17]:
# 9. Merge location data
# Get median geo coordinates for each zip code
median_coords = cleaned_df_geolocation.groupby('geolocation_zip_code_prefix').agg({
    'geolocation_lat': 'median',
    'geolocation_lng': 'median'
}).reset_index()

# Merge customer geo data
master_olist_dataset = master_olist_dataset.merge(
    median_coords,
    left_on='customer_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
    how='left'
).rename(columns={
    'geolocation_lat': 'customer_lat',
    'geolocation_lng': 'customer_lng'
})

# Merge seller geo data 
master_olist_dataset = master_olist_dataset.merge(
    median_coords,
    left_on='seller_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
    how='left',
    suffixes=('_drop', '')
).rename(columns={
    'geolocation_lat': 'seller_lat',
    'geolocation_lng': 'seller_lng'
})

# Drop redundant columns
master_olist_dataset = master_olist_dataset.drop(columns=['geolocation_zip_code_prefix', 'geolocation_zip_code_prefix_drop'], errors='ignore')
master_olist_dataset.shape

(115093, 55)

In [None]:
display(master_olist_dataset.head())

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,dispatch_time,transit_time,review_bucket,order_rank,first_order_flag,customer_repeat_flag,customer_lat,customer_lng,seller_lat,seller_lng
0,e22acc9c116caa3f2b7121bbb380d08e,fadbb3709178fc513abc1b2670aa1ad2,delivered,2018-05-10 10:56:27,2018-05-10 11:11:18,2018-05-12 08:18:00,2018-05-16 20:48:37,2018-05-21,0000366f3b9a7992bf8c76cfdf3221e2,7787,...,1,4,Good (4-5),1.0,True,False,-23.337558,-46.828132,-22.709287,-47.665524
1,3594e05a005ac4d06a72673270ef9ec9,4cb282e167ae9234755102258dd52ee8,delivered,2018-05-07 11:11:27,2018-05-07 18:25:44,2018-05-09 12:18:00,2018-05-10 18:02:42,2018-05-15,0000b849f77a49e4a4ce2b2a4ca5be3f,6053,...,1,1,Good (4-5),1.0,True,False,-23.55785,-46.788771,-23.491089,-46.583632
2,b33ec3b699337181488304f362a6b734,9b3932a6253894a02c1df9d19004239f,delivered,2017-03-10 21:05:03,2017-03-10 21:05:03,2017-03-13 12:58:30,2017-04-05 14:38:47,2017-04-07,0000f46a3911fa3c0805444483337064,88115,...,2,23,Bad (1-3),1.0,True,False,-27.5437,-48.635335,-23.21132,-46.762824
3,41272756ecddd9a9ed0180413cc22fb6,914991f0c02ef0843c0e7010c819d642,delivered,2017-10-12 20:29:41,2017-10-12 20:49:17,2017-10-13 20:08:19,2017-11-01 21:23:05,2017-11-13,0000f6ccb0745a6a4b88665a16c9f078,66812,...,0,19,Good (4-5),1.0,True,False,-1.307305,-48.481787,-23.542864,-46.490706
4,d957021f1127559cd947b62533f484f7,47227568b10f5f58a524a75507e6992c,delivered,2017-11-14 19:45:42,2017-11-14 20:06:52,2017-11-16 19:52:10,2017-11-27 23:08:56,2017-12-05,0004aac84e0df4da2b147fca70cf8255,18040,...,1,11,Good (4-5),1.0,True,False,-23.506028,-47.470332,-23.302839,-45.972729


In [20]:
# Calculate distance between customer and seller
# Using Haversine formula to calculate distance between two coordinates
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r

# Apply the formula to calculate distance for all rows with valid coordinates
mask = (~master_olist_dataset['customer_lat'].isna()) & (~master_olist_dataset['seller_lat'].isna())
master_olist_dataset.loc[mask, 'distance_km'] = haversine(
    master_olist_dataset.loc[mask, 'customer_lat'],
    master_olist_dataset.loc[mask, 'customer_lng'],
    master_olist_dataset.loc[mask, 'seller_lat'],
    master_olist_dataset.loc[mask, 'seller_lng']
)

# Print summary statistics of the new distance feature
print(f"Distance statistics (km):")
print(f"Mean: {master_olist_dataset['distance_km'].mean():.2f}")
print(f"Median: {master_olist_dataset['distance_km'].median():.2f}")
print(f"Min: {master_olist_dataset['distance_km'].min():.2f}")
print(f"Max: {master_olist_dataset['distance_km'].max():.2f}")
print(f"Null values: {master_olist_dataset['distance_km'].isna().sum()}")

Distance statistics (km):
Mean: 596.36
Median: 431.82
Min: 0.00
Max: 3398.55
Null values: 559


In [24]:
master_olist_dataset.isna().sum()

order_id                              0
customer_id                           0
order_status                          0
order_purchase_timestamp              0
order_approved_at                     0
order_delivered_carrier_date          0
order_delivered_customer_date         0
order_estimated_delivery_date         0
customer_unique_id                    0
customer_zip_code_prefix              0
customer_city                         0
customer_state                        0
order_item_id                         0
product_id                            0
seller_id                             0
shipping_limit_date                   0
price                                 0
freight_value                         0
product_category_name                 0
product_name_lenght                   0
product_description_lenght            0
product_photos_qty                    0
product_weight_g                      0
product_length_cm                     0
product_height_cm                     0


In [25]:
# 10. Write outputs
master_olist_dataset.to_csv('../data/cleaned_data/master_olist_dataset_with_features.csv', index=False)
seller_agg.to_csv('../data/cleaned_data/seller_aggregation.csv', index=False)
state_agg.to_csv('../data/cleaned_data/state_aggregation.csv', index=False)