In [1]:
# Read the daily MRO table from CSV with the pyarrow parser; the file's first
# column becomes the frame's index. The frame is displayed as the cell output.
import pandas as pd

file_name = "./Data/mro_daily_clean.csv"
data = pd.read_csv(
    file_name,
    engine="pyarrow",
    index_col=0,
)
data

Unnamed: 0,yr_nbr,mth_nbr,week_nbr,week_day,hard_braking,mild_hb,hard_braking2,harsh_hb,very_harsh_hb,est_hh_incm_prmr_cd,...,mro,record_days,latitude1,longitude1,purchase_lat1,purchase_lng1,purchase_yr_nbr,purchase_mth_nbr,tavg,random_avg_traffic
,,,,,,,,,,,,,,,,,,,,,
0,2019,3,13,7,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,14.346983,12472.338289
1,2019,4,14,4,9,0,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.670879,12410.618966
2,2019,4,14,6,9,1,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.699830,12391.577959
3,2019,4,14,7,20,8,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.704561,12313.165404
4,2019,4,15,4,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.884265,12342.054130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18018968,2019,11,48,5,2,1,0,0,0,9.0,...,1.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,12.700000,18816.070000
18018969,2019,12,49,5,1,0,0,0,0,9.0,...,0.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,2.200000,6551.590000
18018970,2019,12,50,7,5,1,0,0,0,9.0,...,0.0,269,42.5,-82.9,42.4,-82.9,2018.0,6.0,-2.400000,15358.730000


## Feature Selection

In [2]:
from constants import (
    driver_behavior,
    vehicle_attributes,
    driver_attributes,
    driver_navigation,
    gis_attributes,
    record_day,
    target_mro,
)

In [3]:
# driver_behavior = [
#     "hard_braking",
#     "hard_acceleration",
#     "speeding_sum",
#     "day_mileage",
# ]

# # Vehicle and driver attributes
# vehicle_attributes = [
#     "gmqualty_model",
#     "umf_xref_finc_gbl_trim",
#     "engn_size",
#     "purchase_yr_nbr",
#     "purchase_mth_nbr",
# ]

# # driver attributes
# driver_attributes = [
#     "est_hh_incm_prmr_cd",
#     "purchaser_age_at_tm_of_purch",
#     "input_indiv_gndr_prmr_cd",
# ]

# # driver navigation
# driver_navigation = [
#     "id",
#     "yr_nbr",
#     "mth_nbr",
#     "week_nbr",
# ]

# # gis attributes
# gis_attributes = [
#     "tavg",
#     "random_avg_traffic",
# ]

# # record day, use this as a filter to make sure we have at least 16 weeks of records
# record_day = ["record_days"]

# # target (named target_mro in the constants import and in selected_columns)
# target_mro = ["mro"]

In [4]:
# Columns kept for modelling, concatenated from the feature-group lists
# imported from `constants`. The concatenation order is the column order of
# the resulting frame.
selected_columns = (
    driver_navigation
    + driver_behavior
    + vehicle_attributes
    + driver_attributes
    + gis_attributes
    + record_day
    + target_mro
)

# Take an explicit copy so `data` is an independent frame rather than a slice
# of the loaded table; adding columns to it later then no longer triggers
# pandas' SettingWithCopyWarning. A KeyError is raised here if any selected
# column is missing from the loaded table.
data = data[selected_columns].copy()

In [5]:
# Add a "<year>_<month>" purchase label (e.g. "2018_1") built from the two
# numeric purchase columns.
# DataFrame.assign returns a new frame instead of writing a column into the
# existing one, so this does not emit SettingWithCopyWarning even when `data`
# was obtained by selecting columns from another frame.
# astype(int) raises if either purchase column contains missing values.
data = data.assign(
    purchase_time=(
        data["purchase_yr_nbr"].astype(int).astype(str)
        + "_"
        + data["purchase_mth_nbr"].astype(int).astype(str)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["purchase_time"] = (


In [6]:
# Keep only rows whose record_days is at least 120, then drop the two raw
# purchase columns (they are already combined into purchase_time).
# NOTE(review): 120 days is a little over 17 weeks, while the commented-out
# reference constants in this notebook mention a 16-week minimum; confirm
# which threshold is intended.
data = (
    data.loc[data["record_days"] >= 120]
    .drop(columns=["purchase_yr_nbr", "purchase_mth_nbr"])
)

In [7]:
# Show the remaining column names after the selection, filter and drop above.
data.columns

Index(['id', 'yr_nbr', 'mth_nbr', 'week_nbr', 'hard_braking',
       'hard_acceleration', 'speeding_sum', 'day_mileage', 'gmqualty_model',
       'umf_xref_finc_gbl_trim', 'engn_size', 'est_hh_incm_prmr_cd',
       'purchaser_age_at_tm_of_purch', 'input_indiv_gndr_prmr_cd', 'tavg',
       'random_avg_traffic', 'record_days', 'mro', 'purchase_time'],
      dtype='object')

In [8]:
# Collapse the daily rows to one row per (id, yr_nbr, week_nbr), summing the
# driving-behaviour counters over the week.
#   - mth_nbr: month of the first row in the group
#   - mro: max, so the week is flagged 1 if any of its rows has mro = 1
#   - behaviour counters and mileage: weekly sum
#   - vehicle / driver attributes, purchase_time, record_days: first row's value
#   - tavg, random_avg_traffic: weekly mean
# NOTE(review): the displayed result contains a (yr_nbr=2018, week_nbr=1) group
# with mth_nbr=12, so weeks at a year boundary may be paired with a different
# year than the one the week number belongs to; confirm the grouping key.
weekly_level_sum = (
    data.groupby(["id", "yr_nbr", "week_nbr"])
    .agg(
        {
            "mth_nbr": "first",
            "mro": "max",
            **dict.fromkeys(
                [
                    "hard_braking",
                    "hard_acceleration",
                    "speeding_sum",
                    "day_mileage",
                ],
                "sum",
            ),
            **dict.fromkeys(
                [
                    "est_hh_incm_prmr_cd",
                    "purchaser_age_at_tm_of_purch",
                    "input_indiv_gndr_prmr_cd",
                    "gmqualty_model",
                    "umf_xref_finc_gbl_trim",
                    "engn_size",
                    "purchase_time",
                ],
                "first",
            ),
            "tavg": "mean",
            "record_days": "first",
            "random_avg_traffic": "mean",
        }
    )
    # Turn the group keys back into ordinary columns.
    .reset_index()
)
weekly_level_sum

Unnamed: 0,id,yr_nbr,week_nbr,mth_nbr,mro,hard_braking,hard_acceleration,speeding_sum,day_mileage,est_hh_incm_prmr_cd,purchaser_age_at_tm_of_purch,input_indiv_gndr_prmr_cd,gmqualty_model,umf_xref_finc_gbl_trim,engn_size,purchase_time,tavg,record_days,random_avg_traffic
0,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,1,12,0.0,1,1,0.0,6.203125,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,5.048431,596,12886.225115
1,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,3,1,0.0,1,0,4.0,32.968750,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.092748,596,14554.620499
2,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,4,1,0.0,39,5,21.0,319.812500,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.273847,596,14559.007102
3,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,5,1,0.0,60,12,33.0,541.687500,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.005342,596,14433.908044
4,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,6,2,0.0,50,10,46.0,352.406250,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,14.859090,596,14390.440682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3434757,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,48,11,0.0,30,4,9.0,485.750000,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,15.293022,499,9881.400949
3434758,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,49,12,0.0,13,1,6.0,195.640625,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,14.654773,499,9743.572901
3434759,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,50,12,0.0,13,2,7.0,456.000000,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,14.872987,499,9732.679965
3434760,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,51,12,0.0,7,0,1.0,497.359375,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,12.262551,499,10792.873524


In [9]:
# weekly_level_sum.to_csv(
#     "./Data/weekly_level_sum.csv", index=True
# )

In [10]:
# Collapse the daily rows to one row per (id, yr_nbr, week_nbr), averaging the
# driving-behaviour counters over the rows present in that week.
#   - mth_nbr: month of the first row in the group
#   - mro: max, so the week is flagged 1 if any of its rows has mro = 1
#   - behaviour counters and mileage: mean over the week's rows
#   - vehicle / driver attributes, purchase_time, record_days: first row's value
#   - tavg, random_avg_traffic: weekly mean
# NOTE(review): the displayed result contains a (yr_nbr=2018, week_nbr=1) group
# with mth_nbr=12, so weeks at a year boundary may be paired with a different
# year than the one the week number belongs to; confirm the grouping key.
weekly_level_mean = (
    data.groupby(["id", "yr_nbr", "week_nbr"])
    .agg(
        {
            "mth_nbr": "first",
            "mro": "max",
            **dict.fromkeys(
                [
                    "hard_braking",
                    "hard_acceleration",
                    "speeding_sum",
                    "day_mileage",
                ],
                "mean",
            ),
            **dict.fromkeys(
                [
                    "est_hh_incm_prmr_cd",
                    "purchaser_age_at_tm_of_purch",
                    "input_indiv_gndr_prmr_cd",
                    "gmqualty_model",
                    "umf_xref_finc_gbl_trim",
                    "engn_size",
                    "purchase_time",
                ],
                "first",
            ),
            "tavg": "mean",
            "record_days": "first",
            "random_avg_traffic": "mean",
        }
    )
    # Turn the group keys back into ordinary columns.
    .reset_index()
)
weekly_level_mean

Unnamed: 0,id,yr_nbr,week_nbr,mth_nbr,mro,hard_braking,hard_acceleration,speeding_sum,day_mileage,est_hh_incm_prmr_cd,purchaser_age_at_tm_of_purch,input_indiv_gndr_prmr_cd,gmqualty_model,umf_xref_finc_gbl_trim,engn_size,purchase_time,tavg,record_days,random_avg_traffic
0,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,1,12,0.0,1.000000,1.000000,0.000000,6.203125,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,5.048431,596,12886.225115
1,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,3,1,0.0,0.500000,0.000000,2.000000,16.484375,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.092748,596,14554.620499
2,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,4,1,0.0,6.500000,0.833333,3.500000,53.302083,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.273847,596,14559.007102
3,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,5,1,0.0,8.571429,1.714286,4.714286,77.383929,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,15.005342,596,14433.908044
4,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,2018,6,2,0.0,7.142857,1.428571,6.571429,50.343750,6.0,54.0,F,Regal,BASE_TRIM,2.0,2018_1,14.859090,596,14390.440682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3434757,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,48,11,0.0,5.000000,0.666667,1.500000,80.958333,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,15.293022,499,9881.400949
3434758,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,49,12,0.0,4.333333,0.333333,2.000000,65.213542,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,14.654773,499,9743.572901
3434759,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,50,12,0.0,2.600000,0.400000,1.400000,91.200000,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,14.872987,499,9732.679965
3434760,wrbCt8K1wrvDi8KtwrjCtMONwrvCrsKXwqbCqcKqwpzCnA==,2019,51,12,0.0,1.166667,0.000000,0.166667,82.893229,4.0,34.0,F,Envision,MIDLEVEL_TRIM,2.5,2018_1,12.262551,499,10792.873524


In [11]:
# weekly_level_mean.to_csv(
#     "./Data/weekly_level_mean.csv", index=True
# )

In [12]:
# For each vehicle model, list the distinct trim levels that occur in the
# filtered daily data.
car_trim_map = (
    data.groupby("gmqualty_model")["umf_xref_finc_gbl_trim"]
    .unique()
)
car_trim_map

gmqualty_model
ATS                                  [MIDLEVEL_TRIM, LUXURY_TRIM]
Acadia C1                                         [MIDLEVEL_TRIM]
Blazer                                            [MIDLEVEL_TRIM]
CT6                                                   [BASE_TRIM]
Camaro A                              [MIDLEVEL_TRIM, SPORT_TRIM]
Canyon                              [MIDLEVEL_TRIM, UPLEVEL_TRIM]
Colorado                                          [MIDLEVEL_TRIM]
Encore                                    [SPORT_TRIM, BASE_TRIM]
Envision                                          [MIDLEVEL_TRIM]
Equinox D2                                        [MIDLEVEL_TRIM]
Impala E                                          [MIDLEVEL_TRIM]
LaCrosse                 [BASE_TRIM, UPLEVEL_TRIM, MIDLEVEL_TRIM]
Malibu E2                           [UPLEVEL_TRIM, MIDLEVEL_TRIM]
Regal                    [MIDLEVEL_TRIM, UPLEVEL_TRIM, BASE_TRIM]
Sierra HD                                          [UPLEVEL_T