# MRO Demand Prediction with LightGBM

In [1]:
# Load data using the pyarrow engine
import pandas as pd
file_name = "./Data/mro_daily_clean.csv"
data = pd.read_csv(file_name, index_col=0, engine="pyarrow")
data

Unnamed: 0,yr_nbr,mth_nbr,week_nbr,week_day,hard_braking,mild_hb,hard_braking2,harsh_hb,very_harsh_hb,est_hh_incm_prmr_cd,...,mro,record_days,latitude1,longitude1,purchase_lat1,purchase_lng1,purchase_yr_nbr,purchase_mth_nbr,tavg,random_avg_traffic
,,,,,,,,,,,,,,,,,,,,,
0,2019,3,13,7,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,14.346983,12472.338289
1,2019,4,14,4,9,0,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.670879,12410.618966
2,2019,4,14,6,9,1,0,0,0,6.0,...,0.0,70,44.8,-92.9,45.1,-93.2,2019.0,3.0,13.699830,12391.577959
3,2019,4,14,7,20,8,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.704561,12313.165404
4,2019,4,15,4,0,0,0,0,0,6.0,...,0.0,70,45.0,-93.1,45.1,-93.2,2019.0,3.0,13.884265,12342.054130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18018968,2019,11,48,5,2,1,0,0,0,9.0,...,1.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,12.700000,18816.070000
18018969,2019,12,49,5,1,0,0,0,0,9.0,...,0.0,269,42.4,-82.9,42.4,-82.9,2018.0,6.0,2.200000,6551.590000
18018970,2019,12,50,7,5,1,0,0,0,9.0,...,0.0,269,42.5,-82.9,42.4,-82.9,2018.0,6.0,-2.400000,15358.730000


Split the dataset into 2 part
* 90% for training set and 10% for validation

In [2]:
from sklearn.model_selection import train_test_split

unique_ids = data["id"].unique()

train_ids, val_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)

train_data = data[data["id"].isin(train_ids)]
val_data = data[data["id"].isin(val_ids)]

# Check the lengths of the train and validation sets
print("Length of train_ids:", len(train_ids))
print("Length of val_ids:", len(val_ids))

Length of train_ids: 77652
Length of val_ids: 8628


In [3]:
data.columns

Index(['yr_nbr', 'mth_nbr', 'week_nbr', 'week_day', 'hard_braking', 'mild_hb',
       'hard_braking2', 'harsh_hb', 'very_harsh_hb', 'est_hh_incm_prmr_cd',
       'purchaser_age_at_tm_of_purch', 'input_indiv_gndr_prmr_cd',
       'gmqualty_model', 'umf_xref_finc_gbl_trim', 'engn_size',
       'hard_acceleration', 'mild_ha', 'harsh_ha', 'very_harsh_ha',
       'vin_nbr_id', 'speeding_sum', 'speeding2_sum', 'day_mileage', 'id',
       'day_indicator', 'service_days', 'battery_dummy', 'brake_dummy',
       'tire_dummy', 'lof_dummy', 'wiper_dummy', 'filter_dummy', 'others',
       'mro', 'record_days', 'latitude1', 'longitude1', 'purchase_lat1',
       'purchase_lng1', 'purchase_yr_nbr', 'purchase_mth_nbr', 'tavg',
       'random_avg_traffic'],
      dtype='object')

## Feature Selection

In [4]:
# time features are not relavant for the model (I think)
time_info = ["yr_nbr", "mth_nbr", "week_nbr", "week_day"]
# driver behavior features are needed for the model
driver_behavior = [
    "hard_braking",
    "mild_hb",
    "hard_braking2",
    "harsh_hb",
    "very_harsh_hb",
    "hard_acceleration",
    "mild_ha",
    "harsh_ha",
    "very_harsh_ha",
]
# driver information features
driver_info = [
    "est_hh_incm_prmr_cd",
    "purchaser_age_at_tm_of_purch",
    "input_indiv_gndr_prmr_cd",
]
# vehicle information features
vehicle_info = [
    # "gmqualty_model",
    # "umf_xref_finc_gbl_trim",
    "engn_size",
    # "vin_nbr_id",
]
# speed and mileage features
speed_mileage = ["speeding_sum", "speeding2_sum", "day_mileage"]
# service and maintenance features (target variable)
service_maintenance = [
    # "service_days",
    # "battery_dummy",
    # "brake_dummy",
    # "tire_dummy",
    # "lof_dummy",
    # "wiper_dummy",
    # "filter_dummy",
    # "others",
    "mro",
]
# Record-related Columns
record_related = [
    # "id",
    "day_indicator",
    "record_days",
]

# Geographic information features (not relevant)
geo_info = [
    "latitude1",
    "longitude1",
    "purchase_lat1",
    "purchase_lng1",
]


# Purchase Time-related Columns (not relevant)
purchase_time = [
    "purchase_yr_nbr",
    "purchase_mth_nbr",
]

# External Environment-related Columns (used for prediction)
external_environment = [
    "tavg",
    "random_avg_traffic",
]

In [5]:
full_feature_list = (
    driver_behavior
    + driver_info
    + vehicle_info
    + speed_mileage
    + record_related
    + external_environment
)
target_variable = service_maintenance

In [6]:
import lightgbm as lgb

X_train = train_data[full_feature_list]
y_train = train_data[target_variable]
X_valid = val_data[full_feature_list]
y_valid = val_data[target_variable]

train_dataset = lgb.Dataset(X_train, label=y_train)
valid_dataset = lgb.Dataset(X_valid, label=y_valid, reference=train_dataset)

In [7]:
params = {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": -1,
    "verbose": -1,
}

In [9]:
num_round = 10
bst = lgb.train(params, train_dataset, num_round, valid_sets=[valid_dataset])

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: input_indiv_gndr_prmr_cd: object