# Data Preprocessing

In [None]:
import pandas as pd
from tqdm import tqdm

# Enable tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()
# Relative path of the CSV file that is read into `data` below.
file_name = "./Data/mro_daily_clean.csv"

In [None]:
# Load the CSV into a DataFrame, using the file's first column as the index
# and pyarrow as the parsing engine.
data = pd.read_csv(file_name, index_col=0, engine="pyarrow")
# Display the loaded frame as the cell output.
data

In [None]:
# Collect the column labels once, then report them together with their count.
data_columns_name_lst = list(data.columns)
data_columns_lst_len = data.shape[1]
print("The columns of the data are: ", data_columns_name_lst)
print("The number of columns of the data are: ", data_columns_lst_len)

This dataset contains the following columns, which can be categorized into several groups:

1. Time-related Columns (4)
	* yr_nbr – Year of the event (e.g., 2023, 2024).
	* mth_nbr – Month of the event (ranging from 1 to 12).
	* week_nbr – Week number of the event (ranging from 1 to 52).
	* week_day – Day of the week when the event occurred (0 to 6 or 1 to 7, where 0 or 1 typically represents Monday).

2. Driving Behavior-related Columns (9)
	* hard_braking – Number or intensity of hard braking events.
	* mild_hb – Number or intensity of mild braking events.
	* hard_braking2 – Possibly another measure of hard braking events under different conditions.
	* harsh_hb – Number or intensity of harsh braking events.
	* very_harsh_hb – Number or intensity of very harsh braking events.
	* hard_acceleration – Number or intensity of hard acceleration events.
	* mild_ha – Number or intensity of mild acceleration events.
	* harsh_ha – Number or intensity of harsh acceleration events.
	* very_harsh_ha – Number or intensity of very harsh acceleration events.

3. Driver-related Columns (3)
	* est_hh_incm_prmr_cd – Estimated primary household income level or code of the driver.
	* purchaser_age_at_tm_of_purch – Age of the purchaser at the time of purchase.
	* input_indiv_gndr_prmr_cd – Gender code of the driver (e.g., M for male, F for female).

4. Vehicle Information-related Columns (4)
	* gmqualty_model – Vehicle model or quality classification.
	* umf_xref_finc_gbl_trim – Vehicle configuration or trim level.
	* engn_size – Engine size (likely in liters or cubic centimeters).
	* vin_nbr_id – Vehicle Identification Number (VIN).

5. Speed and Mileage-related Columns (3)
	* speeding_sum – Total number of speeding events or total distance driven while speeding.
	* speeding2_sum – Possibly another measure of speeding events (e.g., separating highway and urban driving).
	* day_mileage – Daily driving mileage (likely in kilometers or miles).

6. Service and Maintenance-related Columns (9)
	* service_days – Total number of days the vehicle received maintenance or service.
	* battery_dummy – Indicator for battery service (1 = serviced, 0 = not serviced).
	* brake_dummy – Indicator for brake service.
	* tire_dummy – Indicator for tire service.
	* lof_dummy – Indicator for lube, oil, and filter service.
	* wiper_dummy – Indicator for wiper service.
	* filter_dummy – Indicator for air or oil filter service.
	* others – Indicator for other types of service.
	* **mro – Maintenance, repair, and overhaul**, a bool target.

7. Record-related Columns (3)
	* id – Unique identifier for the record.
	* day_indicator – Indicator for the specific day of the record (e.g., 1 for current day).
	* record_days – Total number of days of recorded data.

8. Geographic Information-related Columns (4)
	* latitude1 – Latitude of the vehicle’s location.
	* longitude1 – Longitude of the vehicle’s location.
	* purchase_lat1 – Latitude of the purchase location.
	* purchase_lng1 – Longitude of the purchase location.

9. Purchase Time-related Columns (2)
	* purchase_yr_nbr – Year of vehicle purchase.
	* purchase_mth_nbr – Month of vehicle purchase.

10. External Environment-related Columns (2)
	* tavg – Average temperature (likely in degrees Celsius or Fahrenheit).
	* random_avg_traffic – Randomly sampled average traffic flow (may indicate traffic congestion level).

## Exploratory Data Analysis

In [None]:
# Count how many rows fall under each distinct `gmqualty_model` value.
data['gmqualty_model'].value_counts()

In [None]:
import plotly.express as px
import pandas as pd

# Count the rows per car model.
model_counts = data["gmqualty_model"].value_counts().reset_index()
# Name the columns explicitly: the default names produced by
# `value_counts().reset_index()` differ between pandas versions.
model_counts.columns = ["gmqualty_model", "count"]

# Express every count as a percentage of all rows.
total = model_counts["count"].sum()
model_counts["percentage"] = model_counts["count"] / total * 100

# `threshold` is in percent: models holding less than 0.5% of the rows are
# merged into a single "Other" slice to keep the chart readable.
threshold = 0.5
small_models = model_counts[model_counts["percentage"] < threshold]
large_models = model_counts[model_counts["percentage"] >= threshold]

# Aggregate the small categories into one "Other" row.
other_count = small_models["count"].sum()
other_row = pd.DataFrame(
    [["Other", other_count, other_count / total * 100]], columns=model_counts.columns
)
# Only append the "Other" row when something was actually merged into it;
# otherwise the chart would carry an empty 0% category.
if small_models.empty:
    final_df = large_models.reset_index(drop=True)
else:
    final_df = pd.concat([large_models, other_row], ignore_index=True)

# Donut-style pie chart; the percentage is shown on hover.
fig = px.pie(
    final_df,
    names="gmqualty_model",
    values="count",
    title="The percentage of the GM Qualty Model",
    hole=0.3,
    hover_data=["percentage"],
    labels={"percentage": "Percentage (%)"},
)


fig.update_layout(
    width=750,
    height=500,
    margin=dict(l=50, r=50, t=50, b=50)
)

# show the plot
fig.show()

In [None]:
# Average the `mro` column within each car model and expose the result as a
# two-column frame: `gmqualty_model`, `mro_ratio`.
mro_ratio = (
    data.groupby("gmqualty_model")["mro"]
    .mean()
    .rename("mro_ratio")
    .reset_index()
)
mro_ratio

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Bar plot of the MRO ratio for every gmqualty_model.
plt.figure(figsize=(12, 8))
# Draw the bars first: seaborn sets up the categorical ticks (and, depending
# on its version, the axis labels) while plotting, so any tick or label
# styling applied beforehand may be replaced.
sns.barplot(x="gmqualty_model", y="mro_ratio", data=mro_ratio)
# Style the axes once the final ticks exist.
plt.xticks(fontsize=10, rotation=45)
plt.grid()
plt.xlabel("GM Qualty Model")
plt.ylabel("MRO Ratio")
plt.title("MRO ratio of each gmqualty_model")
plt.show()

In [None]:
# Number of rows recorded for each car model over the full dataset.
gm_data_full = data.groupby("gmqualty_model").aggregate({"gmqualty_model": "count"})
# Number of rows per car model restricted to records where mro == 1.
# The filter has to be applied to `mro`; comparing `gmqualty_model` itself
# with 1 does not select the MRO records.
gm_mro_1 = (
    data[data["mro"] == 1]
    .groupby("gmqualty_model")
    .aggregate({"gmqualty_model": "count"})
)

In [None]:
# Keep only the records with mro == 1 and group them by car model.
data_mro_1 = data.loc[data["mro"].eq(1)]
data_mro_1_gmqualty_model = data_mro_1.groupby("gmqualty_model")
# Show the number of such records for every car model.
data_mro_1_gmqualty_model.agg({"gmqualty_model": "count"})

In [None]:
# Monthly sum of `mro` over all rows, with a "<year>-<month>" label
# (month left-padded to two characters) used as the x axis.
mro_trend = data.groupby(['yr_nbr', 'mth_nbr'])['mro'].sum().reset_index()
mro_trend['time'] = (
    mro_trend['yr_nbr'].astype(str)
    + '-'
    + mro_trend['mth_nbr'].astype(str).str.zfill(2)
)

# The same monthly sum, split by car model.
mro_trend_by_model = (
    data.groupby(['yr_nbr', 'mth_nbr', 'gmqualty_model'])['mro']
    .sum()
    .reset_index()
)
mro_trend_by_model['time'] = (
    mro_trend_by_model['yr_nbr'].astype(str)
    + '-'
    + mro_trend_by_model['mth_nbr'].astype(str).str.zfill(2)
)

# Two stacked panels sharing the x axis: overall on top, per model below.
fig, axes = plt.subplots(2, 1, figsize=(14, 12), sharex=True)
overall_ax, by_model_ax = axes

# Top panel: all rows together.
sns.lineplot(ax=overall_ax, data=mro_trend, x='time', y='mro', marker='o', color='b')
overall_ax.set_title('Overall MRO Trend Over Time')
overall_ax.set_ylabel('MRO Count')
overall_ax.grid(True)

# Bottom panel: one line per car model, legend placed outside the axes.
sns.lineplot(
    ax=by_model_ax,
    data=mro_trend_by_model,
    x='time',
    y='mro',
    hue='gmqualty_model',
    marker='o',
)
by_model_ax.set_title('MRO Trend Over Time by Car Model')
by_model_ax.set_ylabel('MRO Count')
by_model_ax.legend(title='Car Model', bbox_to_anchor=(1.05, 1), loc='upper left')
by_model_ax.grid(True)

# Tick rotation and x-axis label (applied through the pyplot interface).
plt.xticks(rotation=45)
plt.xlabel('Time (Year-Month)')

plt.tight_layout()
plt.show()

In [None]:
# Monthly mean of `mro` over all rows, tagged "Overall" so that it can share
# one frame with the per-model series.
mro_overall = (
    data.groupby(["yr_nbr", "mth_nbr"])["mro"]
    .mean()
    .reset_index()
    .assign(gmqualty_model="Overall")
)

# The same monthly mean, computed separately for every car model.
mro_model = (
    data.groupby(["yr_nbr", "mth_nbr", "gmqualty_model"])["mro"].mean().reset_index()
)

# Stack the overall series and the per-model series into one frame.
mro_combined = pd.concat([mro_overall, mro_model])

# "<year>-<month>" label (month left-padded to two characters) for the x axis.
mro_combined["time"] = (
    mro_combined["yr_nbr"].astype(str)
    + "-"
    + mro_combined["mth_nbr"].astype(str).str.zfill(2)
)

is_overall = mro_combined["gmqualty_model"] == "Overall"

plt.figure(figsize=(16, 9))
# Per-model lines: thinner and semi-transparent.
sns.lineplot(
    data=mro_combined[~is_overall],
    x="time",
    y="mro",
    hue="gmqualty_model",
    marker="o",
    linewidth=2,
    alpha=0.7,
)

# Overall line: thick and black, drawn after the per-model lines.
sns.lineplot(
    data=mro_combined[is_overall],
    x="time",
    y="mro",
    color="black",
    label="Overall",
    marker="o",
    linewidth=4,
)

plt.title("MRO Ratio Over Time (Overall & By Car Model)")
plt.xlabel("Time (Year-Month)")
plt.ylabel("MRO Ratio")
plt.xticks(rotation=45)
plt.legend(title="Car Model", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(True)
plt.show()

## Check Missing Values

In [None]:
# Number of missing entries in every column of `data`.
data.isna().sum()

## Discussion: Do We Need to Re-define MRO?

In [None]:
# Service-type indicator columns examined in this section.
service_dummy_columns = [
    "battery_dummy",
    "brake_dummy",
    "tire_dummy",
    "lof_dummy",
    "wiper_dummy",
    "filter_dummy",
    "others",
]

# Column total and column mean for every indicator. Assuming the indicators
# are 0/1 flags, the mean is the share of rows with the flag set.
dummy_sums = {column: data[column].sum() for column in service_dummy_columns}
dummy_proportions = {column: data[column].mean() for column in service_dummy_columns}

# Keep the individual `<column>_sum` names available for other cells.
battery_dummy_sum = dummy_sums["battery_dummy"]
brake_dummy_sum = dummy_sums["brake_dummy"]
tire_dummy_sum = dummy_sums["tire_dummy"]
lof_dummy_sum = dummy_sums["lof_dummy"]
wiper_dummy_sum = dummy_sums["wiper_dummy"]
filter_dummy_sum = dummy_sums["filter_dummy"]
others_sum = dummy_sums["others"]

# Report the total under a "sum" label and the mean under the "proportion"
# label, so that each printed value matches its description.
for column in service_dummy_columns:
    print(f"The sum of {column} is: ", dummy_sums[column])
    print(f"The proportion of {column} is: ", dummy_proportions[column])

In [None]:
# `mro_new` is the row-wise sum of the seven service-type columns, added one
# after another in the order listed here.
service_flag_columns = [
    "brake_dummy",
    "battery_dummy",
    "tire_dummy",
    "lof_dummy",
    "wiper_dummy",
    "filter_dummy",
    "others",
]
mro_new_total = data[service_flag_columns[0]]
for flag_column in service_flag_columns[1:]:
    mro_new_total = mro_new_total + data[flag_column]
data["mro_new"] = mro_new_total

In [None]:
# Summarise `mro_new`: how many distinct values it takes and how often each occurs.
mro_new_values = data['mro_new']
print('The number of mro_new is: ', mro_new_values.nunique())
print('The value counts of mro_new is:\n', mro_new_values.value_counts())

In [None]:
from matplotlib import pyplot as plt
from upsetplot import from_indicators, plot
import warnings

# Suppress FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Restrict the frame to the service-type columns shown in the UpSet plot.
mro_category_columns = [
    "battery_dummy",
    "brake_dummy",
    "tire_dummy",
    "lof_dummy",
    "wiper_dummy",
    "filter_dummy",
    "others",
]
dummy_data = data[mro_category_columns]

# Drop the rows in which no column is set, then cast the rest to boolean.
has_any_category = dummy_data.any(axis=1)
dummy_data = dummy_data.loc[has_any_category].astype(bool)

# Build the UpSet input from the boolean indicator columns and draw it.
upset_data = from_indicators(dummy_data.columns, dummy_data)
fig = plt.figure(figsize=(18, 12))
plot(
    upset_data,
    fig=fig,
    orientation="horizontal",
    element_size=None,
    facecolor="darkblue",
)
plt.title("UpSet plot of MRO categories")
plt.show()

## Data Transformation

Combine `purchase_yr_nbr` and `purchase_mth_nbr` into a new column, `purchase_time`.

In [None]:
# Build "<year>_<month>" from the two purchase columns; each part is cast to
# int before being rendered as text.
purchase_year_text = data["purchase_yr_nbr"].astype(int).astype(str)
purchase_month_text = data["purchase_mth_nbr"].astype(int).astype(str)
data["purchase_time"] = purchase_year_text + "_" + purchase_month_text

Robustness check: divide into `repair`/`maintenance`

In [None]:
import numpy as np

# TODO: the reason for this split is not documented - confirm why it is needed.
# An MRO row (mro == 1) counts as "maintenance" when service_days <= 3 and as
# "repair" when service_days > 3; every other row gets 0 in both columns.
has_mro = data["mro"] == 1
short_service = data["service_days"] <= 3
long_service = data["service_days"] > 3
data["maintenance"] = np.where(has_mro & short_service, 1, 0)
data["repair"] = np.where(has_mro & long_service, 1, 0)

Robustness check: `traffic density`

In [None]:
# For every row, the size of the group of rows that share its year, month,
# week, weekday and (latitude1, longitude1) values.
traffic_group_keys = [
    "yr_nbr",
    "mth_nbr",
    "week_nbr",
    "week_day",
    "latitude1",
    "longitude1",
]
grouped_by_time_and_place = data.groupby(traffic_group_keys)
data["traffic_count"] = grouped_by_time_and_place["yr_nbr"].transform("size")