In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, f1_score, recall_score,precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier

In [21]:
# Lift pandas' display truncation: show every column and every row of a frame.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [22]:
# Read the modeling table, using the first CSV column as the index.
# `df` is kept as loaded; all later filtering happens on the copy `df_new`.
DATA_PATH = '2month_new1.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df_new = df.copy()

In [23]:
# Preview the first rows of the working copy.
df_new.head()

Unnamed: 0,mro_new,est_hh_incm_prmr_cd,purchaser_age_at_tm_of_purch,input_indiv_gndr_prmr_cd,gmqualty_model,umf_xref_finc_gbl_trim,engn_size,purchase_time,mro_new_indicator_1,hard_braking_1,hard_acceleration_1,speeding_sum_1,day_mileage_1,tavg_1,random_avg_traffic_1,hard_braking_2,hard_acceleration_2,speeding_sum_2,day_mileage_2,tavg_2,random_avg_traffic_2
2,0.0,6.0,54.0,0,13,0,2.0,0,0.0,45.666667,4.666667,23.666667,325.572917,15.110922,0.240708,33.333333,5.666667,19.333333,298.15625,15.123979,0.245754
3,0.0,6.0,54.0,0,13,0,2.0,0,0.0,51.333333,7.666667,29.0,288.0625,15.106969,0.238053,45.666667,4.666667,23.666667,325.572917,15.110922,0.240708
4,0.0,6.0,54.0,0,13,0,2.0,0,0.0,51.666667,11.0,28.0,252.713542,15.050637,0.233105,51.333333,7.666667,29.0,288.0625,15.106969,0.238053
5,1.0,6.0,54.0,0,13,0,2.0,0,0.0,68.5,13.5,36.5,507.960935,15.497578,0.2276,51.666667,11.0,28.0,252.713542,15.050637,0.233105
6,0.0,6.0,54.0,0,13,0,2.0,0,1.0,64.333333,10.666667,27.0,371.135417,15.588944,0.225047,68.5,13.5,36.5,507.960935,15.497578,0.2276


### Robustness check

In [32]:
# Robustness check: keep only the ids whose `record_weeks` lies in (40, 70].
# The lookup table is bound to `id_records` rather than `id`, so the builtin
# `id()` is not shadowed for the rest of the kernel session.
# NOTE(review): the `df_new.head()` preview in this notebook lists no `id`
# column, so this cell presumably ran against a different extract of the
# data -- confirm the input before re-running.
id_records = pd.read_csv('../id_record_weeks.csv')
in_week_range = (id_records['record_weeks'] > 40) & (id_records['record_weeks'] <= 70)
id_lst = id_records.loc[in_week_range, 'id']
df_new = df_new[df_new['id'].isin(id_lst)]
df_new['id'].nunique()

26685

In [22]:
# Restrict the sample to ids whose `purchase_time` string starts with '2019'.
# `id_2018` is built the same way but is not used by this cell.
result_df_1 = pd.read_csv('../weekly_filter_new.csv')
purchase_year = result_df_1['purchase_time'].str[:4]
id_2018 = list(set(result_df_1.loc[purchase_year == '2018', 'id']))
id_2019 = list(set(result_df_1.loc[purchase_year == '2019', 'id']))
df_new = df_new.loc[df_new['id'].isin(id_2019)]
df_new['id'].nunique()

23965

In [20]:
# Within each id, replace `maintenance` by the value of the preceding row.
# The first row of every id has no predecessor and becomes NaN; `dropna`
# then removes every row that contains a NaN, which includes those rows.
df_new['maintenance'] = df_new.groupby('id')['maintenance'].shift(1)
df_new = df_new.dropna()

# Drop the grouping key together with all maintenance indicators except lag 1.
# `id` is gone after this cell, so any later cell that groups by `id` needs a
# freshly loaded `df_new`.
maintenance_indicator_cols = [
    f'maintenance_indicator_{lag}' for lag in (0, 2, 3, 4, 5, 6, 7)
]
df_new = df_new.drop(columns=['id'] + maintenance_indicator_cols)

In [5]:
# Within each id, replace `repair` by the value of the preceding row.
# The first row of every id has no predecessor and becomes NaN; `dropna`
# then removes every row that contains a NaN, which includes those rows.
df_new['repair'] = df_new.groupby('id')['repair'].shift(1)
df_new = df_new.dropna()

# Drop the grouping key together with all repair indicators except lag 1.
# This cell needs the `id` column, so it cannot run after a cell that has
# already dropped it.
repair_indicator_cols = [
    f'repair_indicator_{lag}' for lag in (0, 2, 3, 4, 5, 6, 7)
]
df_new = df_new.drop(columns=['id'] + repair_indicator_cols)

### Modeling

In [24]:
# Separate the binary target from the features and hold out 20% of the rows.
TARGET_COL = 'mro_new'
y = df_new[TARGET_COL].astype(int)
# Select the features by name instead of by position (`iloc[:, 1:]`): a
# positional slice silently keeps the target inside X whenever the target is
# not the first column of `df_new`.
X = df_new.drop(columns=[TARGET_COL]).astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 5-fold grid search over `num_leaves`, scored by F1 of the positive class.
# NOTE(review): this estimator uses random_state=12 while the models fitted
# later in the notebook use random_state=42 -- confirm this is intentional.
base_params = dict(
    class_weight='balanced',
    learning_rate=0.1,
    n_estimators=100,
    random_state=12,
    force_col_wise=True,
)
lgbm = LGBMClassifier(**base_params)

param_grid = {'num_leaves': [7, 15, 31]}
f1_scorer = make_scorer(f1_score)

grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1 score: {grid_search.best_score_}")

In [25]:
# Fit the classifier on all features and score it on the held-out split.
lgbm = LGBMClassifier(
    class_weight='balanced',
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
    num_leaves=31,
    force_col_wise=True,
)
lgbm.fit(X_train, y_train)

# Turn the positive-class probability into a hard label at the chosen cut-off.
threshold = 0.5
y_probs = lgbm.predict_proba(X_test)[:, 1]
pred = (y_probs >= threshold).astype(int)

y_true = np.array(y_test)
precision, recall, f1 = (
    metric(y_true, pred) for metric in (precision_score, recall_score, f1_score)
)
precision, recall, f1

[LightGBM] [Info] Number of positive: 88011, number of negative: 478676
[LightGBM] [Info] Total Bins 3082
[LightGBM] [Info] Number of data points in the train set: 566687, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


(0.20156987625352663, 0.6495634170492394, 0.30766606975356015)

## Exclude driving-behavior features (hard braking, hard acceleration, speeding)

In [26]:
# Remove every column whose name matches 'hard' or 'speed' from both splits.
X_train_2 = X_train.drop(columns=list(X_train.filter(regex='hard|speed')))
X_test_2 = X_test.drop(columns=list(X_test.filter(regex='hard|speed')))

In [27]:
# Same classifier settings as the full model, fitted without the 'hard*' /
# 'speed*' columns. The model gets its own name so that `lgbm` keeps pointing
# at the full-feature model: the permutation-importance cell further down
# passes `lgbm` together with the full `X_test`, which does not match the
# column set this reduced model is trained on.
lgbm_no_driving = LGBMClassifier(
    class_weight='balanced',
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
    num_leaves=31,
    force_col_wise=True,
)
lgbm_no_driving.fit(X_train_2, y_train)

# Turn the positive-class probability into a hard label at the chosen cut-off.
y_probs = lgbm_no_driving.predict_proba(X_test_2)[:, 1]
threshold = 0.5
pred = (y_probs >= threshold).astype(int)
precision = precision_score(np.array(y_test), pred)
recall = recall_score(np.array(y_test), pred)
f1 = f1_score(np.array(y_test), pred)
precision, recall, f1

[LightGBM] [Info] Number of positive: 88011, number of negative: 478676
[LightGBM] [Info] Total Bins 1692
[LightGBM] [Info] Number of data points in the train set: 566687, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


(0.20049181694406623, 0.6495184084976146, 0.3064036689456028)

## Permutation importance

In [None]:
def _default_feature_groups(n_lags=8):
    """Return the default permutation groups.

    Eight single-column groups come first, followed by one group per lagged
    signal. A lagged group lists ``<stem>_<lag>`` for ``lag`` in
    ``range(n_lags)``; the speeding group carries both the ``speeding_<lag>``
    and the ``speeding_sum_<lag>`` spelling.
    """
    single_columns = (
        "input_indiv_gndr_prmr_cd",
        "gmqualty_model",
        "umf_xref_finc_gbl_trim",
        "purchase_time",
        "est_hh_incm_prmr_cd",
        "purchaser_age_at_tm_of_purch",
        "engn_size",
        "mro_new_indicator_1",
    )
    lagged_stems = (
        ("tavg",),
        ("random_avg_traffic",),
        ("hard_braking",),
        ("hard_acceleration",),
        ("speeding", "speeding_sum"),
        ("day_mileage",),
    )

    groups = [[name] for name in single_columns]
    for stems in lagged_stems:
        groups.append([f"{stem}_{lag}" for stem in stems for lag in range(n_lags)])
    return groups


def calculate_permutation_importance(
    model,
    X,
    y,
    scoring=f1_score,
    n_repeats=1,
    random_state=42,
    average="binary",
    feature_groups=None,
):
    """Grouped permutation importance of ``model`` on ``(X, y)``.

    The columns of one group are shuffled together (rows stay intact across
    the group), the model is re-scored, and the drop from the baseline score
    is averaged over ``n_repeats`` shuffles.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X : pandas.DataFrame
        Feature matrix; its column order defines the order of the result.
    y : array-like
        True labels, passed as the first argument to ``scoring``.
    scoring : callable
        Called as ``scoring(y, predictions, average=average)``.
    n_repeats : int
        Number of shuffles per group; must be at least 1.
    random_state : int
        Seed of the private random generator that drives the shuffles.
    average : str
        Forwarded to ``scoring``.
    feature_groups : list of list of str, optional
        Groups of column names to permute jointly. Defaults to the groups
        built by ``_default_feature_groups``.

    Returns
    -------
    numpy.ndarray
        One value per column of ``X``. Every column of a group receives the
        whole group's importance; columns that belong to no group stay at 0.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")
    if feature_groups is None:
        feature_groups = _default_feature_groups()

    # A private generator yields the same shuffles as seeding numpy's global
    # generator with `random_state`, without altering global random state.
    rng = np.random.RandomState(random_state)

    baseline_score = scoring(y, model.predict(X), average=average)
    importances = np.zeros(len(X.columns))

    for group in feature_groups:
        # The group lists are a superset of what a given table contains, so
        # only the columns that actually exist in X are permuted.
        present = [col for col in group if col in X.columns]
        if not present:
            continue

        group_importance = 0.0
        for _ in range(n_repeats):
            X_permuted = X.copy()
            # Shuffle the rows of the whole group at once.
            shuffled_values = X_permuted[present].sample(frac=1, random_state=rng).values
            X_permuted[present] = shuffled_values

            # Score the model on the data with this group shuffled.
            permuted_score = scoring(y, model.predict(X_permuted), average=average)
            group_importance += baseline_score - permuted_score

        # The mean score drop is the importance of the group.
        group_importance /= n_repeats
        for col in present:
            importances[X.columns.get_loc(col)] = group_importance

    return importances

In [None]:
# Grouped permutation importance on the held-out split, rescaled to sum to 1.
# `lgbm` must be the model that was fitted on the same columns as `X_test`.
feature_importances = calculate_permutation_importance(
    lgbm, X_test, y_test, scoring=f1_score, average='binary'
)

if hasattr(X_train, 'columns'):
    feature_names = X_train.columns
else:
    feature_names = np.arange(X_train.shape[1])

fi = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
fi = fi.sort_values(by='Importance', ascending=False)

importance_total = fi['Importance'].sum()
fi['Importance'] = fi['Importance'] / importance_total
fi