In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, precision_recall_curve, f1_score, auc, recall_score,precision_score, fbeta_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
import random
from lightgbm import LGBMClassifier, LGBMRegressor
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder



In [2]:
# Lift pandas' column/row display limits so frames are rendered without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [8]:
# Source table for the whole notebook; the first CSV column is used as the row index.
# Presumably one row per vehicle and week (going by the file name) -- confirm.
WEEKLY_CSV = '../weekly_filter_new_2.csv'
df = pd.read_csv(WEEKLY_CSV, index_col=0)

In [6]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,id,mro_new,record_days,hard_braking,hard_acceleration,speeding_sum,day_mileage,est_hh_incm_prmr_cd,purchaser_age_at_tm_of_purch,input_indiv_gndr_prmr_cd,gmqualty_model,umf_xref_finc_gbl_trim,engn_size,purchase_time,tavg,random_avg_traffic
0,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,0.0,596,33.333333,5.666667,19.333333,298.15625,6.0,54.0,0,Regal,BASE_TRIM,2.0,2018-1,15.123979,0.245754
1,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,0.0,596,45.666667,4.666667,23.666667,325.572917,6.0,54.0,0,Regal,BASE_TRIM,2.0,2018-1,15.110922,0.240708
2,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,0.0,596,51.333333,7.666667,29.0,288.0625,6.0,54.0,0,Regal,BASE_TRIM,2.0,2018-1,15.106969,0.238053
3,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,0.0,596,51.666667,11.0,28.0,252.713542,6.0,54.0,0,Regal,BASE_TRIM,2.0,2018-1,15.050637,0.233105
4,w4HClcKnwrzCv8KgwrjDi8Klwr3Cm8KVwqfCrsKowprClg==,0.0,596,68.5,13.5,36.5,507.960935,6.0,54.0,0,Regal,BASE_TRIM,2.0,2018-1,15.497578,0.2276


In [12]:
# Aggregate to monthly level: one output row per (id, yr_nbr, mth_nbr).
# The plan is an ordered list because its order becomes the output column order.
#   'sum'   -> mro_new is added up over the month (capped to 0/1 in a later cell)
#   'first' -> first value seen in the group (per-vehicle / per-owner attributes)
#   'mean'  -> average of the driving, temperature and traffic signals
monthly_agg_plan = [
    ('mro_new', 'sum'),
    ('record_days', 'first'),
    ('hard_braking', 'mean'),
    ('hard_acceleration', 'mean'),
    ('speeding_sum', 'mean'),
    ('day_mileage', 'mean'),
    ('est_hh_incm_prmr_cd', 'first'),
    ('purchaser_age_at_tm_of_purch', 'first'),
    ('input_indiv_gndr_prmr_cd', 'first'),
    ('gmqualty_model', 'first'),
    ('umf_xref_finc_gbl_trim', 'first'),
    ('engn_size', 'first'),
    ('purchase_time', 'first'),
    ('tavg', 'mean'),
    ('random_avg_traffic', 'mean'),
]

df1 = (
    df.groupby(['id', 'yr_nbr', 'mth_nbr'])
      .agg(dict(monthly_agg_plan))
      .reset_index()
      # The calendar keys are only needed for grouping; rows stay in the
      # (id, yr_nbr, mth_nbr) order produced by the groupby.
      .drop(columns=['yr_nbr', 'mth_nbr'])
)

In [14]:
# mro_new was summed over each month, so it can exceed 1; cap it at 1.
df1['mro_new'] = df1['mro_new'].clip(upper=1)
# From here on the notebook works on `df`, now holding the monthly table.
df = df1.copy()

In [12]:
# December rows labelled week 1 are renumbered to week 53 so they land at the
# end of their own year (presumably ISO-style week numbering in the source -- confirm).
# NOTE(review): this cell needs 'mth_nbr' and 'week_nbr' on `df`; the monthly
# aggregation cell drops 'yr_nbr'/'mth_nbr', so the two cells look like
# alternative (weekly vs. monthly) paths -- confirm which one is meant to run.
december_week_one = (df['mth_nbr'] == 12) & (df['week_nbr'] == 1)
df.loc[december_week_one, 'week_nbr'] = 53
def get_date_from_year_week(year, week):
    """Return January 1st of ``year`` advanced by ``week - 1`` blocks of 7 days.

    Week 1 maps to January 1st itself and week 53 to day 365 of the year; this
    is a simple offset from New Year's Day, not an ISO-8601 week calculation.

    Parameters
    ----------
    year : int or integer-valued float
        Calendar year. Whole-number floats such as ``2018.0`` are accepted
        because pandas hands values over as floats when a row-wise ``apply``
        upcasts an all-numeric row or when the column contains NaN.
    week : int or float
        1-based week number.

    Returns
    -------
    datetime.datetime
        Midnight on the computed day.

    Raises
    ------
    ValueError
        If ``year`` is not a whole number (including NaN).
    """
    # datetime() only accepts true integers, so coerce first, but refuse to
    # silently truncate a fractional year.
    whole_year = int(year)
    if whole_year != year:
        raise ValueError(f"year must be a whole number, got {year!r}")
    return datetime(whole_year, 1, 1) + timedelta(days=(week - 1) * 7)
def _week_start_for_row(row):
    # Adapter so DataFrame.apply can feed one row's year/week to the date helper.
    return get_date_from_year_week(row['yr_nbr'], row['week_nbr'])


# One timestamp per row, then discard the calendar parts it was built from.
df['time'] = df.apply(_week_start_for_row, axis=1)
df = df.drop(columns=['yr_nbr', 'week_nbr', 'mth_nbr'])

In [16]:
# Columns replaced by integer codes; the fitted encoders are kept in
# `label_encoders` so the codes can be mapped back to the original values.
categorical_features = [
    'input_indiv_gndr_prmr_cd',
    'gmqualty_model',
    'umf_xref_finc_gbl_trim',
    'purchase_time',
]

# NOTE(review): LabelEncoder numbers values in sorted order; if purchase_time is
# a string like '2018-1', '2018-10' would sort before '2018-2', so the codes are
# presumably not chronological -- confirm whether that matters downstream.
label_encoders = {}
for col in categorical_features:
    encoder = LabelEncoder()
    encoder.fit(df[col])
    df[col] = encoder.transform(df[col])
    label_encoders[col] = encoder

In [17]:
time_window = 2    # number of lagged periods to build; alternatives noted: 4, 8, 16

# (source column, prefix of the lagged column), listed in the order the lagged
# columns are appended for each lag. Only mro_new gets a different prefix.
lagged_columns = [
    ('mro_new', 'mro_new_indicator'),
    ('hard_braking', 'hard_braking'),
    ('hard_acceleration', 'hard_acceleration'),
    ('speeding_sum', 'speeding_sum'),
    ('day_mileage', 'day_mileage'),
    ('tavg', 'tavg'),
    ('random_avg_traffic', 'random_avg_traffic'),
]

# Group once, on a frame holding just the id and the source columns, so that
# appending the new columns to `df` below cannot interfere with the grouping.
per_id = df[['id'] + [source_col for source_col, _ in lagged_columns]].groupby('id')

# `<prefix>_<i>` holds the value found i rows earlier within the same id (NaN
# for the first i rows of each id). GroupBy.shift is the built-in equivalent of
# transform(lambda x: x.shift(i)) without a Python call per group.
# NOTE(review): the lag is positional, so it equals "i periods ago" only if rows
# are in chronological order within each id and no period is missing -- confirm.
for i in range(1, time_window + 1):
    for source_col, lag_prefix in lagged_columns:
        df[f'{lag_prefix}_{i}'] = per_id[source_col].shift(i)

In [None]:
# Drop the current-period signals from `df`; their lagged copies
# (<name>_1, <name>_2, ...) remain.
current_period_signals = [
    'hard_braking', 'hard_acceleration', 'speeding_sum',
    'day_mileage', 'tavg', 'random_avg_traffic',
]
df = df.drop(columns=current_period_signals)

# Monthly variant: besides the bookkeeping columns, drop the second-lag target
# indicator so only mro_new_indicator_1 is kept.
# The weekly variant instead dropped 'time' and mro_new_indicator_2 through
# mro_new_indicator_7 at this step.
# NOTE(review): 'mro_new_indicator_2' is hard-coded for time_window = 2; with a
# larger window mro_new_indicator_3 and up would stay in df_new -- confirm intent.
df_new = df.drop(columns=['record_days', 'id', 'mro_new_indicator_2'])

In [19]:
# Discard every row that has at least one missing value (this includes each
# id's leading rows, whose lagged columns are NaN), then preview the result.
df_new = df_new.dropna(axis=0, how='any')
df_new.head(n=5)

Unnamed: 0,mro_new,est_hh_incm_prmr_cd,purchaser_age_at_tm_of_purch,input_indiv_gndr_prmr_cd,gmqualty_model,umf_xref_finc_gbl_trim,engn_size,purchase_time,mro_new_indicator_1,hard_braking_1,hard_acceleration_1,speeding_sum_1,day_mileage_1,tavg_1,random_avg_traffic_1,hard_braking_2,hard_acceleration_2,speeding_sum_2,day_mileage_2,tavg_2,random_avg_traffic_2
2,0.0,6.0,54.0,0,13,0,2.0,0,0.0,45.666667,4.666667,23.666667,325.572917,15.110922,0.240708,33.333333,5.666667,19.333333,298.15625,15.123979,0.245754
3,0.0,6.0,54.0,0,13,0,2.0,0,0.0,51.333333,7.666667,29.0,288.0625,15.106969,0.238053,45.666667,4.666667,23.666667,325.572917,15.110922,0.240708
4,0.0,6.0,54.0,0,13,0,2.0,0,0.0,51.666667,11.0,28.0,252.713542,15.050637,0.233105,51.333333,7.666667,29.0,288.0625,15.106969,0.238053
5,1.0,6.0,54.0,0,13,0,2.0,0,0.0,68.5,13.5,36.5,507.960935,15.497578,0.2276,51.666667,11.0,28.0,252.713542,15.050637,0.233105
6,0.0,6.0,54.0,0,13,0,2.0,0,1.0,64.333333,10.666667,27.0,371.135417,15.588944,0.225047,68.5,13.5,36.5,507.960935,15.497578,0.2276


In [20]:
# Persist the feature table; the row index is written as the first CSV column.
df_new.to_csv('2month_new1.csv', index=True)