In [None]:
import numpy as np, pandas as pd, os, warnings

# Notebook-wide display / warning settings.
pd.set_option("display.max_columns", 100)
# NOTE(review): this silences every warning for the rest of the notebook.
warnings.filterwarnings("ignore")

# Column-name shorthands used throughout the notebook.
mbd  = "microbusiness_density"
act  = "active"

# Training history plus the later-revealed test months (`data`).
train  = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv")
data   = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/revealed_test.csv")

# Parse the month column to datetime before concatenating.
train["first_day_of_month"] = pd.to_datetime(train["first_day_of_month"])
data["first_day_of_month"] = pd.to_datetime(data["first_day_of_month"])

# `train` now holds both files; `data` is kept and reused when building submissions.
train  = pd.concat([train, data])

# Rows to forecast.
test = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv")

# The date is taken from the last 10 characters of row_id.
test['first_day_of_month']  = pd.to_datetime(test["row_id"].str[-10:])

# Sample submission; cfips is the integer before the first "_" in row_id.
submit = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv')
submit["cfips"] = submit.row_id.apply(lambda x: int(x.split("_")[0]))

# County coordinates (merged on cfips later).
coords = pd.read_csv("/kaggle/input/usa-counties-coordinates/cfips_location.csv")

# Census starter table (merged on cfips later).
census = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/census_starter.csv")

# County areas: state and county FIPS are read as strings so they can be
# concatenated into one integer cfips key.
county_data = pd.read_csv('/kaggle/input/us-county-areas/us-county-areas.csv',dtype={'statefp':str,'countyfp':str})
county_data['cfips'] = (county_data['statefp'] + county_data['countyfp']).astype(int)

# State corporate tax rates, restricted to 2019 onwards.
tax = pd.read_csv('/kaggle/input/godaddy-metadata/clean_data/state_corporate_tax_rates.csv')
tax = tax.loc[tax["year"]>=2019].reset_index(drop=True)

# State employment; the date column is replaced by year / month columns.
employment = pd.read_csv("/kaggle/input/godaddy-metadata/clean_data/state_employment.csv")

employment["first_day_of_month"] = pd.to_datetime(employment["first_day_of_month"])
employment["year"]  = employment["first_day_of_month"].dt.year
employment["month"] = employment["first_day_of_month"].dt.month

del employment["first_day_of_month"]

# County rent estimates.
estimates = pd.read_csv("/kaggle/input/godaddy-metadata/clean_data/county_rent_estimates.csv")

# County population estimates; cfips is rebuilt as STATE * 1000 + COUNTY.
co_est = pd.read_csv("/kaggle/input/us-indicator/co-est2021-alldata.csv", encoding='latin-1')
co_est["cfips"] = co_est.STATE*1000 + co_est.COUNTY

# Google Trends series (loaded here; not merged in the cells shown).
trends = pd.read_csv("/kaggle/input/microbusiness-related-google-trends/google-trends-microbusiness-smallbusiness-internetbusiness-sidejob.csv")

# Calendar parts used as join keys / selectors later on.
train["year"]  = train["first_day_of_month"].dt.year
train["month"] = train["first_day_of_month"].dt.month

test["year"]   = test["first_day_of_month"].dt.year
test["month"]  = test["first_day_of_month"].dt.month

# Order each county's history chronologically.
train = train.sort_values(['cfips','first_day_of_month']).reset_index(drop=True)

# Drop the 2022 test rows from `test`
# (presumably because revealed_test.csv already covers them — TODO confirm).
test  = test.sort_values(['cfips','first_day_of_month'])
test = test[test["year"] != 2022].reset_index(drop=True)
# --------- population ---------
def my_round(val, digit=0):
    """Round `val` to `digit` decimals with ties going up (toward +inf).

    Unlike the built-in `round`, which rounds ties to the nearest even value,
    0.5 becomes 1.0 and 2.5 becomes 3.0. Works element-wise on anything that
    supports `*`, `+`, `//` and `/` (scalars, numpy arrays, pandas Series).
    """
    scale = 10 ** digit
    # floor((2x + 1) / 2) == floor(x + 0.5): the classic round-half-up trick.
    doubled_plus_one = val * scale * 2 + 1
    return doubled_plus_one // 2 / scale

pop = "population"

# Back out a population figure from the two published columns:
# population = active / density * 100, rounded half-up to an integer value.
# Rows where density is 0 give a non-finite result and are patched below.
train[pop] = my_round( train[act] / train[mbd] * 100 )

# ------------------------------

# Issaquena County: active 0
# Loving County: active 0

# Issaquena gets a fixed population for every row; Loving back-fills its
# missing values from the next available row.
# NOTE(review): the filter is on county name only — confirm these names are
# unique across states.
train.loc[train["county"]=="Issaquena County", pop] = 1162.0
train.loc[train["county"]=="Loving County", pop] = train.loc[train["county"]=="Loving County", pop].bfill()

# Extra county-level tables (loaded here; not used in the cells shown).
unemployment = pd.read_excel('/kaggle/input/unemployment-rate-by-county/Unemployment.xlsx')
poverty      = pd.read_excel('/kaggle/input/county-level-poverty/PovertyEstimates.xlsx')
education    = pd.read_excel('/kaggle/input/county-level-education-us/Education.xlsx')

# --------- SMAPE ---------

def smape(true, pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Each pair contributes 2 * |pred - true| / (|true| + |pred|) * 100.
    A pair where both values are exactly zero contributes 0 rather than the
    NaN that 0 / 0 would produce, so one all-zero row no longer turns the
    whole score into NaN.

    Parameters
    ----------
    true, pred : array-like of equal length
        Actual and predicted values. They are compared positionally; a pandas
        index is ignored.

    Returns
    -------
    float
        Mean of the per-pair errors.
    """
    true_arr = np.asarray(true, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)

    numer = 2 * np.abs(pred_arr - true_arr)
    denom = np.abs(true_arr) + np.abs(pred_arr)

    # Divide only where the denominator is non-zero; the rest stay at 0.
    ratio = np.divide(numer, denom, out=np.zeros_like(numer), where=denom != 0)

    return 1 / len(true) * np.sum(ratio * 100)

train.shape, test.shape

In [None]:
# Columns read from the ACS S0101 tables.
# NOTE(review): S0101_C01_026E is presumably the adult (18+) population
# estimate, judging by the `adult2020` / `adult2021` names — confirm against
# the table's column dictionary.
COLS = ['GEO_ID','NAME','S0101_C01_026E']

df2020 = pd.read_csv('/kaggle/input/census-data-for-godaddy/ACSST5Y2020.S0101-Data.csv',usecols=COLS)
df2021 = pd.read_csv('/kaggle/input/census-data-for-godaddy/ACSST5Y2021.S0101-Data.csv',usecols=COLS)

# Skip the first data row (presumably a second, descriptive header row).
df2020 = df2020.iloc[1:]
df2021 = df2021.iloc[1:]

df2020['S0101_C01_026E'] = df2020['S0101_C01_026E'].astype('int')
df2021['S0101_C01_026E'] = df2021['S0101_C01_026E'].astype('int')

# cfips is the integer that follows "US" in GEO_ID.
df2020["cfips"] = df2020["GEO_ID"].apply(lambda x : int(x.split("US")[-1]))
df2021["cfips"] = df2021["GEO_ID"].apply(lambda x : int(x.split("US")[-1]))

# cfips -> estimate lookups, reused later for `raw` and for the submissions.
adult2020 = df2020.set_index('cfips').S0101_C01_026E.to_dict()
adult2021 = df2021.set_index('cfips').S0101_C01_026E.to_dict()

submit["adult2020"] = submit.cfips.map(adult2020)
submit["adult2021"] = submit.cfips.map(adult2021)

display(submit.head(1))

In [None]:
# Flag the origin of each row before stacking train and test.
train['istest'] = 0
test ['istest'] = 1

# One long frame, ordered by county and then row_id.
# NOTE(review): sorting on the row_id string is chronological only if it ends
# in an ISO date — this matches how row_id is parsed elsewhere in this file.
raw = pd.concat((train, test)).sort_values(['cfips','row_id']).reset_index(drop=True)

# census
raw = raw.merge(census, on='cfips', how='left')

# coords
raw = raw.merge(coords.drop("name", axis=1), on="cfips", how="left")

# co_est
raw = raw.merge(co_est, on="cfips", how="left")

# The remaining side tables are loaded earlier but their merges are disabled.
# # county_data
# raw = raw.merge(county_data, on="cfips", how="left")

# # tax
# raw = raw.merge(tax, on=["state", "year"], how="left")

# # estimates
# raw = raw.merge(estimates, on=["cfips", "year"], how="left")

# # employment
# raw = raw.merge(employment, on=["state","year","month"], how="left")

# Per-county ACS estimates from the adult2020 / adult2021 lookups.
raw["adult2020"] = raw.cfips.map(adult2020)
raw["adult2021"] = raw.cfips.map(adult2021)

In [None]:
# Carry county / state names forward within each cfips so that rows lacking
# them inherit the value from the preceding row.
raw["county"] = raw.groupby("cfips")["county"].ffill()
raw["state"]  = raw.groupby("cfips")["state"].ffill()
# 0-based position of each row within its county, in the current row order.
# Later cells use it as the time-step index.
raw["dcount"] = raw.groupby("cfips")["county"].cumcount()

# Integer codes for (county, state) pairs and for states.
raw["county_i"] = (raw['county'] + raw['state']).factorize()[0]
raw["state_i"]  = raw["state"].factorize()[0]

In [None]:
# Fill missing census values with a state-level mean.
# NOTE(review): each mask below selects EVERY null in the column, not only
# the cfips named in the comment — confirm that county is the only one with a
# missing value.

# cfips: 35039
mean = raw[raw["state"]=="New Mexico"]["pct_it_workers_2018"].mean()
raw.loc[raw["pct_it_workers_2018"].isnull(), "pct_it_workers_2018"] = mean

mean = raw[raw["state"]=="New Mexico"]["median_hh_inc_2018"].mean()
raw.loc[raw["median_hh_inc_2018"].isnull(), "median_hh_inc_2018"] = mean

# cfips: 48243
mean = raw[raw["state"]=="Texas"]["median_hh_inc_2020"].mean()
raw.loc[raw["median_hh_inc_2020"].isnull(), "median_hh_inc_2020"] = mean

# cfips: 48301
mean = raw[raw["state"]=="Texas"]["median_hh_inc_2021"].mean()
raw.loc[raw["median_hh_inc_2021"].isnull(), "median_hh_inc_2021"] = mean

# For each row pick the census column dated two years before the row's year
# (2019 -> *_2017, ..., 2023 -> *_2021).
conditions = [raw['year']==2019,raw['year']==2020,raw['year']==2021,
              raw['year']==2022,raw['year']==2023]

choices_bb      = [raw['pct_bb_2017'],raw['pct_bb_2018'],raw['pct_bb_2019'],
                   raw['pct_bb_2020'],raw['pct_bb_2021']]

choices_college = [raw['pct_college_2017'],raw['pct_college_2018'],
                   raw['pct_college_2019'],raw['pct_college_2020'],
                   raw['pct_college_2021']]

choices_foreign = [raw['pct_foreign_born_2017'],raw['pct_foreign_born_2018'],
                   raw['pct_foreign_born_2019'],raw['pct_foreign_born_2020'],
                   raw['pct_foreign_born_2021']]

choices_workers = [raw['pct_it_workers_2017'],raw['pct_it_workers_2018'],
                   raw['pct_it_workers_2019'],raw['pct_it_workers_2020'],
                   raw['pct_it_workers_2021']]

choices_inc     = [raw['median_hh_inc_2017'],raw['median_hh_inc_2018'],
                   raw['median_hh_inc_2019'],raw['median_hh_inc_2020'],
                   raw['median_hh_inc_2021']]

# np.select falls back to its default of 0 for rows whose year matches none
# of the five conditions.
raw["pct_bb"]      = np.select(conditions, choices_bb)
raw["pct_college"] = np.select(conditions, choices_college)
raw["pct_foreign"] = np.select(conditions, choices_foreign)
raw["pct_workers"] = np.select(conditions, choices_workers)
raw["pct_inc"]     = np.select(conditions, choices_inc)

In [None]:
# Forward relative-change targets for horizons 1..5, built per county:
#   <prefix>_target<h> = value(t + h) / value(t) - 1
# All mbd_* columns are created first, then all act_* columns.
for prefix, source in (("mbd", mbd), ("act", act)):
    for i in range(1, 6):
        future = raw.groupby('cfips')[source].shift(-i)
        raw[f"{prefix}_target{i}"] = future / raw[source] - 1

In [None]:
from tqdm import tqdm

# Level-shift smoothing of each county's density history.
# NOTE(review): this overwrites raw[mbd] AFTER the *_target columns were
# computed from the unsmoothed values in an earlier cell — confirm that this
# ordering is intended.
outliers = []   # cfips of counties that had at least one adjustment
cnt = 0         # total number of adjustments across all counties
for o in tqdm(raw.cfips.unique()):
    indices = (raw['cfips']==o)
    tmp = raw.loc[indices].copy().reset_index(drop=True)
    var = tmp[mbd].values.copy()
    
    # Walk backwards from step 40 to step 1. Whenever the month-over-month
    # jump is at least 20% of the mean of the earlier values, rescale all
    # earlier values so that step i-1 equals step i.
    # Assumes every county has at least 41 rows (var[40] must exist).
    for i in range(40, 0, -1):
        thr = 0.20 * np.mean(var[:i])
        difa = abs(var[i]-var[i-1])
        if ( difa>=thr ):
            var[:i] *= (var[i]/var[i-1])
            outliers.append(o)
            cnt+=1
    # The first observation is always replaced by 99% of the second.
    var[0] = var[1] * 0.99
    raw.loc[indices, mbd] = var
    
outliers = np.unique(outliers)
len(outliers), cnt

In [None]:
# 48269 (King County, large population): zero out all targets for these rows.
# NOTE(review): rows are selected by hard-coded index labels 125448..125450
# (inclusive); this depends on the exact row order of `raw` — verify that the
# labels still point at cfips 48269.
for i in range(1, 6):
    raw.loc[125448:125450, f"mbd_target{i}"] = 0.0
    raw.loc[125448:125450, f"act_target{i}"] = 0.0

In [None]:
# 48301 (Loving County, extremely small population): the mbd data is
# intermittent, so zero out all targets for these rows.
# NOTE(review): rows are selected by hard-coded index labels 125461..125462
# (inclusive); this depends on the exact row order of `raw` — verify that the
# labels still point at cfips 48301.
for i in range(1, 6):
    raw.loc[125461:125462, f"mbd_target{i}"] = 0.0
    raw.loc[125461:125462, f"act_target{i}"] = 0.0

In [None]:
# 28055 (Issaquena County, small population): zero out every target for the
# whole county.
for i in range(1, 6):
    raw.loc[raw['cfips']==28055, f"mbd_target{i}"] = 0.0
    raw.loc[raw['cfips']==28055, f"act_target{i}"] = 0.0

In [None]:
%%time
def build_features(raw, lags=1):
    """Add per-county lag and rolling-sum features to `raw` in place.

    Columns created (prefix is "mbd" or "act"):
      * `<prefix>_lag_<k>` for k in 1..lags-1: the value k rows earlier within
        the same cfips.
      * `<prefix>_roll_target<h>_<w>` for h in 1..5 and w in {2,4,6,8,10}:
        rolling sum (min_periods=1) of `<prefix>_target<h>`.
      * `<prefix>_roll<k>_<w>` for k in 1..min(5, lags-1): rolling sum of
        `<prefix>_lag_<k>`.

    Previously the last group was built for k in 1..5 regardless of `lags`,
    so any call with lags < 6 (including the default) raised KeyError on a
    lag column that had never been created. It is now limited to the lags
    that exist; with lags >= 6 the output is unchanged.

    NOTE(review): the rolling window includes the current row, and
    `<prefix>_target<h>` at row t is built from the value at t + h, so the
    `*_roll_target*` features contain the row's own target wherever it is
    known. Confirm this look-ahead is intended before trusting CV scores.

    Parameters
    ----------
    raw : pandas.DataFrame
        Must contain `cfips`, the `mbd` / `act` columns and the
        `mbd_target1..5` / `act_target1..5` columns.
    lags : int, default 1
        Exclusive upper bound on the lag features to create.

    Returns
    -------
    pandas.DataFrame
        The same `raw` object, with the new columns added.
    """
    sources = (("mbd", mbd), ("act", act))
    windows = (2, 4, 6, 8, 10)

    def _rolling_sum(column, window):
        # Per-county rolling sum that tolerates short / partially-NaN windows.
        return raw.groupby("cfips")[column].transform(
            lambda s: s.rolling(window, min_periods=1).sum())

    # Plain lags: all mbd lags first, then all act lags.
    for prefix, source in sources:
        for lag in range(1, lags):
            raw[f"{prefix}_lag_{lag}"] = raw.groupby("cfips")[source].shift(lag)

    for lag in range(1, 6):
        for prefix, _ in sources:
            for window in windows:
                raw[f"{prefix}_roll_target{lag}_{window}"] = _rolling_sum(f"{prefix}_target{lag}", window)

        # Lag-based rolling sums exist only for lags that were created above.
        if lag >= lags:
            continue
        for prefix, _ in sources:
            for window in windows:
                raw[f"{prefix}_roll{lag}_{window}"] = _rolling_sum(f"{prefix}_lag_{lag}", window)

    return raw

raw = build_features(raw, lags=6)

raw.head(3)

In [None]:
# Broadcast each county's value at time steps 30..40 onto every row of that
# county, as columns dcount<ts>_mbd followed by dcount<ts>_act.
for suffix, source in (("mbd", mbd), ("act", act)):
    for ts in range(30, 41):
        at_step = raw.loc[raw.dcount==ts]
        dt = at_step.groupby('cfips')[source].agg('last')
        raw[f'dcount{ts}_{suffix}'] = raw['cfips'].map(dt)

In [None]:
def rot(df):
    """Add angle-mixed coordinate features to `df` in place and return it.

    For each angle in 15, 30 and 45 degrees two columns are written:
        rot_<angle>_x = cos(angle) * lat + sin(angle) * lng
        rot_<angle>_y = cos(angle) * lat - sin(angle) * lng

    `df` must have `lat` and `lng` columns.
    """
    for angle in [15, 30, 45]:
        theta = np.radians(angle)
        cos_t, sin_t = np.cos(theta), np.sin(theta)

        lat_term = cos_t * df['lat']
        lng_term = sin_t * df['lng']

        df[f'rot_{angle}_x'] = lat_term + lng_term
        df[f'rot_{angle}_y'] = lat_term - lng_term

    return df

raw = rot(raw)

In [None]:
raw.columns[:100]

In [None]:
raw.columns[100:200]

In [None]:
raw.columns[200:]

In [None]:
def _horizon_features(horizon):
    """Return the model input columns for forecasting `horizon` steps ahead.

    Every horizon shares the id columns, the five mbd / act lags and the five
    act targets; only the rolling-sum columns are specific to the horizon.
    """
    windows = (2, 4, 6, 8, 10)

    names = ['cfips', 'county_i', 'state_i']
    names += [f'mbd_lag_{lag}' for lag in range(1, 6)]
    names += [f'act_lag_{lag}' for lag in range(1, 6)]
    names += [f'act_target{step}' for step in range(1, 6)]

    for stem in ('mbd_roll_target', 'act_roll_target', 'mbd_roll', 'act_roll'):
        names += [f'{stem}{horizon}_{window}' for window in windows]

    return names


# One independent list per horizon (1..5).
features1, features2, features3, features4, features5 = (
    _horizon_features(horizon) for horizon in range(1, 6))

In [None]:
# Wider candidate feature list: ids, act targets, the per-year census columns,
# co-est population columns, coordinates and the rot_* columns.
# It is not referenced by any training cell shown in this file.
all_features = ['cfips','county_i','state_i',
                
                
                'act_target1','act_target2','act_target3','act_target4','act_target5',
                'pct_bb_2017', 'pct_bb_2018', 'pct_bb_2019', 'pct_bb_2020', 'pct_bb_2021',
                'pct_college_2017','pct_college_2018','pct_college_2019','pct_college_2020','pct_college_2021',
                'pct_foreign_born_2017','pct_foreign_born_2018','pct_foreign_born_2019','pct_foreign_born_2020','pct_foreign_born_2021',
                'pct_it_workers_2017','pct_it_workers_2018','pct_it_workers_2019','pct_it_workers_2020','pct_it_workers_2021',
                'median_hh_inc_2017','median_hh_inc_2018','median_hh_inc_2019','median_hh_inc_2020','median_hh_inc_2021',
                'SUMLEV','DIVISION','ESTIMATESBASE2020','POPESTIMATE2020','POPESTIMATE2021','NPOPCHG2020',
                'NPOPCHG2021','BIRTHS2020','BIRTHS2021','DEATHS2020','DEATHS2021','NATURALCHG2020',
                'NATURALCHG2021','INTERNATIONALMIG2020','INTERNATIONALMIG2021','DOMESTICMIG2020',
                'DOMESTICMIG2021','NETMIG2020','NETMIG2021','RESIDUAL2020','RESIDUAL2021',
                'GQESTIMATESBASE2020','GQESTIMATES2020','GQESTIMATES2021','RBIRTH2021','RDEATH2021',
                'RNATURALCHG2021','RINTERNATIONALMIG2021','RDOMESTICMIG2021','RNETMIG2021',
                'lng','lat',
                'rot_15_x','rot_15_y','rot_30_x','rot_30_y','rot_45_x','rot_45_y']

# Thresholds on `active` count and density. In the cells shown they appear
# only inside commented-out filters.
ACT_THR = 140
ABS_THR = 0

In [None]:
def training(df, idx, ts_idx, i, model):
    """Fit `model` for horizon `i + 1` and predict the rows in `ts_idx`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and `mbd_target<i+1>`.
    idx : boolean mask / index
        Rows used for fitting.
    ts_idx : boolean mask / index
        Rows to predict.
    i : int
        Zero-based horizon (0..4); selects `features1`..`features5`.
    model : estimator
        Object with `fit(X, y, verbose=...)` and `predict(X)`.

    Returns
    -------
    Whatever `model.predict` returns, or None when `i` is outside 0..4.
    """
    feature_sets = {0: features1, 1: features2, 2: features3,
                    3: features4, 4: features5}
    if i not in feature_sets:
        return None

    columns = feature_sets[i]
    target = f"mbd_target{i+1}"

    model.fit(df.loc[idx, columns], df.loc[idx, target], verbose=-1)
    return model.predict(df.loc[ts_idx, columns])

In [None]:
def training_add(df, idx, ts_idx, i, model):
    """Fit `model` on a x1000-scaled target and return unscaled predictions.

    The relative-change targets are multiplied by `add` (1000) before
    fitting. Predictions are divided by the same factor before being
    returned, so the result is on the same scale as `mbd_target<i+1>` and can
    be used as `pred + 1` by the caller. Previously the scaled predictions
    were returned as-is, making every forecast 1000x too large.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and `mbd_target<i+1>`.
    idx : boolean mask / index
        Rows used for fitting.
    ts_idx : boolean mask / index
        Rows to predict.
    i : int
        Zero-based horizon (0..4); selects `features1`..`features5`.
    model : estimator
        Object with `fit(X, y, verbose=...)` and `predict(X)`.

    Returns
    -------
    numpy.ndarray of predicted relative changes, or None when `i` is outside
    0..4.
    """
    add = 1000

    feature_sets = {0: features1, 1: features2, 2: features3,
                    3: features4, 4: features5}
    if i not in feature_sets:
        return None

    columns = feature_sets[i]
    target = f"mbd_target{i+1}"

    model.fit(df.loc[idx, columns], df.loc[idx, target] * add, verbose=-1)

    # Undo the target scaling so callers receive plain relative changes.
    return np.asarray(model.predict(df.loc[ts_idx, columns])) / add

# check cv

In [None]:
%%time
import lightgbm as lgb

# Hold-out check: forecast steps 36..40 from the features at step 35 and
# score each horizon with SMAPE against the known values.
scores = []

params = {
    'num_iterations': 500,
    'boosting_type': 'dart',
    'objective': 'l1',
    'random_state': 2023,
    'learning_rate': 0.0399361536271749}


for i in range(5):
    print("predict: ", i+36, f" (use ts {35})")
    print("training_data: until",  (34-i))
    # Train on steps 1..(34 - i) so that the (i + 1)-step-ahead target of the
    # last training row does not reach past step 35.
    train_indices = (raw.istest == 0) & (raw.dcount <= (34-i)) & (raw.dcount >= 1) # & (raw[f'dcount{ts}_act']>ACT_THR)  & (raw[f'dcount{ts}_mbd']>ABS_THR)
    test_indices  = (raw.dcount == 35)

    model = lgb.LGBMRegressor(**params)
    
    pred = training(raw, train_indices, test_indices, i, model)

    # Convert the predicted relative change into a density level:
    # density(35) * (1 + predicted change).
    raw.loc[test_indices, 'pred'] = pred + 1.
    raw.loc[test_indices, 'pred'] = raw.loc[test_indices, mbd] * raw.loc[test_indices, 'pred']
    
    # -----------------------------------
    # cfips -> density at step 35, and cfips -> forecast made from step 35.
    ts_mbd = raw.loc[raw.dcount==35, ['cfips', mbd]].set_index('cfips').to_dict()[mbd]
    ts_pre = raw.loc[raw.dcount==35, ['cfips', 'pred']].set_index('cfips').to_dict()['pred']

    # Rows of the step being forecast, with the forecast attached by cfips.
    df = raw.loc[raw.dcount==(i+36), ['cfips', 'state_i', mbd]].reset_index(drop=True)
    df['previous_mbd'] = df['cfips'].map(ts_mbd)
    df['predict_mbd']  = df['cfips'].map(ts_pre)
    
    # df.loc[df[f'dcount{ts}_act']<=ACT_THR, 'predict_mbd'] = df.loc[df[f'dcount{ts}_act']<=ACT_THR, f'dcount{ts}_mbd']
    # df.loc[df[f'dcount{ts}_mbd']<=ABS_THR, 'predict_mbd'] = df.loc[df[f'dcount{ts}_mbd']<=ABS_THR, f'dcount{ts}_mbd']
    
    raw.loc[raw.dcount==(i+36), 'previous_mbd'] = df['previous_mbd'].values
    raw.loc[raw.dcount==(i+36), 'predict_mbd']  = df['predict_mbd'].values
    
    score = smape(df[mbd], df['predict_mbd'])
    print()
    print("SMAPE:", score)
    print()

    scores.append(score)

# NOTE(review): the feature lists include act_target* and *_roll_target*
# columns, which are derived from values later than the row itself; for the
# step-35 rows those values are known, so these scores may be optimistic.
# Mean SMAPE: 0.7859664428460815
print(f'Mean SMAPE: {np.mean(scores)}')

In [None]:
# predict:  36  (use ts 35)
# training_data: until 34

# SMAPE: 0.5805352506503692

# predict:  37  (use ts 35)
# training_data: until 33

# SMAPE: 0.7095039485603237

# predict:  38  (use ts 35)
# training_data: until 32

# SMAPE: 0.7175701765957819

# predict:  39  (use ts 35)
# training_data: until 31

# SMAPE: 0.8352400664767288

# predict:  40  (use ts 35)
# training_data: until 30

# SMAPE: 1.0688326966909572

# Mean SMAPE: 0.7823364277948321
# CPU times: user 47.5 s, sys: 594 ms, total: 48.1 s
# Wall time: 25.6 s

# INFERENCE

# lightgbm

In [None]:
%%time
import lightgbm as lgb

# Final LightGBM forecasts: predict steps 41..45 from the features at step 40.
# `dts` collects one {row_id: predicted density} dict per horizon.
dts = []

params = {
    'num_iterations': 3000,
    'boosting_type': 'dart',
    'objective': 'l1',
    'random_state': 2023,
    'learning_rate': 0.0399361536271749}

# 41-45(PB:41, PL:43-45)
for i in range(5):
    print("predict: ", i+41, f" (use ts {40})")
    print("training_data: until",  (39-i))
    # Train on steps 1..(39 - i) so that the (i + 1)-step-ahead target of the
    # last training row does not reach past step 40.
    train_indices = (raw.istest == 0) & (raw.dcount <= (39-i)) & (raw.dcount >= 1) # & (raw[f'dcount{ts}_act']>ACT_THR)  & (raw[f'dcount{ts}_mbd']>ABS_THR)
    test_indices  = (raw.dcount == 40)

    model = lgb.LGBMRegressor(**params)
    
    # The model must be passed explicitly: training_add takes it as its fifth
    # positional argument and the call raised TypeError without it.
    pred = training_add(raw, train_indices, test_indices, i, model)
    
    # Convert the predicted relative change into a density level:
    # density(40) * (1 + predicted change).
    raw.loc[test_indices, 'pred'] = pred + 1.
    raw.loc[test_indices, 'pred'] = raw.loc[test_indices, mbd] * raw.loc[test_indices, 'pred']
    
    # -----------------------------------
    # cfips -> density at step 40, and cfips -> forecast made from step 40.
    ts_mbd = raw.loc[raw.dcount==40, ['cfips', mbd]].set_index('cfips').to_dict()[mbd]
    ts_pre = raw.loc[raw.dcount==40, ['cfips', 'pred']].set_index('cfips').to_dict()['pred']

    df = raw.loc[raw.dcount==(i+41), ['cfips', 'state_i']].reset_index(drop=True)
    df['dcount40_mbd'] = df['cfips'].map(ts_mbd)
    df['predict_mbd']  = df['cfips'].map(ts_pre)
        
    raw.loc[raw.dcount==(i+41), 'predict_mbd']  = df['predict_mbd'].values

    # row_id -> forecast for this horizon.
    dt = raw.loc[raw.dcount==(i+41), ['row_id','predict_mbd']].set_index('row_id').to_dict()['predict_mbd']
    
    dts.append(dt)
    print()

In [None]:
# Stack the per-horizon {row_id: prediction} dicts into one long frame.
# Each dict is converted on its own, so nothing depends on every horizon
# holding exactly 3135 rows, and the full 5-row frame is no longer rebuilt
# and transposed on every loop iteration.
df = pd.concat(
    [pd.Series(dt, name=mbd).rename_axis("row_id").reset_index() for dt in dts]
).reset_index(drop=True)

# row_id has the form "<cfips>_<YYYY-MM-DD>".
df["cfips"] = df.row_id.apply(lambda x: int(x.split("_")[0]))
df["first_day_of_month"] = pd.to_datetime(df.row_id.apply(lambda x: (x.split("_")[1])))

df = df.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

df[df["cfips"]==1001]

In [None]:
# Rebuild the submission frame from scratch: the revealed months from `data`
# plus the sample-submission rows dated 2023-01-01 or later.
submit = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv')
submit["cfips"] = submit.row_id.apply(lambda x: int(x.split("_")[0]))
submit["first_day_of_month"] = pd.to_datetime(submit.row_id.apply(lambda x: (x.split("_")[1])))
submit = pd.concat([data[["row_id", mbd, "first_day_of_month", "cfips"]], submit.loc[submit.first_day_of_month >= "2023-01-01"]])
submit = submit.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

# Per-county ACS estimates, used for the rescaling applied before writing.
submit["adult2020"] = submit.cfips.map(adult2020)
submit["adult2021"] = submit.cfips.map(adult2021)

In [None]:
# Rows for 2023-01 .. 2023-05 receive the model forecasts.
sub = submit.loc[(submit.first_day_of_month >= "2023-01-01") & (submit.first_day_of_month < "2023-06-01")].reset_index(drop=True)
# Assignment aligns on the 0..n-1 index of both frames.
# NOTE(review): this is only correct if `df` and `sub` hold the same rows in
# the same (cfips, date) order — both are sorted that way, but the row sets
# are not checked here.
sub[mbd] = df[mbd]

# Reassemble: revealed months + forecast months + rows after 2023-05-01,
# which keep the value they already had in `submit`.
sub    = pd.concat([submit.loc[submit.first_day_of_month < "2023-01-01"], sub])
submit = pd.concat([submit.loc[submit.first_day_of_month > "2023-05-01"], sub])
submit = submit.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

del sub

In [None]:
# original : 0.015443651235829803
(submit.loc[submit.cfips==1001, mbd].shift(-1) / submit.loc[submit.cfips==1001, mbd]).mean() - 1

In [None]:
# 0.003793530228790143
(train.loc[train.cfips==1001, mbd].shift(-1) / train.loc[train.cfips==1001, mbd]).mean() - 1

In [None]:
submit[mbd] * (submit.adult2020 / submit.adult2021) * 1.005

In [None]:
# Scale the 2023-01-01 forecasts up by 0.5%.
# NOTE(review): the following cell multiplies every row dated 2023-01-01 or
# later (January included) by 1.005 again — confirm the double bump on
# January is intended. Re-running this cell compounds the factor.
submit.loc[submit["first_day_of_month"] == "2023-01-01", mbd] = submit.loc[submit["first_day_of_month"] == "2023-01-01", mbd] * 1.005

In [None]:
# Rescale every forecast dated 2023-01-01 or later by adult2020 / adult2021
# and by a further 1.005, then write the LightGBM submission file.
# NOTE(review): the cell mutates `submit` in place, so re-running it
# compounds the scaling.
submit.loc[submit["first_day_of_month"] >= "2023-01-01", mbd] = submit.loc[submit["first_day_of_month"] >= "2023-01-01", mbd] * (submit.loc[submit["first_day_of_month"] >= "2023-01-01", "adult2020"] / submit.loc[submit["first_day_of_month"] >= "2023-01-01", "adult2021"]) * 1.005
submit[["row_id", mbd]].to_csv('submission_lgb3.csv', index=False)

# Sanity checks: null counts per column and one county's rows.
display(submit.isnull().sum())
submit.loc[submit.cfips==1001]

# DL

# GRU

In [None]:
import tensorflow as tf
import keras

# Not referenced by the cells shown in this file.
PATH_MODEL = './model'

# Stop when the training loss has not improved for 2 epochs.
ES = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
# NOTE(review): no `monitor` is given here, so the checkpoint uses the Keras
# default metric, while the fit calls in this file pass no validation data —
# confirm that "model.hdf5" is actually written. `period` is also a
# deprecated argument in recent Keras releases.
CD = tf.keras.callbacks.ModelCheckpoint("model.hdf5", save_best_only=True, period=3)

def set_seed(seed=2023):
    """Seed TensorFlow, NumPy and `random`, and request deterministic TF ops.

    Parameters
    ----------
    seed : int, default 2023
        Seed passed to every random number generator.

    Notes
    -----
    TF_DETERMINISTIC_OPS and TF_CUDNN_DETERMINISTIC are on/off flags, so they
    are set to "1". They were previously set to the seed value (e.g. "2023"),
    which is not a valid boolean setting for them.

    PYTHONHASHSEED is only read at interpreter start-up, so setting it here
    affects child processes but not the running kernel.
    """
    import os, random as rn
    tf.random.set_seed(seed)
    np.random.seed(seed)
    rn.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'


def build_model():
    """Build and compile the GRU regressor used by the deep-learning cells.

    Re-seeds all RNGs first, so every call starts from the same seed.
    Architecture: GRU(128) -> Dense(32, relu) -> Dense(16, relu) -> Dense(1),
    compiled with Adam (learning rate 0.001) and mean-squared-error loss.

    The input shape is (number of columns in `features1`, 1): each feature is
    fed as one step of a length-n_features sequence with a single channel.
    The width is taken from `features1` only.

    Returns
    -------
    tf.keras.Model
        The compiled, untrained model.
    """
    set_seed(2023)
    inp = tf.keras.Input(shape=(raw[features1].shape[1], 1))
    # x = keras.layers.CuDNNGRU(units=128, return_sequences=False)(inp)
    x = tf.keras.layers.GRU(units=128, return_sequences=False)(inp)
    x = tf.keras.layers.Dense(32, activation='relu', kernel_initializer='glorot_uniform')(x)
    x = tf.keras.layers.Dense(16, activation='relu', kernel_initializer='glorot_uniform')(x)
    x = tf.keras.layers.Dense( 1, activation='linear')(x)
    model = tf.keras.Model(inputs=inp, outputs=x)
    
    lr = 0.001
    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    los = tf.keras.losses.MeanSquaredError()
    model.compile(loss=los, optimizer=opt)
    
    return model

# cv

In [None]:
def training2(df, idx, ts_idx, i, model):
    """Fit a Keras `model` for horizon `i + 1` and predict the `ts_idx` rows.

    The function now reads from its own `df` and `ts_idx` arguments. It
    previously ignored both and used the notebook globals `raw` and
    `test_indices`, so it only worked when the caller happened to define
    those names with matching contents.

    Missing feature values are replaced by 0 before being passed to the
    model. The number of epochs and the callbacks come from the notebook-level
    names `epochs`, `ES` and `CD`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and `mbd_target<i+1>`.
    idx : boolean mask / index
        Rows used for fitting.
    ts_idx : boolean mask / index
        Rows to predict.
    i : int
        Zero-based horizon (0..4); selects `features1`..`features5`.
    model : Keras-style model
        Object with `fit(X, y, batch_size=..., epochs=..., callbacks=...,
        verbose=...)` and `predict(X)`.

    Returns
    -------
    Whatever `model.predict` returns, or None when `i` is outside 0..4.
    """
    feature_sets = {0: features1, 1: features2, 2: features3,
                    3: features4, 4: features5}
    if i not in feature_sets:
        return None

    columns = feature_sets[i]
    target = f"mbd_target{i+1}"

    model.fit(
        df.loc[idx, columns].fillna(0).values,
        df.loc[idx, target].values,
        batch_size=34, epochs=epochs, callbacks=[ES, CD], verbose=1)

    return model.predict(df.loc[ts_idx, columns].fillna(0).values)

In [None]:
%%time
# Read by training2 as a notebook-level name.
epochs = 3

# Hold-out check for the GRU: forecast steps 36..40 from the features at
# step 35 and score each horizon with SMAPE.
scores = []

for i in range(5):
    print("predict: ", i+36, f" (use ts {35})")
    print("training_data: until",  (34-i))
    train_indices = (raw.istest == 0) & (raw.dcount <= (34-i)) & (raw.dcount >= 1)
    test_indices  = (raw.dcount == 35)
    
    # A fresh, re-seeded model per horizon.
    model = build_model()
    
    pred = training2(raw, train_indices, test_indices, i, model)
    
    # Convert the predicted relative change into a density level:
    # density(35) * (1 + predicted change).
    raw.loc[test_indices, 'pred'] = pred + 1.
    raw.loc[test_indices, 'pred'] = raw.loc[test_indices, mbd] * raw.loc[test_indices, 'pred']
    
    # -----------------------------------
    # cfips -> density at step 35, and cfips -> forecast made from step 35.
    ts_mbd = raw.loc[raw.dcount==35, ['cfips', mbd]].set_index('cfips').to_dict()[mbd]
    ts_pre = raw.loc[raw.dcount==35, ['cfips', 'pred']].set_index('cfips').to_dict()['pred']

    df = raw.loc[raw.dcount==(i+36), ['cfips', 'state_i', mbd]].reset_index(drop=True)
    df['previous_mbd'] = df['cfips'].map(ts_mbd)
    df['predict_mbd']  = df['cfips'].map(ts_pre)
        
    raw.loc[raw.dcount==(i+36), 'previous_mbd'] = df['previous_mbd'].values
    raw.loc[raw.dcount==(i+36), 'predict_mbd']  = df['predict_mbd'].values
    
    score = smape(df[mbd], df['predict_mbd'])
    print()
    print("SMAPE:", score)
    print()

    scores.append(score)

# Mean SMAPE: 0.7411208031981452
print(f'Mean SMAPE: {np.mean(scores)}')

In [None]:
# predict:  36  (use ts 35)
# training_data: until 34
# Epoch 1/3
# 3135/3135 [==============================] - 12s 4ms/step - loss: 0.0353
# Epoch 2/3
# 3135/3135 [==============================] - 11s 4ms/step - loss: 0.0336
# Epoch 3/3
# 3135/3135 [==============================] - 11s 3ms/step - loss: 0.0300

# SMAPE: 1.3041652825554289

# predict:  37  (use ts 35)
# training_data: until 33
# Epoch 1/3
# 3043/3043 [==============================] - 14s 4ms/step - loss: 0.0267
# Epoch 2/3
# 3043/3043 [==============================] - 13s 4ms/step - loss: 0.0211
# Epoch 3/3
# 3043/3043 [==============================] - 13s 4ms/step - loss: 0.0199

# SMAPE: 0.719662362107902

# predict:  38  (use ts 35)
# training_data: until 32
# Epoch 1/3
# 2951/2951 [==============================] - 15s 5ms/step - loss: 0.0371
# Epoch 2/3
# 2951/2951 [==============================] - 12s 4ms/step - loss: 0.0327
# Epoch 3/3
# 2951/2951 [==============================] - 12s 4ms/step - loss: 0.0304

# SMAPE: 2.6053024509393583

# predict:  39  (use ts 35)
# training_data: until 31
# Epoch 1/3
# 2859/2859 [==============================] - 13s 4ms/step - loss: 0.0481
# Epoch 2/3
# 2859/2859 [==============================] - 12s 4ms/step - loss: 0.0409
# Epoch 3/3
# 2859/2859 [==============================] - 12s 4ms/step - loss: 0.0425

# SMAPE: 3.55401900255682

# predict:  40  (use ts 35)
# training_data: until 30
# Epoch 1/3
# 2767/2767 [==============================] - 14s 4ms/step - loss: 0.0643
# Epoch 2/3
# 2767/2767 [==============================] - 12s 4ms/step - loss: 0.0534
# Epoch 3/3
# 2767/2767 [==============================] - 11s 4ms/step - loss: 0.0445

# SMAPE: 3.2930341610021907

# Mean SMAPE: 2.29523665183234
# CPU times: user 3min 35s, sys: 16.5 s, total: 3min 52s
# Wall time: 3min 13s

# inference

In [None]:
%%time

# Read by training2 as a notebook-level name.
epochs = 6

# Final GRU forecasts: predict steps 41..45 from the features at step 40.
# `dts` collects one {row_id: predicted density} dict per horizon.
dts = []

# 41-45(PB:41, PL:43-45)
for i in range(5):
    print("predict: ", i+41, f" (use ts {40})")
    print("training_data: until",  (39-i))
    train_indices = (raw.istest == 0) & (raw.dcount <= (39-i)) & (raw.dcount >= 1) # & (raw[f'dcount{ts}_act']>ACT_THR)  & (raw[f'dcount{ts}_mbd']>ABS_THR)
    test_indices  = (raw.dcount == 40)

    # A fresh, re-seeded model per horizon.
    model = build_model()
    pred = training2(raw, train_indices, test_indices, i, model)

    # Convert the predicted relative change into a density level:
    # density(40) * (1 + predicted change).
    raw.loc[test_indices, 'pred'] = pred + 1.
    raw.loc[test_indices, 'pred'] = raw.loc[test_indices, mbd] * raw.loc[test_indices, 'pred']
    
    # -----------------------------------
    # cfips -> density at step 40, and cfips -> forecast made from step 40.
    ts_mbd = raw.loc[raw.dcount==40, ['cfips', mbd]].set_index('cfips').to_dict()[mbd]
    ts_pre = raw.loc[raw.dcount==40, ['cfips', 'pred']].set_index('cfips').to_dict()['pred']

    df = raw.loc[raw.dcount==(i+41), ['cfips', 'state_i']].reset_index(drop=True)
    df['dcount40_mbd'] = df['cfips'].map(ts_mbd)
    df['predict_mbd']  = df['cfips'].map(ts_pre)
        
    raw.loc[raw.dcount==(i+41), 'predict_mbd']  = df['predict_mbd'].values

    # row_id -> forecast for this horizon.
    dt = raw.loc[raw.dcount==(i+41), ['row_id','predict_mbd']].set_index('row_id').to_dict()['predict_mbd']
    
    dts.append(dt)
    print()

In [None]:
# Stack the per-horizon {row_id: prediction} dicts into one long frame.
# Each dict is converted on its own, so nothing depends on every horizon
# holding exactly 3135 rows, and the full 5-row frame is no longer rebuilt
# and transposed on every loop iteration.
df = pd.concat(
    [pd.Series(dt, name=mbd).rename_axis("row_id").reset_index() for dt in dts]
).reset_index(drop=True)

# row_id has the form "<cfips>_<YYYY-MM-DD>".
df["cfips"] = df.row_id.apply(lambda x: int(x.split("_")[0]))
df["first_day_of_month"] = pd.to_datetime(df.row_id.apply(lambda x: (x.split("_")[1])))

df = df.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

df[df["cfips"]==1001]

In [None]:
# Rebuild the submission frame from scratch for the GRU forecasts: the
# revealed months from `data` plus the sample-submission rows dated
# 2023-01-01 or later.
submit = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv')
submit["cfips"] = submit.row_id.apply(lambda x: int(x.split("_")[0]))
submit["first_day_of_month"] = pd.to_datetime(submit.row_id.apply(lambda x: (x.split("_")[1])))
submit = pd.concat([data[["row_id", mbd, "first_day_of_month", "cfips"]], submit.loc[submit.first_day_of_month >= "2023-01-01"]])
submit = submit.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

# Per-county ACS estimates, used for the rescaling applied before writing.
submit["adult2020"] = submit.cfips.map(adult2020)
submit["adult2021"] = submit.cfips.map(adult2021)

In [None]:
# Rows for 2023-01 .. 2023-05 receive the model forecasts.
sub = submit.loc[(submit.first_day_of_month >= "2023-01-01") & (submit.first_day_of_month < "2023-06-01")].reset_index(drop=True)
# Assignment aligns on the 0..n-1 index of both frames.
# NOTE(review): this is only correct if `df` and `sub` hold the same rows in
# the same (cfips, date) order — both are sorted that way, but the row sets
# are not checked here.
sub[mbd] = df[mbd]

# Reassemble: revealed months + forecast months + rows after 2023-05-01,
# which keep the value they already had in `submit`.
sub    = pd.concat([submit.loc[submit.first_day_of_month < "2023-01-01"], sub])
submit = pd.concat([submit.loc[submit.first_day_of_month > "2023-05-01"], sub])
submit = submit.sort_values(["cfips", "first_day_of_month"]).reset_index(drop=True)

del sub

In [None]:
submit[mbd] * (submit.adult2020 / submit.adult2021)

In [None]:
# Rescale every forecast dated 2023-01-01 or later by adult2020 / adult2021,
# then write the GRU submission file.
# NOTE(review): the cell mutates `submit` in place, so re-running it
# compounds the scaling.
submit.loc[submit["first_day_of_month"] >= "2023-01-01", mbd] = submit.loc[submit["first_day_of_month"] >= "2023-01-01", mbd] * (submit.loc[submit["first_day_of_month"] >= "2023-01-01", "adult2020"] / submit.loc[submit["first_day_of_month"] >= "2023-01-01", "adult2021"])
submit[["row_id", mbd]].to_csv('submission_GRU4.csv', index=False)

# Sanity checks: null counts per column and one county's rows.
display(submit.isnull().sum())
submit.loc[submit.cfips==1001]

In [None]:
# submit[mbd] = pd.read_csv("/kaggle/working/submission_lgb3.csv")[mbd]*0.5 + pd.read_csv("/kaggle/working/submission_GRU4.csv")[mbd]*0.5
# submit[["row_id", mbd]].to_csv('submission_ensemble2.csv', index=False)

# display(submit.isnull().sum())
# submit.loc[submit.cfips==1001]