In [264]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import category_encoders as ce
from tqdm import tqdm

In [379]:
# calculate the smape
def compute_smape(y_true, y_pred):

    if not (isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray)):
        y_true, y_pred = np.array(y_true), np.array(y_pred)

    return round(np.mean(
        np.abs(y_pred - y_true) /
        ((np.abs(y_pred) + np.abs(y_true))/2)
    )*100, 3)



In [380]:
train_data = pd.read_csv('../../data/godaddy-microbusiness-density-forecasting/train_with_census.csv')
train_data.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249,2019-01-01,80.6,16.1,2.3,0.7,58731.0
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198,2019-01-01,80.6,16.1,2.3,0.7,58731.0
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269,2019-01-01,80.6,16.1,2.3,0.7,58731.0
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243,2019-01-01,80.6,16.1,2.3,0.7,58731.0
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243,2019-01-01,80.6,16.1,2.3,0.7,58731.0


In [381]:
train_data['source'] = 'train'
# LOAD submission
submission = pd.read_csv('../../data/godaddy-microbusiness-density-forecasting/test.csv')
submission['source'] = 'test'
data = pd.concat([train_data, submission], axis=0)
data.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc,source
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train


In [382]:
data['cfips_id'] = data['cfips']
data= ce.BaseNEncoder(cols=['cfips'], base=50).fit_transform(data)
data.head()

Unnamed: 0,row_id,cfips_0,cfips_1,cfips_2,county,state,first_day_of_month,microbusiness_density,active,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc,source,cfips_id
0,1001_2019-08-01,0,0,1,Autauga County,Alabama,2019-08-01,3.007682,1249.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train,1001
1,1001_2019-09-01,0,0,1,Autauga County,Alabama,2019-09-01,2.88487,1198.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train,1001
2,1001_2019-10-01,0,0,1,Autauga County,Alabama,2019-10-01,3.055843,1269.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train,1001
3,1001_2019-11-01,0,0,1,Autauga County,Alabama,2019-11-01,2.993233,1243.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train,1001
4,1001_2019-12-01,0,0,1,Autauga County,Alabama,2019-12-01,2.993233,1243.0,2019-01-01,80.6,16.1,2.3,0.7,58731.0,train,1001


In [383]:
data.tail()

Unnamed: 0,row_id,cfips_0,cfips_1,cfips_2,county,state,first_day_of_month,microbusiness_density,active,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc,source,cfips_id
25075,56037_2023-06-01,1,12,31,,,2023-06-01,,,,,,,,,test,56037
25076,56039_2023-06-01,1,12,32,,,2023-06-01,,,,,,,,,test,56039
25077,56041_2023-06-01,1,12,33,,,2023-06-01,,,,,,,,,test,56041
25078,56043_2023-06-01,1,12,34,,,2023-06-01,,,,,,,,,test,56043
25079,56045_2023-06-01,1,12,35,,,2023-06-01,,,,,,,,,test,56045


In [384]:
data['first_day_of_month'] = pd.to_datetime(data['first_day_of_month'])

In [385]:
# create lag
lags = [1, 2,3]

# sort by date
data = data.sort_values('first_day_of_month')

for lag in lags:
    data[f'lag_{lag}'] = data.groupby('cfips_id')['microbusiness_density'].shift(lag)

keep_cols = [ 'cfips_id', 'cfips_0', 'cfips_1', 'cfips_2', 'first_day_of_month', 'microbusiness_density' ] + [f'lag_{lag}' for lag in lags]
data = data[keep_cols]

data.head()

Unnamed: 0,cfips_id,cfips_0,cfips_1,cfips_2,first_day_of_month,microbusiness_density,lag_1,lag_2,lag_3
0,1001,0,0,1,2019-08-01,3.007682,,,
57564,28155,0,29,27,2019-08-01,0.923942,,,
57525,28153,0,29,26,2019-08-01,0.631922,,,
57486,28151,0,29,25,2019-08-01,1.152354,,,
57447,28149,0,29,24,2019-08-01,2.041158,,,


In [386]:
data = data.set_index('first_day_of_month')

In [387]:
# split into train and test and valid
train = data[data.index < '2022-04-01']
val = data[(data.index >= '2022-04-01') & (data.index < '2022-11-01')]
test = data[data.index >= '2022-11-01']

In [388]:
train = train.dropna()

In [389]:
val

Unnamed: 0_level_0,cfips_id,cfips_0,cfips_1,cfips_2,microbusiness_density,lag_1,lag_2,lag_3
first_day_of_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-04-01,38051,0,40,13,1.666667,1.619048,1.619048,1.666667
2022-04-01,21095,0,20,39,0.769041,0.788760,0.833128,0.842987
2022-04-01,38069,0,40,22,1.806084,1.837769,1.837769,1.806084
2022-04-01,48273,1,3,6,1.170900,1.170900,1.153681,1.140766
2022-04-01,18117,0,15,4,2.449358,2.455978,2.416258,2.403019
...,...,...,...,...,...,...,...,...
2022-10-01,27047,0,26,36,2.501682,2.526909,2.522704,2.526909
2022-10-01,48391,1,4,15,2.023765,2.023765,1.986632,2.005199
2022-10-01,27043,0,26,34,2.318190,2.299495,2.327538,2.346233
2022-10-01,27041,0,26,33,6.084878,6.071416,6.054589,5.994009


In [618]:
params = {
    'task': 'train',
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 30,
    'n_estimators' : 1000,
    'max_depth': 4,
    'learning_rate': 0.085,
    'reg_alpha' : 1,
    'reg_lambda' : 0.1,
    'metric': {'l2','l1'},
    'verbose': -1
}


In [619]:
cols_to_drop = ['cfips_id', 'microbusiness_density']

train_x = train.drop(columns=cols_to_drop, axis=1)
train_y = train['microbusiness_density']

val_x = val.drop(columns=cols_to_drop, axis=1)
val_y = val['microbusiness_density']


lgb_train = lgb.Dataset(train_x, train_y)
lgb_eval = lgb.Dataset(val_x, val_y, reference=lgb_train)

In [620]:
model = lgb.train(params,
                  train_set=lgb_train,
                  valid_sets=lgb_eval,
                  early_stopping_rounds=50)



[1]	valid_0's l1: 2.31131	valid_0's l2: 27.5625
Training until validation scores don't improve for 50 rounds
[2]	valid_0's l1: 2.12152	valid_0's l2: 24.2191
[3]	valid_0's l1: 1.94559	valid_0's l2: 21.3729
[4]	valid_0's l1: 1.78641	valid_0's l2: 19.0132
[5]	valid_0's l1: 1.63914	valid_0's l2: 16.9432
[6]	valid_0's l1: 1.50609	valid_0's l2: 15.2058
[7]	valid_0's l1: 1.38473	valid_0's l2: 13.7557
[8]	valid_0's l1: 1.27219	valid_0's l2: 12.5246
[9]	valid_0's l1: 1.16947	valid_0's l2: 11.5139
[10]	valid_0's l1: 1.07484	valid_0's l2: 10.6408
[11]	valid_0's l1: 0.988932	valid_0's l2: 9.93038
[12]	valid_0's l1: 0.910749	valid_0's l2: 9.25887
[13]	valid_0's l1: 0.840015	valid_0's l2: 8.70383
[14]	valid_0's l1: 0.776482	valid_0's l2: 8.28796
[15]	valid_0's l1: 0.717464	valid_0's l2: 7.89195
[16]	valid_0's l1: 0.665549	valid_0's l2: 7.60267
[17]	valid_0's l1: 0.61703	valid_0's l2: 7.29507
[18]	valid_0's l1: 0.572683	valid_0's l2: 7.0683
[19]	valid_0's l1: 0.531538	valid_0's l2: 6.86672
[20]	valid

In [621]:
y_pred = model.predict(val_x)
len(y_pred)

21945

In [622]:
compute_smape(val_y, y_pred)

1.907

In [623]:
test_x = test.drop('microbusiness_density', axis=1)
test_x

Unnamed: 0_level_0,cfips_id,cfips_0,cfips_1,cfips_2,lag_1,lag_2,lag_3
first_day_of_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-11-01,39089,0,41,35,4.009497,4.005045,3.998368
2022-11-01,39101,0,41,41,1.684869,1.669323,1.655719
2022-11-01,39087,0,41,34,2.605215,2.701545,2.742218
2022-11-01,39093,0,41,37,5.651847,5.635672,5.617838
2022-11-01,39095,0,41,38,4.432659,4.413642,4.417264
...,...,...,...,...,...,...,...
2023-06-01,21113,0,20,48,,,
2023-06-01,21115,0,20,49,,,
2023-06-01,21117,0,21,0,,,
2023-06-01,21121,0,21,2,,,


In [624]:
pred_columns = ['cfips_0', 'cfips_1', 'cfips_2' ] + [f'lag_{lag}' for lag in lags]

pred_columns

['cfips_0', 'cfips_1', 'cfips_2', 'lag_1', 'lag_2', 'lag_3']

In [626]:
tmp_test = test_x.copy()

unique_cfips = test_x['cfips_id'].unique()

for cfips in tqdm(unique_cfips):

    # filter only the cfips
    df = tmp_test[tmp_test['cfips_id'] == cfips]

    predictions = []

    for idx, row in df.iterrows():

        lag_values = [None for _ in range(len(lags))]

        for lag in lags:
            lag_values.append(lag_values[-lag] if lag_values[-lag] else row[f'lag_{lag}'])

        # set the lag columns to lag_values
        feats = [ row['cfips_0'], row['cfips_1'], row['cfips_2'] ] + lag_values[-len(lags):]

        # row.loc[ [f'lag_{lag}' for lag in lags] ] = lag_values
        #
        # feats = row.drop(columns=['cfips_id', 'microbusiness_density'])

        pred = model.predict(np.array(feats).reshape(1, -1))
        predictions.append(pred[0])

    tmp_test.loc[tmp_test['cfips_id'] == cfips, 'microbusiness_density'] = predictions

100%|██████████| 3135/3135 [00:08<00:00, 385.29it/s]


In [627]:
tmp_test['microbusiness_density']

first_day_of_month
2022-11-01    3.996077
2022-11-01    1.696443
2022-11-01    2.614943
2022-11-01    5.689543
2022-11-01    4.445980
                ...   
2023-06-01    0.275225
2023-06-01    0.277010
2023-06-01    0.267079
2023-06-01    0.267199
2023-06-01    0.258718
Name: microbusiness_density, Length: 25080, dtype: float64

In [239]:
submission['microbusiness_density'] = tmp_test['microbusiness_density']

  submission['microbusiness_density'] = tmp_test['microbusiness_density']


ValueError: cannot reindex on an axis with duplicate labels

In [628]:
submission

Unnamed: 0,row_id,cfips,first_day_of_month,source
0,1001_2022-11-01,1001,2022-11-01,test
1,1003_2022-11-01,1003,2022-11-01,test
2,1005_2022-11-01,1005,2022-11-01,test
3,1007_2022-11-01,1007,2022-11-01,test
4,1009_2022-11-01,1009,2022-11-01,test
...,...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01,test
25076,56039_2023-06-01,56039,2023-06-01,test
25077,56041_2023-06-01,56041,2023-06-01,test
25078,56043_2023-06-01,56043,2023-06-01,test


In [629]:
tmp_df = tmp_test.copy()
tmp_df['first_day_of_month'] = tmp_test.index.astype(str)
tmp_df.index = range(tmp_df.shape[0])
tmp_df['cfips'] = tmp_df['cfips_id']
tmp_df

Unnamed: 0,cfips_id,cfips_0,cfips_1,cfips_2,lag_1,lag_2,lag_3,microbusiness_density,first_day_of_month,cfips
0,39089,0,41,35,4.009497,4.005045,3.998368,3.996077,2022-11-01,39089
1,39101,0,41,41,1.684869,1.669323,1.655719,1.696443,2022-11-01,39101
2,39087,0,41,34,2.605215,2.701545,2.742218,2.614943,2022-11-01,39087
3,39093,0,41,37,5.651847,5.635672,5.617838,5.689543,2022-11-01,39093
4,39095,0,41,38,4.432659,4.413642,4.417264,4.445980,2022-11-01,39095
...,...,...,...,...,...,...,...,...,...,...
25075,21113,0,20,48,,,,0.275225,2023-06-01,21113
25076,21115,0,20,49,,,,0.277010,2023-06-01,21115
25077,21117,0,21,0,,,,0.267079,2023-06-01,21117
25078,21121,0,21,2,,,,0.267199,2023-06-01,21121


In [630]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25080 entries, 0 to 25079
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   row_id              25080 non-null  object
 1   cfips               25080 non-null  int64 
 2   first_day_of_month  25080 non-null  object
 3   source              25080 non-null  object
dtypes: int64(1), object(3)
memory usage: 783.9+ KB


In [631]:
merged = pd.merge(submission,tmp_df, on=['first_day_of_month','cfips'], how='inner')
merged

Unnamed: 0,row_id,cfips,first_day_of_month,source,cfips_id,cfips_0,cfips_1,cfips_2,lag_1,lag_2,lag_3,microbusiness_density
0,1001_2022-11-01,1001,2022-11-01,test,1001,0,0,1,3.463856,3.442677,3.423852,3.472218
1,1003_2022-11-01,1003,2022-11-01,test,1003,0,0,2,8.359798,8.341701,8.491150,8.284578
2,1005_2022-11-01,1005,2022-11-01,test,1005,0,0,3,1.232074,1.206827,1.196728,1.236840
3,1007_2022-11-01,1007,2022-11-01,test,1007,0,0,4,1.287240,1.315346,1.292861,1.299538
4,1009_2022-11-01,1009,2022-11-01,test,1009,0,0,5,1.831783,1.852060,1.836289,1.841537
...,...,...,...,...,...,...,...,...,...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01,test,56037,1,12,31,,,,0.267463
25076,56039_2023-06-01,56039,2023-06-01,test,56039,1,12,32,,,,0.267463
25077,56041_2023-06-01,56041,2023-06-01,test,56041,1,12,33,,,,0.267463
25078,56043_2023-06-01,56043,2023-06-01,test,56043,1,12,34,,,,0.267463


In [632]:
merged[ ['row_id', 'microbusiness_density'] ].to_csv('submission.csv', index=False)

In [633]:
submission

Unnamed: 0,row_id,cfips,first_day_of_month,source
0,1001_2022-11-01,1001,2022-11-01,test
1,1003_2022-11-01,1003,2022-11-01,test
2,1005_2022-11-01,1005,2022-11-01,test
3,1007_2022-11-01,1007,2022-11-01,test
4,1009_2022-11-01,1009,2022-11-01,test
...,...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01,test
25076,56039_2023-06-01,56039,2023-06-01,test
25077,56041_2023-06-01,56041,2023-06-01,test
25078,56043_2023-06-01,56043,2023-06-01,test


In [638]:
t = pd.to_datetime('2022-11-01')

pd.to_numeric(t)

TypeError: Invalid object type at position 0