# Simple prediction using TabNet : DL model designed for tabular datasets
- Forked and edited from these amazing notebooks:
- https://dacon.io/en/codeshare/3837
- https://www.kaggle.com/code/carlmcbrideellis/tabnet-a-very-simple-regression-example

In [1]:
import pandas as pd

In [2]:
# Kaggle input directory for the competition data; reused by later cells.
rootpath = '/kaggle/input/godaddy-microbusiness-density-forecasting/'
# Load the raw train and test tables from that directory.
train_df, test_df = (pd.read_csv(rootpath + fname) for fname in ('train.csv', 'test.csv'))

In [3]:
train_df.head(2)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198


# "Fast-EDA" using fasteda

In [4]:
# !pip install fasteda
# from fasteda import fast_eda

In [5]:
# fast_eda(train_df, correlation = False, pairplot = True)

# Appending Geo Point data
- Why not use geo point data?
- Some counties/states are close to one another, so the distance between them or their absolute location could be an informative feature for predicting microbusiness density.
- Here I'm going to use geo point data to capture the geographic location (and therefore distance) for each row.


In [6]:
# The county boundary file is semicolon-delimited, so the separator is given explicitly.
coordinates = pd.read_csv('/kaggle/input/us-county-boundary-and-codes/us-county-boundaries.csv', sep=';')

In [7]:
coordinates.head(2)

Unnamed: 0,Geo Point,Geo Shape,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,STUSAB,LSAD,...,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,STATE_NAME,COUNTYFP NOZERO
0,"40.1763796295, -98.0471849897","{""coordinates"": [[[-98.273667, 40.089399], [-9...",31,129,835886,31129,Nuckolls,Nuckolls County,NE,6,...,,,,A,1489645188,1718484,40.176492,-98.046842,Nebraska,129
1,"40.7715207081, -84.1057794323","{""coordinates"": [[[-84.397189, 40.786584], [-8...",39,3,1074015,39003,Allen,Allen County,OH,6,...,338.0,30620.0,,A,1042479889,11259523,40.771627,-84.106103,Ohio,3


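- One county name did not match between the two datasets: Doña Ana County appears mis-encoded as 'DoÃ±a Ana County' in the training data.
- So I replaced that name in the training data with the spelling used in the geo point DB.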

In [8]:
# County names that occur in the training data but not in the geo table.
unmatched_counties = set(train_df.county) - set(coordinates['NAMELSAD'])
print(unmatched_counties)
# Replace the mis-encoded name with the spelling used by the geo table so the later join matches.
train_df['county'] = train_df['county'].replace({'DoÃ±a Ana County': 'Doña Ana County'})

{'DoÃ±a Ana County'}


### Since we only need the Geo Point data and its identifiers, we'll extract just those columns.
- As a further idea, we could use the geo shape of each county to calculate its Euclidean distance from the center of its state, and use that as a feature!

In [9]:
# Keep only the centroid string and the county/state names used as join keys.
# .copy() makes this an independent frame, so adding columns below does not
# write into a slice of the original table (avoids SettingWithCopyWarning).
coordinates = coordinates[['Geo Point','NAMELSAD','STATE_NAME']].copy()
# 'Geo Point' holds a "lat, lon" string; split it into two float columns.
coordinates[['latitude','longitude']] = coordinates['Geo Point'].str.split(', ', expand=True).astype(float)
# The raw string is no longer needed once the numeric columns exist.
coordinates = coordinates.drop(columns='Geo Point')

In [10]:
coordinates.head(2)

Unnamed: 0,NAMELSAD,STATE_NAME,latitude,longitude
0,Nuckolls County,Nebraska,40.17638,-98.047185
1,Allen County,Ohio,40.771521,-84.105779


In [11]:
# ohe = pd.concat([test_df,train_df])
# ohe = pd.get_dummies(ohe)

In [20]:
# Attach latitude/longitude to every training row by matching county and state names.
train_coord_merged = pd.merge(train_df,coordinates,
                              left_on = ['county','state'],
                              right_on = ['NAMELSAD','STATE_NAME'])
# Remove the county/state name columns, since cfips is the unique identifier for each
# location. `active` is dropped as well so train ends up with the same feature columns as test.
train_coord_merged = train_coord_merged.drop(columns=['NAMELSAD','STATE_NAME','active','county','state'])
# Split the 'YYYY-MM-DD' string into three string columns.
train_coord_merged[['year','month','firstday']] = train_coord_merged['first_day_of_month'].str.split('-', expand=True)

# Re-read the raw test file so re-running this cell always starts from the same state.
test_df = pd.read_csv(rootpath+'test.csv')
# One (cfips, latitude, longitude) row per county, taken from the merged training data.
cfips_table = train_coord_merged[['cfips','latitude','longitude']].drop_duplicates().reset_index(drop=True)

# Use a LEFT join: it keeps exactly the rows of test.csv, in their original order.
# An outer join sorts the result by key and can add rows for cfips values that only
# exist in train, which breaks the later positional assignment of predictions to
# the rows of test.csv.
test_df = pd.merge(test_df,cfips_table,on = 'cfips',how = 'left')
test_df[['year','month','firstday']] = test_df['first_day_of_month'].str.split('-', expand=True)
# The raw date string and row_id are not model features.
test_df = test_df.drop(columns=['first_day_of_month','row_id'])
train_coord_merged = train_coord_merged.drop(columns=['first_day_of_month','row_id'])


### Reshaping test_df into the same form as the training data, without the target values

In [21]:
test_df.head(2)

Unnamed: 0,cfips,latitude,longitude,year,month,firstday
0,1001,32.53492,-86.642749,2022,11,1
1,1001,32.53492,-86.642749,2022,12,1


In [22]:
train_coord_merged.head(2)

Unnamed: 0,cfips,microbusiness_density,latitude,longitude,year,month,firstday
0,1001,3.007682,32.53492,-86.642749,2019,8,1
1,1001,2.88487,32.53492,-86.642749,2019,9,1


In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
from sklearn.impute import SimpleImputer

# from sklearn.impute import IterativeImputer
# imputer = IterativeImputer(max_iter = 10, random_state = 0)
# imputer.fit_transform(train_df)

# Census features per county; missing values are filled with the column mean.
census = pd.read_csv(rootpath+'census_starter.csv')
census_col = census.columns
# Impute only the feature columns. cfips is the join key, not a measurement, so it
# must never be replaced by a mean value.
census_feature_cols = census_col.drop('cfips')
imputer = SimpleImputer(strategy = 'mean')
census[census_feature_cols] = imputer.fit_transform(census[census_feature_cols])
# Fails loudly if a cfips value is missing instead of silently inventing one.
census.cfips = census.cfips.astype(int)

# Preproc Train, Test dataset for TabNet Application

In [25]:
# Join the census features onto both tables; cfips is the only shared column, so it
# is named explicitly as the key. pd.merge returns new frames, leaving the inputs untouched.
train = pd.merge(train_coord_merged, census, on='cfips')
test = pd.merge(test_df, census, on='cfips')
# An inner join silently drops counties that have no census row. For the test set that
# would leave fewer predictions than submission rows, so fail here with a clear message.
assert len(test) == len(test_df), 'census merge changed the number of test rows'

# Columns treated as categorical: everything that is neither a coordinate ('...tude')
# nor an underscore-named column (target and census features), i.e. cfips/year/month/firstday.
categorical_cols = [col for col in train.columns if 'tude' not in col and '_' not in col]
for col in categorical_cols:
    le = LabelEncoder()
    # Fit on train and test together only so every category gets an integer code.
    # No target information is involved, so this is not an information leak.
    le.fit(pd.concat([train[col], test[col]]).values)
    train[col] = le.transform(train[col].values)
    test[col] = le.transform(test[col].values)

In [26]:
test

Unnamed: 0,cfips,latitude,longitude,year,month,firstday,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,0,32.534920,-86.642749,3,10,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
1,0,32.534920,-86.642749,3,11,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
2,0,32.534920,-86.642749,4,0,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
3,0,32.534920,-86.642749,4,1,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
4,0,32.534920,-86.642749,4,2,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25075,3134,43.840485,-104.567830,4,1,0,71.1,73.3,76.8,79.7,...,0.6,0.6,0.0,0.0,0.0,59605.0,52867.0,57031.0,53333.0,65566.0
25076,3134,43.840485,-104.567830,4,2,0,71.1,73.3,76.8,79.7,...,0.6,0.6,0.0,0.0,0.0,59605.0,52867.0,57031.0,53333.0,65566.0
25077,3134,43.840485,-104.567830,4,3,0,71.1,73.3,76.8,79.7,...,0.6,0.6,0.0,0.0,0.0,59605.0,52867.0,57031.0,53333.0,65566.0
25078,3134,43.840485,-104.567830,4,4,0,71.1,73.3,76.8,79.7,...,0.6,0.6,0.0,0.0,0.0,59605.0,52867.0,57031.0,53333.0,65566.0


In [27]:
# ohe = pd.concat([train,test])
# ohe = pd.get_dummies(ohe)
# train = ohe[~ohe.microbusiness_density.isna()]
# test = ohe[ohe.microbusiness_density.isna()]

### This is our final input for the model !

In [32]:
train.head(2)

Unnamed: 0,cfips,microbusiness_density,latitude,longitude,year,month,firstday,pct_bb_2017,pct_bb_2018,pct_bb_2019,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,0,3.007682,32.53492,-86.642749,0,7,0,76.6,78.9,80.6,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
1,0,2.88487,32.53492,-86.642749,0,8,0,76.6,78.9,80.6,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0


In [33]:
test.head(2)

Unnamed: 0,cfips,latitude,longitude,year,month,firstday,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,0,32.53492,-86.642749,3,10,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
1,0,32.53492,-86.642749,3,11,0,76.6,78.9,80.6,82.7,...,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0


In [34]:
# Sanity check: cfips codes present in train but missing from test (expected: empty set).
# This check previously failed because of the inconsistent county name.
set(train.cfips) - set(test.cfips)

set()

# Applying TabNet for our data.

In [35]:
!pip install pytorch-tabnet==3.1.1
import torch
from torch import nn
from pytorch_tabnet.metrics import Metric
from sklearn.metrics import f1_score

[0m

In [36]:
from pytorch_tabnet.tab_model import TabNetRegressor

In [39]:
import numpy as np
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the regression target.
target_col = 'microbusiness_density'
X = train.drop(columns=target_col).values
y = train[target_col].values

# Random 60/40 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=9510)
# Turn the targets into column vectors of shape (n_samples, 1).
y_train, y_val = y_train.reshape(-1,1), y_val.reshape(-1,1)
X_test = test.values
eval_set = (X_val,y_val)

In [40]:
# Training settings gathered in one place; the validation split is monitored with MSE.
fit_params = dict(
    eval_set=[(X_val, y_val)],
    patience=300,
    max_epochs=1000,
    batch_size=512,
    eval_metric=['mse'],
)
regressor = TabNetRegressor(seed=9510, verbose=1)
regressor.fit(X_train, y_train, **fit_params)

Device used : cuda
epoch 0  | loss: 19.57151| val_0_mse: 36.07831|  0:00:06s
epoch 1  | loss: 17.39455| val_0_mse: 17.94326|  0:00:11s
epoch 2  | loss: 16.43419| val_0_mse: 16.16931|  0:00:16s
epoch 3  | loss: 15.68053| val_0_mse: 20.65297|  0:00:22s


KeyboardInterrupt: 

In [None]:
regressor.best_cost

### In case of K-Fold training

In [None]:
# kf = KFold(n_splits=5, random_state=42, shuffle=True)
# predictions_array =[]
# CV_score_array    =[]
# for train_index, test_index in kf.split(train):
#     X_train, X_valid = X[train_index], X[test_index]
#     y_train, y_valid = y[train_index], y[test_index]
#     y_train = y_train.reshape(-1, 1)
#     y_valid = y_valid.reshape(-1, 1)
#     regressor = TabNetRegressor(verbose=0,seed=42)
#     regressor.fit(X_train=X_train, y_train=y_train,
#               eval_set=[(X_valid, y_valid)],
#               patience=300, max_epochs=2000,
#               eval_metric=['rmse'])
#     CV_score_array.append(regressor.best_cost)
#     predictions_array.append(np.expm1(regressor.predict(X_test)))

# predictions = np.mean(predictions_array,axis=0)

# Inference

In [32]:
test

Unnamed: 0,cfips,latitude,longitude,year,month,firstday,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,pct_college_2021,pct_foreign_born_2017,pct_foreign_born_2018,pct_foreign_born_2019,pct_foreign_born_2020,pct_foreign_born_2021,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,1001,37.62436,-84.866820,3,10,0,76.6,78.9,80.6,82.7,85.5,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
1,1001,37.62436,-84.866820,3,11,0,76.6,78.9,80.6,82.7,85.5,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
2,1001,37.62436,-84.866820,4,0,0,76.6,78.9,80.6,82.7,85.5,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
3,1001,37.62436,-84.866820,4,1,0,76.6,78.9,80.6,82.7,85.5,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
4,1001,37.62436,-84.866820,4,2,0,76.6,78.9,80.6,82.7,85.5,14.5,15.9,16.1,16.7,16.4,2.1,2.0,2.3,2.3,2.1,1.3,1.1,0.7,0.6,1.1,55317.0,58786.0,58731.0,57982.0,62660.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,2290,40.39841,-77.262296,4,1,0,43.7,43.7,42.9,45.9,45.9,7.4,7.6,8.2,8.2,8.4,1.2,1.4,1.3,1.4,1.3,1.1,1.1,0.7,0.6,0.5,37819.0,40000.0,41413.0,41728.0,43405.0
764,2290,40.39841,-77.262296,4,2,0,43.7,43.7,42.9,45.9,45.9,7.4,7.6,8.2,8.2,8.4,1.2,1.4,1.3,1.4,1.3,1.1,1.1,0.7,0.6,0.5,37819.0,40000.0,41413.0,41728.0,43405.0
765,2290,40.39841,-77.262296,4,3,0,43.7,43.7,42.9,45.9,45.9,7.4,7.6,8.2,8.2,8.4,1.2,1.4,1.3,1.4,1.3,1.1,1.1,0.7,0.6,0.5,37819.0,40000.0,41413.0,41728.0,43405.0
766,2290,40.39841,-77.262296,4,4,0,43.7,43.7,42.9,45.9,45.9,7.4,7.6,8.2,8.2,8.4,1.2,1.4,1.3,1.4,1.3,1.1,1.1,0.7,0.6,0.5,37819.0,40000.0,41413.0,41728.0,43405.0


In [None]:
# Predict on the test features and flatten the result to a 1-D array with one value
# per test row (instead of a Python list of 0-d arrays).
predictions = regressor.predict(test.values).ravel()

 # Submission

In [None]:
test_og = pd.read_csv(rootpath+'test.csv')

# `predictions` follows the row order of `test`, which is built from `test_df`. After the
# merges above that order is not guaranteed to match test.csv, so rebuild the row_id key
# ("<cfips>_<YYYY>-<MM>-<DD>") from `test_df` and join on it instead of assigning by position.
assert len(predictions) == len(test_df), 'number of predictions does not match the test rows'
pred_row_id = (test_df['cfips'].astype(int).astype(str) + '_'
               + test_df['year'] + '-' + test_df['month'] + '-' + test_df['firstday'])
pred_by_row = pd.DataFrame({
    'row_id': pred_row_id.values,
    'microbusiness_density': np.asarray(predictions, dtype=float).ravel(),
})

# Left join from test.csv keeps its rows and order; validate guards against duplicate ids.
submission = test_og[['row_id']].merge(pred_by_row, on='row_id', how='left', validate='one_to_one')
assert submission['microbusiness_density'].notna().all(), 'some test rows have no prediction'
submission = submission[['row_id','microbusiness_density']]

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv',index=False)

# Upvote if you liked my notebook! :)