In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv
/kaggle/input/godaddy-microbusiness-density-forecasting/census_starter.csv
/kaggle/input/godaddy-microbusiness-density-forecasting/revealed_test.csv
/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv
/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from plotly.subplots import make_subplots
import plotly.graph_objects as go

**Data loading**

In [3]:
# Load in the data
df_train = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv')
df_test = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv')
df_submission = pd.read_csv('/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv')

**EDA**

In [4]:
# View data
df_train.head()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269
3,1001_2019-11-01,1001,Autauga County,Alabama,2019-11-01,2.993233,1243
4,1001_2019-12-01,1001,Autauga County,Alabama,2019-12-01,2.993233,1243


In [5]:
# Column types
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122265 entries, 0 to 122264
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   row_id                 122265 non-null  object 
 1   cfips                  122265 non-null  int64  
 2   county                 122265 non-null  object 
 3   state                  122265 non-null  object 
 4   first_day_of_month     122265 non-null  object 
 5   microbusiness_density  122265 non-null  float64
 6   active                 122265 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 6.5+ MB


In [6]:
# Ensure date is actually date type
df_train['first_day_of_month'] = pd.to_datetime(df_train['first_day_of_month'])

In [7]:
# Plot the time series for some of the counties 
num_plots = 5
fig = make_subplots(rows=num_plots, cols=1,
                   subplot_titles=(df_train.groupby('cfips').head(1)['cfips'].iloc[:num_plots].to_list()))

for idx, cfip in enumerate(df_train['cfips'].unique()[:num_plots]):
    
    fig.append_trace(go.Scatter(
    x=df_train['first_day_of_month'].loc[df_train['cfips'] == cfip],
    y=df_train['microbusiness_density'].loc[df_train['cfips'] == cfip],
    name=str(df_train['county'].loc[df_train['cfips'] == cfip].tail(1).values[0]) +\
        ', ' + str(df_train['state'].loc[df_train['cfips'] == cfip].tail(1).values[0])    
    ), row=idx+1, col=1)


fig.update_layout(template="simple_white", font=dict(size=18), width=1000, height=1400)
fig.show()

**Modelling**

In [8]:
# Initialise the column we will store our forecasts in
df_test['microbusiness_density'] = 0

In [9]:
# Loop to compute auto-arima for each county
for cfip in tqdm(df_train['cfips'].unique()):
    
    # Get the naive forecast
    naive_fc = df_train['microbusiness_density'].loc[df_train['cfips'] == cfip].tail(1)
    
    # Insert the forecast into the test set
    df_test['microbusiness_density'].loc[df_test['cfips'] == cfip] = naive_fc.values[0]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 3135/3135 [00:02<00:00, 1209.59it/s]


**Analysis**

In [10]:
# Check results
df_test

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density
0,1001_2022-11-01,1001,2022-11-01,3.463856
1,1003_2022-11-01,1003,2022-11-01,8.359798
2,1005_2022-11-01,1005,2022-11-01,1.232074
3,1007_2022-11-01,1007,2022-11-01,1.287240
4,1009_2022-11-01,1009,2022-11-01,1.831783
...,...,...,...,...
25075,56037_2023-06-01,56037,2023-06-01,2.823801
25076,56039_2023-06-01,56039,2023-06-01,26.273220
25077,56041_2023-06-01,56041,2023-06-01,4.009369
25078,56043_2023-06-01,56043,2023-06-01,3.126551


In [11]:
# Plot the time series for some of the counties 
num_plots = 5
fig = make_subplots(rows=num_plots, cols=1,
                   subplot_titles=(df_train.groupby('cfips').head(1)['cfips'].iloc[:num_plots].to_list()))

for idx, cfip in enumerate(df_train['cfips'].unique()[:num_plots]):
    
    fig.append_trace(go.Scatter(
    x=df_train['first_day_of_month'].loc[df_train['cfips'] == cfip],
    y=df_train['microbusiness_density'].loc[df_train['cfips'] == cfip],    
    name='Train',
    line=dict(color="blue", width=2)), row=idx+1, col=1)
    
    fig.append_trace(go.Scatter(
    x=df_test['first_day_of_month'].loc[df_test['cfips'] == cfip],
    y=df_test['microbusiness_density'].loc[df_test['cfips'] == cfip],    
    name='Forecast',
    line=dict(color="red", width=2)), row=idx+1, col=1)
    
# Removing repeating of names in the legend    
names = set()
fig.for_each_trace(
    lambda trace:
    trace.update(showlegend=False)
    if (trace.name in names) else names.add(trace.name))

fig.update_layout(template="simple_white", font=dict(size=18), width=1000, height=1500)
fig.show()

**Submission**

In [12]:
# Submit our predictions !!
df_submission["microbusiness_density"] = df_test['microbusiness_density'].values
df_submission.to_csv("submission.csv", index=False)

**Reference:**

       https://www.kaggle.com/code/egorphysics/naive-forecasting-baseline-model