In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation/FutureWarning messages raised by later cells.
warnings.filterwarnings("ignore")

### In this notebook we prepare our data for training. The following tasks are completed:

#### Add entries for dates on which a province had no signups

#### Engineer the following features:
   * num_signups_one_back: The number of signups for the province in question in the previous month.
   * num_signups_twelve_back: The number of signups for the province in question 12 months back (to pick up on yearly trends).
   * year: The current year.
   * month: The current month.
   
#### One hot encode the following column:
   * province
   
#### Set date column to be the index
   
#### Drop the following columns:
   * id
   * city
   
#### Our features in the training task will thus be:
   * num_signups_one_back: The number of signups for the province in question in the previous month.
   * num_signups_twelve_back: The number of signups for the province in question 12 months back (to pick up on yearly trends).
   * year: The current year.
   * month: The current month.
   * province: The province in question
   
#### And our target variable will be:
   * num_signups
   
#### We will make predictions for each month from the end of our data (2023-05-01) until the point of interest (2025-01-01). To determine the total number of signups at that point in time, we will sum the predictions for each province for each month and add that to the number of signups at the starting date (1 500 000). These results will be discussed in a notebook so named. This notebook focuses only on data preparation.

In [42]:
# Load the raw provincial signup records into the working frame.
df = pd.read_csv(filepath_or_buffer='data_provincial.csv')

In [43]:
# Preview the first five raw rows.
df.head(n=5)

Unnamed: 0,id,date_column,city,province,num_signups
0,29589,2018-11-01,,northern cape,4
1,29590,2023-05-01,,free state,2149
2,29591,2019-02-01,,free state,50
3,29592,2021-10-01,,kwazulu natal,6273
4,29593,2019-07-01,,north west,3


In [44]:
# Parse the date strings into datetime64 values so they can be joined and
# decomposed into year/month further down.
df = df.assign(date_column=pd.to_datetime(df['date_column']))

#### Add entries for dates on which a province had no signups

In [45]:
# Build a complete (month, province) grid so that months in which a province
# recorded no signups become explicit rows with num_signups == 0.

# Every month start in the period covered by the data.
all_dates = pd.date_range(start='2018-01-01', end='2023-05-01', freq='MS')
all_dates_df = pd.DataFrame({'date_column': all_dates})

# One row per distinct province present in the raw data.
provinces_df = pd.DataFrame({'province': df['province'].unique()})

# Cartesian product: every month paired with every province.
all_dates_provinces = all_dates_df.merge(provinces_df, how='cross')

# Left-join the raw records onto the grid. Grid rows without a matching raw
# record get NaN in all raw columns; raw rows whose date is not in the grid
# are dropped by the left join.
# NOTE(review): assumes at most one raw row per (date_column, province);
# duplicates would multiply grid rows and skew the lag features — confirm.
df = pd.merge(all_dates_provinces, df, how='left', on=['date_column', 'province'])

# A missing record means no signups in that month. Assign the result back
# instead of calling fillna(inplace=True) on the column selection: the
# in-place form on a selected column is deprecated and does not update the
# parent frame when pandas Copy-on-Write is enabled.
df['num_signups'] = df['num_signups'].fillna(0)

#### Engineer the following features:
   * num_signups_one_back: The number of signups for the province in question in the previous month.
   * num_signups_twelve_back: The number of signups for the province in question 12 months back (to pick up on yearly trends).
   * year: The current year.
   * month: The current month.

In [46]:
# num_signups_one_back: the province's signups in the preceding row.
# Rows are ordered chronologically within each province so that a shift of
# one row corresponds to the previous month (this relies on the frame
# holding one row per province per month).
df = df.sort_values(by=['province', 'date_column'])

# The first month of each province has no predecessor; it is filled with 0.
# The result is assigned back rather than filled in place on the column
# selection, which is deprecated and ineffective under pandas Copy-on-Write.
df['num_signups_one_back'] = df.groupby('province')['num_signups'].shift(1).fillna(0)

In [47]:
# num_signups_twelve_back: the province's signups twelve rows earlier.
# Rows are ordered chronologically within each province so that a shift of
# twelve rows corresponds to the same month one year before (this relies on
# the frame holding one row per province per month).
df = df.sort_values(by=['province', 'date_column'])

# The first twelve months of each province have no value a year back; they
# are filled with 0. The result is assigned back rather than filled in place
# on the column selection, which is deprecated and ineffective under pandas
# Copy-on-Write.
df['num_signups_twelve_back'] = df.groupby('province')['num_signups'].shift(12).fillna(0)

In [48]:
# Calendar features: split each row's date into its year and month number.
df = df.assign(
    year=df['date_column'].dt.year,
    month=df['date_column'].dt.month,
)

#### One hot encode the following column:

In [49]:
# One-hot encode province into province_<name> indicator columns.
# The dtype is pinned to uint8 (the default in pandas < 2.0) so the
# indicators are written as 0/1 on every pandas version; pandas >= 2.0 would
# otherwise produce boolean True/False columns.
df = pd.get_dummies(df, columns=['province'], dtype='uint8')

#### Set date column to be the index

In [50]:
# Use the date as the row index; date_column stops being a regular column.
df = df.set_index('date_column')

#### Drop the following columns:

In [51]:
# Remove the identifier and city columns, which are not used as features.
columns_to_drop = ['id', 'city']
df = df.drop(columns=columns_to_drop)

In [52]:
# Display the prepared frame (pandas truncates long frames when rendering).
df

Unnamed: 0_level_0,num_signups,num_signups_one_back,num_signups_twelve_back,year,month,province_eastern cape,province_free state,province_gauteng,province_kwazulu natal,province_limpopo,province_mpumalanga,province_north west,province_northern cape,province_western cape
date_column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-01-01,0.0,0.0,0.0,2018,1,1,0,0,0,0,0,0,0,0
2018-02-01,1.0,0.0,0.0,2018,2,1,0,0,0,0,0,0,0,0
2018-03-01,0.0,1.0,0.0,2018,3,1,0,0,0,0,0,0,0,0
2018-04-01,2.0,0.0,0.0,2018,4,1,0,0,0,0,0,0,0,0
2018-05-01,0.0,2.0,0.0,2018,5,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-01,7935.0,3881.0,8640.0,2023,1,0,0,0,0,0,0,0,0,1
2023-02-01,9423.0,7935.0,7137.0,2023,2,0,0,0,0,0,0,0,0,1
2023-03-01,10783.0,9423.0,6316.0,2023,3,0,0,0,0,0,0,0,0,1
2023-04-01,6395.0,10783.0,4468.0,2023,4,0,0,0,0,0,0,0,0,1


In [53]:
# Total signups across all provinces and months in the prepared data.
df['num_signups'].sum()

1493479.0

In [55]:
# Write the prepared data to a CSV file. The date_column index is written
# as the first column of the file.
file_path = 'data_prepared.csv'
df.to_csv(path_or_buf=file_path)