In [None]:
import tensorflow as tf
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

<h3> Read the Data </h3>

In [None]:
# Load the training and test tables from CSV files under ./data.
df = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")
# Preview the first rows of the training frame.
df.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

<h3> EDA </h3>

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
df.info()

In [None]:
# Number of training rows per cfips value.
df["cfips"].value_counts()

In [None]:
# Number of test rows per cfips value.
df_test["cfips"].value_counts()

In [None]:
# Number of distinct first_day_of_month values in the training frame.
df["first_day_of_month"].nunique()

In [None]:
# Plot the microbusiness-density series of the county belonging to one
# randomly drawn training row. The draw is unseeded, so a different cfips
# may be shown on each run.
sampled_cfips = df["cfips"].sample(1).values
# Build the indexed frame in one expression instead of calling
# set_index(inplace=True) on a filtered slice of `df`.
sample_5 = df[df["cfips"].isin(sampled_cfips)].set_index('first_day_of_month')

sns.set_theme("paper")
fig, ax = plt.subplots(figsize = (12,6))
# Draw on `ax` explicitly and do not rebind `fig`: sns.lineplot returns the
# Axes, not the Figure.
sns.lineplot(x='first_day_of_month', y='microbusiness_density', data=sample_5, markers=True, ax=ax)
ax.set_title(sample_5["cfips"].values[0])
# Keep only the first seven characters of each index value as the tick label.
labels_ = [label[:7] for label in sample_5.index]
# Fix the tick positions before assigning labels; string x values are placed
# at consecutive integer positions starting at 0.
ax.set_xticks(range(len(labels_)))
ax.set_xticklabels(labels = labels_, rotation=50, ha='right')
plt.show()

In [None]:
def generate_train_data(data, window=38):
    """Build one (history window, next value) training pair per county.

    For every distinct ``cfips`` in ``data`` (in order of first appearance),
    the first ``window`` values of ``microbusiness_density`` become the input
    sequence and the value at position ``window`` becomes the target. Rows are
    used in the order they appear in ``data``; no sorting is performed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``cfips`` and ``microbusiness_density``.
    window : int, default 38
        Number of leading observations used as the input sequence.

    Returns
    -------
    cfips_id : numpy.ndarray
        Distinct ``cfips`` values, aligned with ``train`` and ``train_y``.
    train : list of numpy.ndarray
        One array of length ``window`` per county.
    train_y : list
        The observation that immediately follows each window.

    Raises
    ------
    IndexError
        If a county has ``window`` or fewer observations, so no target exists.
    """
    cfips_id = data["cfips"].unique()
    train = []
    train_y = []
    for cfips in cfips_id:
        series = data.loc[data["cfips"] == cfips, "microbusiness_density"].values
        if len(series) <= window:
            raise IndexError(
                f"cfips {cfips} has {len(series)} observations; "
                f"need at least {window + 1} for window={window}"
            )
        # Honour the `window` argument for both the inputs and the target.
        train.append(series[:window])
        train_y.append(series[window])

    return cfips_id, train, train_y
    

In [None]:
# Build one training window and one target per county (default window length).
cfips_id, train, y_train = generate_train_data(df)
# Show only the first few windows; echoing the whole list prints one array
# per county.
train[:5]

In [None]:
# Convert the Python lists returned by generate_train_data into NumPy arrays.
train = np.array(train)
y_train = np.array(y_train)

In [None]:
def create_model(window=38, units=64, lstm_layers=6):
    """Build and compile a stacked-LSTM regressor for one-step forecasting.

    The network takes sequences of shape ``(window, 1)``, passes them through
    ``lstm_layers`` LSTM layers of ``units`` cells each, and ends in a single
    linear output unit. It is compiled with the Adam optimizer and mean
    absolute error loss. The defaults reproduce the original fixed
    architecture (38 time steps, six LSTM layers of 64 units).

    Parameters
    ----------
    window : int, default 38
        Number of time steps in each input sequence.
    units : int, default 64
        Number of cells in every LSTM layer.
    lstm_layers : int, default 6
        Total number of stacked LSTM layers; must be at least 1.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.

    Raises
    ------
    ValueError
        If ``lstm_layers`` is smaller than 1.
    """
    if lstm_layers < 1:
        raise ValueError("lstm_layers must be at least 1")

    input_shape = (window, 1)

    layers = [tf.keras.layers.Input(shape=input_shape)]
    # Every LSTM except the last must return the full sequence so the next
    # LSTM layer receives a 3-D input.
    layers.extend(
        tf.keras.layers.LSTM(units, return_sequences=True)
        for _ in range(lstm_layers - 1)
    )
    # The final LSTM returns only its last output, which feeds the Dense head.
    layers.append(tf.keras.layers.LSTM(units))
    layers.append(tf.keras.layers.Dense(1))

    model = tf.keras.Sequential(layers)
    model.compile(optimizer='adam', loss='mae')

    return model

In [None]:
model = create_model()
# `train` is 2-D (one row per county, one column per time step). Add the
# trailing feature axis explicitly so it matches the model's declared
# (time steps, 1) input instead of relying on implicit rank adjustment.
history = model.fit(train[..., np.newaxis], y_train, epochs=100)

In [None]:
# Collect each county's density series as [cfips, values] pairs.
# A single groupby pass replaces one full-frame boolean mask per county;
# sort=False keeps the counties in order of first appearance, matching
# df["cfips"].unique().
x = [
    [cfips, county_rows["microbusiness_density"].values]
    for cfips, county_rows in df.groupby("cfips", sort=False)
]

In [None]:
# Autoregressive forecast: each test row is predicted from the last 38 values
# of its county's series, and the prediction is then appended to that series
# so the next row for the same county sees it.
# NOTE(review): this assumes df_test lists each county's rows in chronological
# order -- confirm against the test file.

# Per-county series keyed by cfips for direct lookup. Histories are extended
# by rebinding to new arrays, so `x` itself is left untouched and the cell can
# be re-run without compounding earlier forecasts.
history_by_cfips = {cfips: np.asarray(values) for cfips, values in x}

predicted_result = []
for index, row in df_test.iterrows():
    cfips = row["cfips"]
    if cfips not in history_by_cfips:
        # Fail loudly instead of silently forecasting from another county.
        raise KeyError(f"cfips {cfips} in df_test has no history in x")
    county_history = history_by_cfips[cfips]
    # Shape (1, time steps, 1): one sample with a single feature per step.
    model_input = county_history[-38:].reshape(1, -1, 1)
    predict = model.predict(model_input, verbose=0)[0][0]
    # np.append returns a new array; store it so the history really advances.
    history_by_cfips[cfips] = np.append(county_history, predict)
    predicted_result.append(predict)

len(predicted_result)

In [None]:
# Attach the forecasts to the test frame as a new column; this relies on
# predicted_result being in the same order as df_test's rows.
df_test["microbusiness_density"] = predicted_result
df_test.head()

In [None]:
# Load the revealed test table from ./data and preview its first rows.
df_tested = pd.read_csv("data/revealed_test.csv")
df_tested.head()

In [None]:
# Align the revealed values with the forecasts on row_id. Columns present in
# both frames get the suffix "_true" (revealed) or "_predict" (forecast).
truth_by_row = df_tested.set_index("row_id")
forecast_by_row = df_test.set_index("row_id")
result = truth_by_row.join(
    forecast_by_row,
    lsuffix="_true",
    rsuffix="_predict",
)
result.head()

In [None]:
# Mean absolute error between forecast and revealed density, computed with
# vectorized column arithmetic instead of a Python-level row loop.
abs_error = (
    result["microbusiness_density_predict"] - result["microbusiness_density_true"]
).abs()
# skipna=False keeps a missing value visible as NaN in the total rather than
# silently dropping that row from the sum.
sum_mae = abs_error.sum(skipna=False)
mae = sum_mae/result.shape[0]

In [None]:
# Display the mean absolute error computed above.
mae

In [None]:
# Write row_id and the forecast density to the submission CSV, without the index column.
df_test[["row_id","microbusiness_density"]].to_csv("data/submission.csv", index=False)