### Kimchi Model Notebook

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.preprocessing import RobustScaler
from sklearn.neural_network import MLPRegressor


### Upload Data

In [2]:
data = pd.read_excel("/content/Kimchi_dataset.xlsx")

In [3]:
df = data.copy()

### Create date columns

In [4]:
def get_date_features(df):
    """
    This function creates day and month feature from Date column.
    """
    df['month'] = df['Date'].dt.month
    df['day'] = df['Date'].dt.day
    # df.drop(columns=['Date'], inplace = True)

    return df

In [5]:
df = get_date_features(df)

### Fill null values

In [6]:
def fill_by_grp_mean(df, column):
  """
  Fill numeric column by each region groups mean value.
  """
  df[column] = df.groupby(['Region'])[column]\
      .transform(lambda x: x.fillna(x.mean()))
  return df

In [7]:
df = fill_by_grp_mean(df, "Total Volume")
df = fill_by_grp_mean(df, "Price")

### Get Dummies for Categorical Columns

In [8]:
df = pd.get_dummies(df, prefix=['Region'])

### Split Data

In [9]:

def split_train_test(df):
    """
    This function split df to train test sets.
    Return: 
      price as a y traşn or test.
      last 2 weeks data without price as a test period.
      firts 10 weeks data without price as a train period.
    """
    df = df.set_index('Date') 
    X_train = df[df.index <="2018-03-15"].drop(columns = ["Price"])
    X_test = df[df.index >="2018-03-15"].drop(columns = ["Price"])

    y_train = df[df.index <="2018-03-15"]["Price"].copy()
    y_test = df[df.index>="2018-03-15"]["Price"].copy()

    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    
    return X_train, X_test, y_train, y_test



In [10]:
X_train, X_test, y_train, y_test = split_train_test(df)

(540, 48) (108, 48) (540,) (108,)


### Scale Categorical Columns

In [11]:
# Define scaler.
scaler = RobustScaler()

# Fit train and transform on test. we dont want data leakage.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


### Run Model

In [12]:
# Define regressor.

reg = LinearRegression()
reg.fit(X_train_scaled, y_train)

LinearRegression()

### Make Prediction

In [13]:
y_pred = reg.predict(X_test_scaled)
print("The first five prediction {}".format(y_pred[:5]))
print("The real first five labels {}".format(y_test[:5]))

mape = mean_absolute_percentage_error(y_test, y_pred)
print("MAPE w/ regressor {}".format(mape))

The first five prediction [-26.3347168  -18.17602539 -26.17871094 -18.01293945 -26.68078613]
The real first five labels Date
2018-03-25    1.71
2018-03-18    1.66
2018-03-25    1.56
2018-03-18    1.48
2018-03-25    1.33
Name: Price, dtype: float64
MAPE w/ regressor 17.673160428729425


### Try MLP Regressor

In [14]:
mlp = MLPRegressor(random_state=1, max_iter=500).fit(X_train, y_train)

In [15]:
mlp_pred = mlp.predict(X_test_scaled)

  "X does not have valid feature names, but"


In [16]:
mape = mean_absolute_percentage_error(y_test, mlp_pred)
print("MAPE with mlp regressor {}".format(mape))

MAPE with mlp regressor 0.7721782797777615
