In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
from scipy import stats

In [11]:
df_test = pd.read_csv("../data/test.csv") # neobsahuje target
df = pd.read_parquet("../data/train_after_eda.parquet")
df = df.drop(columns="base_price")
df.head(3)

Unnamed: 0,id,week,center_id,meal_id,checkout_price,emailer_for_promotion,homepage_featured,num_orders,category,cuisine,city_code,region_code,center_type,op_area,total_sale
0,1379560,1,55,1885,136.83,0,0,177,Beverages,Thai,647,56,TYPE_C,2.0,26955.33
1,1466964,1,55,1993,136.83,0,0,270,Beverages,Thai,647,56,TYPE_C,2.0,36674.1
2,1346989,1,55,2539,134.86,0,0,189,Beverages,Thai,647,56,TYPE_C,2.0,25677.54


In [12]:
z_scores = stats.zscore(df['total_sale'])
threshold = 3
df_no_outliers = df[(z_scores < threshold) & (z_scores > -threshold)]
PREDICTOR_COLUMS = [col for col in df_no_outliers.columns if col != "num_orders"]

train_df = df_no_outliers[df_no_outliers['week'] <= 120]
test_df = df_no_outliers[df_no_outliers['week'] >= 120]

X_train = train_df[PREDICTOR_COLUMS]
y_train = train_df[["num_orders"]]

X_test = test_df[PREDICTOR_COLUMS]
y_test = test_df[["num_orders"]]

In [13]:
## Preprocessing variables
CATEGORICAL_COLS = ["category", "cuisine", "center_type"]
ct = ColumnTransformer([('_encoder', OneHotEncoder(), CATEGORICAL_COLS)])

In [14]:
pipe = Pipeline([
    ('encoder', ct), 
    ('regression', LinearRegression()), 
])

model = pipe.fit(X_train, y_train)
y_pred = model.predict(X_test)

#SCORE

r2, mae, mse = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(
    f"R2: {r2} \t MAE: {mae} \t MSE: {mse}" 
)

R2: 0.3624754876590056 	 MAE: 134.07766080979573 	 MSE: 43057.98815375919


In [15]:
pipe = Pipeline([
    ('encoder', ct), 
    ('regression', DecisionTreeRegressor(random_state=1)), 
])

model = pipe.fit(X_train, y_train)
y_pred = model.predict(X_test)

#SCORE

r2, mae, mse = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)
print(f"R2: {r2} \t MAE: {mae} \t MSE: {mse}")

R2: 0.39285662411083166 	 MAE: 127.65020318429427 	 MSE: 41006.0660894029


In [16]:
pipe = Pipeline([
    ('encoder', ct), 
    ('regression', DecisionTreeRegressor(random_state=1)), 
])

model = pipe.fit(X_train, y_train)
y_pred = model.predict(X_test)

#SCORE

r2, mae, mse = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(
    f"R2: {r2} \t MAE: {mae} \t MSE: {mse}" 
)

R2: 0.39285662411083166 	 MAE: 127.65020318429427 	 MSE: 41006.0660894029


## Results
1. Better results but not good enough
2. Seems as outlier heavy task

## Resolution
1. Use better regularized rergession, "better" loss function
2. Use random forest RandomForest