In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [15]:
TARGET_COL = 'Quantity'

## Data load & Preprocess

In [16]:
dirty_data = pd.read_csv("data/dirty_dataset.csv")

dirty_data['Quantity'] = dirty_data['Quantity'].map(lambda x: x.replace("_","") if isinstance(x,str) else x).astype(float)
dirty_data['UnitPrice'] = dirty_data['UnitPrice'].map(lambda x: x.replace("_","") if isinstance(x,str) else x).astype(float)
dirty_data.loc[(dirty_data['Quantity'] < 0) | (dirty_data['UnitPrice'] < 0)] = np.nan

dirty_data_std = dirty_data[['UnitPrice','Quantity']].std()
dirty_data_mean = dirty_data[['UnitPrice','Quantity']].mean()

dirty_data.loc[
    (np.abs(dirty_data['Quantity']- dirty_data_mean['Quantity']) > dirty_data_std['Quantity'] * 3) | 
    (np.abs(dirty_data['UnitPrice']- dirty_data_mean['UnitPrice']) > dirty_data_std['UnitPrice'] * 3)
    ] = np.nan

dirty_data.loc[
    (dirty_data.isnull().sum(axis = 1) > 1) |
    (dirty_data[['Quantity','UnitPrice']].isnull().sum(axis = 1) == 2)
    ] = np.nan

dirty_data.dropna(how = 'all', inplace = True)
dirty_data[['Quantity','UnitPrice']] = dirty_data[['Quantity','UnitPrice']].fillna(dirty_data[['Quantity','UnitPrice']].mean())
dirty_data.dropna(inplace = True)

In [17]:
dirty_data['StockCode'] = dirty_data['StockCode'].astype('category')
dirty_data['Country'] = dirty_data['Country'].astype('category')
dirty_data['InvoiceDate'] = pd.to_datetime(dirty_data['InvoiceDate'])
dirty_data['year'] = dirty_data['InvoiceDate'].dt.year
dirty_data['month'] = dirty_data['InvoiceDate'].dt.month
dirty_data['weekdays'] = dirty_data['InvoiceDate'].dt.weekday
dirty_data.drop(["Description","InvoiceDate","InvoiceNo","StockCode"], axis = 1, inplace = True)

dirty_data = pd.get_dummies(dirty_data, columns = ['Country','weekdays','month','year'], drop_first = True)

## Model

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestRegressor

Y = dirty_data[TARGET_COL]
X = dirty_data.drop(TARGET_COL, axis = 1)
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, train_size = 0.7)

In [19]:
dirty_model = RandomForestRegressor()
dirty_model.fit(train_X, train_Y)

RandomForestRegressor()

In [22]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,mean_squared_error
pred_Y = dirty_model.predict(test_X)

In [30]:
R2 = explained_variance_score(test_Y, pred_Y)
MAE = mean_absolute_error(test_Y, pred_Y)
MSE = mean_squared_error(test_Y, pred_Y)
print(f"{R2  = } \n{MAE = } \n{MSE = }")

R2  = 0.16854219429846984 
MAE = 7.288252156823835 
MSE = 1883.4718876875804
