In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Load the sales report with 'Date' parsed as the index, then order the rows
# chronologically so the unshuffled split at the bottom of this cell is a
# past/future split.
df = pd.read_excel('./AmazonSaleReport.xlsx', parse_dates=['Date'], index_col='Date')
df = df.sort_index()

# Drop the columns that are not used as model features.
df = df.drop(columns=[
    'Order ID', 'Status', 'Unnamed: 22', 'SKU', 'ASIN', 'Courier Status',
    'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'Style',
])

# Binary flags: each source column is replaced by "value == reference value"
# and then renamed. Entries are (source column, new name, value mapped to True).
# Missing values compare unequal and therefore become False.
flag_specs = [
    ('fulfilled-by', 'fulfilledEasyShip', 'Easy Ship'),
    ('Fulfilment', 'FBA', 'Amazon'),
    ('Sales Channel ', 'soldAmazonIn', 'Amazon.in'),  # header has a trailing space
    ('currency', 'paidINR', 'INR'),
    ('ship-service-level', 'standardShipping', 'Standard'),
]
for source_col, flag_name, true_value in flag_specs:
    # Assigning to the existing column keeps its position, so the feature
    # order seen by the model does not change.
    df[source_col] = df[source_col].eq(true_value)
df = df.rename(columns={source_col: flag_name for source_col, flag_name, true_value in flag_specs})

# Promotions: True when any promotion id is present, False when it is missing.
df['promotion-ids'] = df['promotion-ids'].notna()

# One-hot encode the size.
df_dummies = pd.get_dummies(df, columns=['Size'])

# Remove the excluded categories, then one-hot encode the remaining ones.
categories_to_delete = ['Ethnic Dress', 'Blouse', 'Bottom', 'Saree', 'Dupatta']
df_dummies = df_dummies[~df_dummies['Category'].isin(categories_to_delete)]
df_dummies = pd.get_dummies(df_dummies, columns=['Category'])

# Drop every row that still contains a missing value.
df_dummies = df_dummies.dropna()

# Features are every column except the target 'Amount'.
X = df_dummies.drop(columns='Amount')
Y = df_dummies['Amount']

# shuffle=False keeps the row order, so the last 20% of the rows (the latest
# dates after the sort above) form the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

In [4]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Candidate hyper-parameter values; the grid search tries every combination.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

# Cross-validation splitter (5 splits) and the seeded base estimator.
tscv = TimeSeriesSplit(n_splits=5)
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive search over param_grid, scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Take the winning estimator and predict the held-out test rows with it.
best_rf_model = grid_search.best_estimator_
Y_pred_best = best_rf_model.predict(X_test)

# Report the test-set MSE of that estimator and the parameters that were chosen.
mse_best = mean_squared_error(Y_test, Y_pred_best)
print(f'Best Mean Squared Error: {mse_best}')
print(f'Best Parameters: {grid_search.best_params_}')

Best Mean Squared Error: 47007.26854356343
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}


In [6]:
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

# Score the test-set predictions with R-squared, mean absolute error and
# root mean squared error, in that order.
r2, mae, rmse = (
    metric(Y_test, Y_pred_best)
    for metric in (r2_score, mean_absolute_error, root_mean_squared_error)
)

print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')

R-squared: 0.4395143852201796
Mean Absolute Error: 156.01009353015866
Root Mean Squared Error: 216.81159688439968
