In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Columns that are not used as model inputs and are removed right after loading.
unused_columns = [
    'Order ID', 'Status', 'Unnamed: 22', 'SKU', 'ASIN', 'Courier Status',
    'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'Style',
]

# Load the report with 'Date' parsed as the index, order the rows by date,
# and discard the unused columns in a single chain.
df = (
    pd.read_excel('./AmazonSaleReport.xlsx', parse_dates=['Date'], index_col='Date')
    .sort_index()
    .drop(columns=unused_columns)
)

# Convert two-valued text columns into boolean flags and rename them.
# Each entry maps: source column -> (flag column name, value that maps to True).
# Any other value, including a missing one, maps to False.
flag_columns = {
    'fulfilled-by': ('fulfilledEasyShip', 'Easy Ship'),
    'Fulfilment': ('FBA', 'Amazon'),
    'Sales Channel ': ('soldAmazonIn', 'Amazon.in'),  # source name has a trailing space
    'currency': ('paidINR', 'INR'),
    'ship-service-level': ('standardShipping', 'Standard'),
}

# Vectorised comparison instead of a per-row lambda; the column keeps its position.
for source_column, (_, true_value) in flag_columns.items():
    df[source_column] = df[source_column] == true_value

df = df.rename(columns={source: flag for source, (flag, _) in flag_columns.items()})

# Check if it has promotions, convert to boolean (True when a value is present)
df['promotion-ids'] = df['promotion-ids'].notna()

# One-hot encode 'Size' on the full frame first, so every size value seen in
# the data gets its own indicator column.
size_encoded = pd.get_dummies(df, columns=['Size'])

# Remove the rows belonging to the excluded categories, then one-hot encode
# the categories that remain.
categories_to_delete = ['Ethnic Dress', 'Blouse', 'Bottom', 'Saree', 'Dupatta']
in_kept_category = ~size_encoded['Category'].isin(categories_to_delete)
category_encoded = pd.get_dummies(size_encoded[in_kept_category], columns=['Category'])

# Keep only the rows without any missing value.
df_dummies = category_encoded.dropna()

# Feature engineering: lagged and rolling-mean views of 'Amount'.
# Lags and windows are counted in rows of the date-sorted frame, not in
# calendar days.
amount = df_dummies['Amount']

# 'Amount' is the prediction target, so every feature must be built from
# strictly earlier rows. A rolling window taken directly on 'Amount' includes
# the current row and leaks the target
# (Amount == 3 * Rolling_Mean_3 - Lag_1 - Lag_2), hence the shift before rolling.
previous_amount = amount.shift(1)

df_dummies = df_dummies.assign(
    Lag_1=previous_amount,
    Lag_2=amount.shift(2),
    Lag_3=amount.shift(3),
    Rolling_Mean_3=previous_amount.rolling(window=3).mean(),
    Rolling_Mean_7=previous_amount.rolling(window=7).mean(),
).dropna()  # the leading rows have incomplete lags/windows and are discarded

# Define the features and target from the encoded, feature-engineered frame.
# The raw `df` still holds the un-encoded 'Size'/'Category' columns, the
# excluded categories and missing values, and none of the lag/rolling
# features, so it is not suitable as model input.
X = df_dummies.drop(columns=['Amount'])
y = df_dummies['Amount']

# shuffle=False keeps the row order, so the last 20% of rows form the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

# Define the parameter grid
# (4 kernels x 4 C values x 2 gamma settings x 4 epsilon values = 128 candidates)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'epsilon': [0.1, 0.2, 0.5, 1]
}

# Initialize the SVR model
svr_model = SVR()

# Use the one-hot encoded dataframe for training and testing
X_dummies = df_dummies.drop(columns=['Amount'])
y_dummies = df_dummies['Amount']

# Split the data; shuffle=False keeps row order, so the last 20% of rows are held out
X_train_dummies, X_test_dummies, Y_train_dummies, Y_test_dummies = train_test_split(X_dummies, y_dummies, test_size=0.2, shuffle=False)

# The rows are ordered by date and the features include lags of the target,
# so each validation fold must come strictly after the rows it was trained on.
# An integer `cv` would use plain K-fold, which also trains on rows that come
# later than the validation fold.
time_series_cv = TimeSeriesSplit(n_splits=3)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=time_series_cv, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# Fit GridSearchCV
# NOTE(review): this fits 384 SVR models on unscaled features. Confirm it
# finishes in acceptable time on the full data; otherwise shrink the grid,
# tune on a subsample, and/or standardise the features first.
grid_search.fit(X_train_dummies, Y_train_dummies)

# Get the best parameters and best model (refitted on the whole training split)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f'Best Parameters: {best_params}')

Fitting 3 folds for each of 128 candidates, totalling 384 fits




KeyboardInterrupt: 

In [None]:
# Make predictions on the held-out rows of the encoded frame. The model was
# fitted on X_train_dummies, so it has to be evaluated on the matching
# X_test_dummies / Y_test_dummies split (same columns, same rows).
y_pred_best = best_model.predict(X_test_dummies)

# Calculate the mean squared error
mse_best = mean_squared_error(Y_test_dummies, y_pred_best)
print(f'Best SVR Mean Squared Error: {mse_best}')

# Plot the actual vs predicted values against the 'Date' index
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(Y_test_dummies.index, Y_test_dummies, label='Actual')
ax.plot(Y_test_dummies.index, y_pred_best, label='Predicted', color='red')
ax.set_title('Actual vs Predicted Amount')
ax.set_xlabel('Date')
ax.set_ylabel('Amount')
ax.legend()
plt.show()