In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# Load the transactional data
transactional_data_1 = pd.read_csv('Transactional_data_retail_01.csv')
transactional_data_2 = pd.read_csv('Transactional_data_retail_02.csv')

# Stack both extracts into one frame (each file's original row labels are kept)
transactional_data = pd.concat((transactional_data_1, transactional_data_2))

# Total units sold per StockCode, as a two-column frame
top_stock_codes = (
    transactional_data
    .groupby('StockCode')['Quantity']
    .sum()
    .reset_index()
)

# The ten StockCodes with the largest total quantity, best seller first
top_10_stock_codes = (
    top_stock_codes
    .sort_values(by='Quantity', ascending=False)['StockCode']
    .head(10)
)

# Per-transaction revenue: units sold multiplied by unit price
transactional_data['Revenue'] = transactional_data['Quantity'].mul(transactional_data['Price'])

# Empty frame that receives one metrics row per (StockCode, Model) pair
results = pd.DataFrame(columns=['StockCode', 'Model', 'RMSE', 'MAE'])

# Function to calculate evaluation metrics
def evaluate_forecast(actual, forecast):
    """Score a forecast against the observed values.

    Args:
        actual: Observed values.
        forecast: Predicted values, one per entry of ``actual``.

    Returns:
        A ``(rmse, mae)`` tuple: the square root of the mean squared error
        and the mean absolute error.
    """
    mse = mean_squared_error(actual, forecast)
    return np.sqrt(mse), mean_absolute_error(actual, forecast)

# Iterate over top 10 stock codes
# Column layout of every metrics row added to `results`.
RESULT_COLUMNS = ['StockCode', 'Model', 'RMSE', 'MAE']


def create_dataset(dataset, look_back=1):
    """Turn a scaled series into supervised (window, next value) samples.

    Args:
        dataset: 2-D array whose column 0 holds the series values.
        look_back: Number of consecutive values used as one input window.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(n_samples, look_back)`` and
        ``Y[i]`` is the value that immediately follows window ``X[i]``.
        ``n_samples`` is ``len(dataset) - look_back`` (zero if the series is
        too short), so the last observation is used as a target as well.
    """
    X, Y = [], []
    for i in range(len(dataset) - look_back):
        X.append(dataset[i:i + look_back, 0])
        Y.append(dataset[i + look_back, 0])
    # reshape keeps X two-dimensional even when no window could be built
    return np.array(X).reshape(-1, look_back), np.array(Y)


# The lookup tables do not depend on the stock code, so read them only once.
customer_data = pd.read_csv('CustomerDemographics.csv')
product_data = pd.read_csv('ProductInfo.csv')

look_back = 1

# Metrics rows are collected here and turned into a DataFrame once at the end;
# growing `results` with pd.concat per model is quadratic and warns on the
# initially empty frame.
result_rows = []

for stock_code in top_10_stock_codes:
    # Preprocess data for time series. The explicit copy makes `data` an
    # independent frame, so the edits below no longer raise
    # SettingWithCopyWarning.
    data = transactional_data[transactional_data['StockCode'] == stock_code].copy()
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
    data = data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'])
    data = data.set_index('InvoiceDate')

    # Weekly total quantity sold
    ts_data = data['Quantity'].resample('W').sum()

    # Chronological split: first 80% of the weeks for training, rest for testing
    train_size = int(len(ts_data) * 0.8)
    train_data, test_data = ts_data.iloc[:train_size], ts_data.iloc[train_size:]

    # ARIMA Model
    try:
        arima_model = ARIMA(train_data, order=(15, 1, 0))
        arima_fit = arima_model.fit()
        forecast_arima = arima_fit.forecast(steps=len(test_data))
        rmse_arima, mae_arima = evaluate_forecast(test_data, forecast_arima)
        result_rows.append({'StockCode': stock_code, 'Model': 'ARIMA', 'RMSE': rmse_arima, 'MAE': mae_arima})
    except Exception as e:
        print(f"ARIMA failed for {stock_code}: {e}")

    # ETS Model
    try:
        ets_model = ExponentialSmoothing(train_data, trend='add', seasonal=None)
        ets_fit = ets_model.fit()
        forecast_ets = ets_fit.forecast(steps=len(test_data))
        rmse_ets, mae_ets = evaluate_forecast(test_data, forecast_ets)
        result_rows.append({'StockCode': stock_code, 'Model': 'ETS', 'RMSE': rmse_ets, 'MAE': mae_ets})
    except Exception as e:
        print(f"ETS failed for {stock_code}: {e}")

    # LSTM Model (guarded like ARIMA/ETS so one failing stock code does not
    # abort the whole comparison)
    try:
        # Fit the scaler on the training weeks only
        scaler = MinMaxScaler(feature_range=(0, 1))
        train_scaled = scaler.fit_transform(train_data.values.reshape(-1, 1))

        X_train, Y_train = create_dataset(train_scaled, look_back)
        X_train = X_train.reshape(X_train.shape[0], look_back, 1)

        # Define and train LSTM model
        lstm_model = Sequential()
        lstm_model.add(LSTM(50, input_shape=(look_back, 1)))
        lstm_model.add(Dense(1))
        lstm_model.compile(loss='mean_squared_error', optimizer='adam')
        lstm_model.fit(X_train, Y_train, epochs=20, batch_size=1, verbose=0)

        # Prepend the last `look_back` training weeks so that the first test
        # week also gets an input window; this yields exactly one prediction
        # per test week, aligned with `test_data`.
        test_scaled = scaler.transform(test_data.values.reshape(-1, 1))
        test_windows = np.vstack([train_scaled[-look_back:], test_scaled])
        X_test, Y_test = create_dataset(test_windows, look_back)
        X_test = X_test.reshape(X_test.shape[0], look_back, 1)

        # NOTE: these are one-step-ahead predictions fed with observed values,
        # whereas ARIMA/ETS forecast the whole test horizon in one go.
        lstm_predict = lstm_model.predict(X_test, verbose=0)
        lstm_predict = scaler.inverse_transform(lstm_predict).ravel()
        rmse_lstm, mae_lstm = evaluate_forecast(test_data.values, lstm_predict)
        result_rows.append({'StockCode': stock_code, 'Model': 'LSTM', 'RMSE': rmse_lstm, 'MAE': mae_lstm})
    except Exception as e:
        print(f"LSTM failed for {stock_code}: {e}")

    # Non-Time Series Techniques: merge with the customer and product tables
    full_data = pd.merge(data.reset_index(), customer_data, on='Customer ID', how='left')
    full_data = pd.merge(full_data, product_data, on='StockCode', how='left')

    # Features for prediction (only Price is used; the merged columns are not)
    X = full_data[['Price']]
    y = full_data['Quantity']

    # NOTE: this is a random split over individual transactions, so the
    # Decision Tree / XGBoost errors are per-transaction quantities and are
    # not on the same scale as the weekly errors of the models above.
    X_train_non_ts, X_test_non_ts, y_train_non_ts, y_test_non_ts = train_test_split(X, y, test_size=0.2, random_state=42)

    # Decision Tree Model
    tree_model = DecisionTreeRegressor()
    tree_model.fit(X_train_non_ts, y_train_non_ts)
    y_pred_tree = tree_model.predict(X_test_non_ts)
    rmse_tree, mae_tree = evaluate_forecast(y_test_non_ts, y_pred_tree)
    result_rows.append({'StockCode': stock_code, 'Model': 'Decision Tree', 'RMSE': rmse_tree, 'MAE': mae_tree})

    # XGBoost Model
    xgb_model = XGBRegressor()
    xgb_model.fit(X_train_non_ts, y_train_non_ts)
    y_pred_xgb = xgb_model.predict(X_test_non_ts)
    rmse_xgb, mae_xgb = evaluate_forecast(y_test_non_ts, y_pred_xgb)
    result_rows.append({'StockCode': stock_code, 'Model': 'XGBoost', 'RMSE': rmse_xgb, 'MAE': mae_xgb})

# Materialise the collected rows; rows already present in `results` are kept.
new_results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
if results.empty:
    results = new_results
elif result_rows:
    results = pd.concat([results, new_results], ignore_index=True)

# Display results DataFrame
print(results)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  results = pd.concat([results, pd.DataFrame({'StockCode': [stock_code], 'Model': ['ARIMA'], 'RMSE': [rmse_arima], 'MAE': [mae_arima]})], ignore_index=True)
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 253ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(subset=['InvoiceDate', 'Quantity', 'Price'], inplace=True)
  warn('Non-stationary starting autoregressive parameters'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
   StockCode          Model         RMSE          MAE
0      84077          ARIMA  1866.727938  1294.552633
1      84077            ETS  1710.277335  1276.587993
2      84077           LSTM  1801.688927  1149.729348
3      84077  Decision Tree   217.315865    56.970221
4      84077        XGBoost   217.315871    56.970217
5     85123A          ARIMA   449.649815   316.186773
6     85123A            ETS   367.158425   273.317209
7     85123A           LSTM   420.333041   334.332499
8     85123A  Decision Tree    30.306084    10.458914
9     85123A        XGBoost    30.306083    10.458917
10    85099B          ARIMA   744.857674   597.183685
11    85099B            ETS   532.318438   436.994533
12    85099B           LSTM   661.783615   494.869968
13    85099B  Decision Tree    47.922620    13.992879
14    85099B        XGBoost    47.922622    13.992879
15     21212          ARIMA   840.751232   705.725043
16     21