# In [None]:
import os
import pickle

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

from model_preparation import plotting, smape_loss, datasplit

# Read the raw demand records and the stock-price table from their CSV files.
df_origin = pd.read_csv('20230411_SummerTerm23_Data_Challenge_Infineon_Data.csv')
stockprice = pd.read_csv('stockprice.csv')

# Drop discontinued products: a product is kept only if it has a record at
# the 2023-02-01 reporting timestamp, and then all of its records are kept.
reported_at_cutoff = df_origin['reporting_time'] == '2023-02-01T00:00:00.000Z'
df_continue = df_origin[reported_at_cutoff]
is_continuing = df_origin['product_name'].isin(df_continue['product_name'])
df_continue_origin = df_origin[is_continuing]

# Number of records available for each continuing product.
product_counts = df_continue_origin['product_name'].value_counts()

# Diagnostic: how many products have fewer than 19 records. The comparison is
# evaluated on the whole Series at once, so no per-product loop is needed.
n_short_products = int((product_counts < 19).sum())
print(n_short_products)
# Containers for the modelling results. In the code shown here only
# `result_rf` is populated (product name -> [accuracy, smape, forecast]).
result_rf = dict()
result_rf_temp = list()
result_rf_less80 = dict()
result_rf_other = dict()
result_rf_stock = dict()
result_rf_stock_other = dict()

# `step` is the number of lagged observations fed to the model for each
# prediction; `future_periods` is how many periods are forecast ahead.
step, future_periods = 15, 18

def _add_lagged_features(base, n_lags):
    """Return ``base`` extended with lagged copies of every column.

    For each lag ``k`` in ``1..n_lags`` a ``<column>_<k>`` column holding the
    value from ``k`` rows earlier is appended. The layout is: the unshifted
    columns, then every lag-1 column, then every lag-2 column, and so on.
    ``_forecast_recursively`` depends on exactly this ordering.
    """
    shifted = [base.shift(lag).add_suffix(f"_{lag}") for lag in range(1, n_lags + 1)]
    # A single concat avoids inserting columns one by one into a slice of the
    # source frame (SettingWithCopy / fragmentation warnings).
    return pd.concat([base, *shifted], axis=1)


def _forecast_recursively(model, last_row, feature_columns, periods):
    """Roll ``model`` forward ``periods`` steps, feeding predictions back in.

    ``last_row`` is the final row of the lagged frame: current values first,
    followed by the lag-1 block, the lag-2 block, ... Cutting the oldest lag
    block off its tail yields the inputs for the next period (current values
    become lag 1, lag 1 becomes lag 2, ...). Returns a list with one float
    array per forecast period, laid out like ``last_row``.
    """
    n_inputs = len(feature_columns)
    state = np.asarray(last_row, dtype=float)
    rows = []
    for _ in range(periods):
        inputs = state[:n_inputs]
        step_frame = pd.DataFrame([inputs], columns=feature_columns)
        # reshape(-1) also covers a single-target model, whose predict()
        # returns a 1-D array instead of shape (1, n_targets).
        predicted = np.asarray(model.predict(step_frame), dtype=float).reshape(-1)
        state = np.concatenate([predicted, inputs])
        rows.append(state)
    return rows


# Number of columns (demand included) kept per product, ranked by their
# correlation with demand.
N_FEATURES = 8

# Create the output folders up front so that saving the first model or plot
# cannot fail on a missing directory.
for out_dir in ("RF_model", "RF_plot"):
    os.makedirs(out_dir, exist_ok=True)

for name in tqdm(df_continue_origin['product_name'].unique()):
    # Only model products with more than 21 records; with `step` lags the
    # earlier rows are lost to NaNs and a train/test split must still remain.
    if product_counts[name] <= 21:
        continue

    df_product = df_continue_origin.loc[df_continue_origin['product_name'] == name]
    df_product = df_product.sort_values(by='reporting_time', ascending=True)

    # Rank the columns by their correlation with demand and keep the top
    # N_FEATURES. Non-numeric columns (e.g. the string reporting_time) are
    # excluded explicitly because DataFrame.corr() raises on them in
    # pandas >= 2.0, where numeric_only defaults to False.
    corr_matrix = df_product.select_dtypes(include=['number', 'bool']).corr()
    demand_corr = corr_matrix.sort_values(by=['demand'], ascending=False)
    index_list = demand_corr.index.tolist()[:N_FEATURES]
    if 'demand' not in index_list:
        # The target must always be modelled; it can fall past the cut-off
        # when its correlations are NaN (NaNs are sorted last).
        index_list = ['demand'] + index_list[:N_FEATURES - 1]
    df_copy = df_product[index_list].copy(deep=True)
    target_columns = df_copy.columns.tolist()

    # Create lagged features
    df = _add_lagged_features(df_copy, step)

    # Chronological split: first 80% of the rows for training, rest for testing.
    train_size = int(len(df) * 0.8)
    train_data = df.iloc[:train_size]
    test_data = df.iloc[train_size:]
    # Rows with incomplete lag history (or missing values) cannot be used.
    train_rows = train_data.dropna()
    test_rows = test_data.dropna()
    if train_rows.empty or test_rows.empty:
        tqdm.write(f"Skipping {name}: no complete rows left for training or testing")
        continue

    # Inputs are the lagged columns only; targets are the unshifted columns.
    X_train = train_rows.drop(columns=target_columns)
    y_train = train_rows[target_columns]
    X_test = test_rows.drop(columns=target_columns)
    y_test = test_rows[target_columns]

    # Train the Random Forest model and persist it per product.
    rf_model = RandomForestRegressor()
    rf_model.fit(X_train, y_train)
    joblib.dump(rf_model, f"RF_model/{name}.joblib")

    # Predict the test rows. The predictions reuse y_test's index so that a
    # metric which aligns pandas objects by label compares matching rows
    # (NOTE(review): smape_loss is defined elsewhere; confirm how it aligns).
    predictions = pd.DataFrame(
        rf_model.predict(X_test), columns=y_test.columns, index=y_test.index)

    # Calculate the smape
    smape = smape_loss(y_test['demand'], predictions['demand'])
    accuracy = 1 - smape

    # Positional x-axis: observed rows first, forecast periods right after.
    actual = df_copy.reset_index(drop=True)['demand']
    n_obs = len(actual)
    predict = pd.Series(
        predictions['demand'].to_numpy(),
        index=range(n_obs - len(predictions), n_obs),
        name='demand')

    # Multi-step forecast starting from the last observed row.
    future_rows = _forecast_recursively(
        rf_model, df.iloc[-1], X_train.columns, future_periods)
    future_all = pd.DataFrame(
        future_rows, columns=df.columns,
        index=range(n_obs, n_obs + future_periods))

    fig = plt.figure(figsize=(12, 6))
    plt.plot(actual, label="Actual Demand")
    plt.plot(predict, label="Predicted Demand")
    plt.plot(future_all['demand'], label="Future Demand")
    plt.xlabel("Months")
    plt.ylabel("Demand")
    plt.title(
        f"Actual vs Predicted Demand: {name}, "
        f"test accuracy: {round(100.00 * float(accuracy), 2)}%")
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.savefig(f"RF_plot/{name}.png", format='png')
    # pyplot keeps every figure alive until it is closed; with one figure per
    # product that would accumulate memory over the whole run.
    plt.close(fig)

    result_rf[name] = [accuracy, smape, future_all]
# Persist the per-product results. The folder is created first so that a
# missing directory cannot discard the results once all training has finished.
os.makedirs("RF_results", exist_ok=True)
with open("RF_results/result.pkl", 'wb') as f:
    pickle.dump(result_rf, f)