In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.regularizers import l1_l2, l2
from keras.optimizers import Adam
from keras.initializers import HeNormal
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

In [17]:
df = pd.read_csv('veda_mc.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,amount
0,0,"Jul 30, 2023",1.0
1,1,"Aug 01, 2023",583.0
2,2,"Aug 01, 2023",93.0
3,3,"Aug 04, 2023",454.0
4,4,"Aug 09, 2023",5500.0


In [18]:
df = df.drop(columns=['Unnamed: 0'])

In [19]:
df = df.rename(columns = {'date': 'date',
          'amount': 'price'})

In [20]:
df.shape

(1198, 2)

In [106]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from datetime import timedelta
from calendar import monthrange


class DataPreprocessor:
    def __init__(self):
        self.encoder = None
        self.feature_names = None
    
    @staticmethod
    def preprocess(df):
        print("Step 1: Converting to datetime")
        df['date'] = pd.to_datetime(df['date'])
    
        print("Step 2: Creating date range and merging")
        date_range = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
        df = (pd.DataFrame(date_range, columns=['date'])
              .merge(df, on='date', how='left')
              .fillna({'price': 0}))
        
        print("Step 3: Grouping by date")
        df = df.groupby('date')['price'].sum().rename('total_price').reset_index()
        
        print("Step 4: Monthly cumulative average")
        df['year_month'] = df['date'].dt.to_period('M')
        df['day'] = df['date'].dt.day
        df['monthly_avg'] = df.groupby('year_month')['total_price'].expanding().mean().reset_index(level=0, drop=True)
        df.drop(columns=['year_month', 'day'], inplace=True)
        
        print("Step 5: One-hot encoding day of week")
        df['day_of_week'] = df['date'].dt.dayofweek
        encoder = OneHotEncoder(sparse_output=False)
        encoded_days = encoder.fit_transform(df[['day_of_week']])
        encoded_df = pd.DataFrame(encoded_days, 
                                  columns=[f'day_of_week_{i}' for i in range(encoded_days.shape[1])])
        df = pd.concat([df, encoded_df], axis=1).drop(columns=['day_of_week'])
        
        print("Step 6: Lag features")
        lag_values = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
        for lag in lag_values:
            df[f'lag_{lag}'] = df['total_price'].rolling(lag).mean().shift(1)
    
        print("Step 7: Filling NaNs")
        df = df.fillna(0)
    
        print("Step 8: Sorting by date")
        df = df.sort_values('date').set_index('date')
    
        print("Step 9: Outlier handling")
        old_mean = df['total_price'].mean()
        filtered_df = df[df['total_price'] > 0]
        Q1 = filtered_df['total_price'].quantile(0.25)
        Q3 = filtered_df['total_price'].quantile(0.75)
        IQR = Q3 - Q1
        df['total_price'] = df['total_price'].clip(upper=Q3 + 1.5 * IQR)
        new_mean = df['total_price'].mean()
        
        print("Step 10: Done")
        return df, old_mean, new_mean


class DataPreprocessor:
    def __init__(self):
        self.encoder = None
        self.feature_names = None
    
    @staticmethod
    def training(self, df):
        x = df.drop(columns=['total_price'])
        y = df['total_price']
        scale_x = StandardScaler()
        scale_y = StandardScaler()
        x_scaled = scale_x.fit_transform(x)
        y_scaled = scale_y.fit_transform(y.values.reshape(-1,1))
        x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.25, random_state=42)
        model = ElasticNet(
            alpha=0.00001,
            l1_ratio=0.75,
            max_iter=200000,
            tol=0.0001,
            fit_intercept=True,
            random_state=42
        )
        model.fit(x_train, y_train)


In [107]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from datetime import timedelta
from calendar import monthrange

class Training_Prediction:
    def __init__(self):
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        self.model = None
        self.feature_names = None
    @staticmethod
    def traintest(self, df):
        if 'total_price' not in df.columns:
            raise ValueError("Input DataFrame must contain 'total_price' column.")
        if not isinstance(df.index, pd.DatetimeIndex):
            raise ValueError("DataFrame index must be a DatetimeIndex.")
        if df.empty:
            raise ValueError("Input DataFrame cannot be empty.")
        if len(df) < 28:
            print("Warning: DataFrame length is less than the longest lag (28 days). Lag features might be zero-filled.")

        x = df.drop(columns=['total_price'])
        y = df['total_price']
        self.feature_names = x.columns.tolist()

        x_scaled = self.x_scaler.fit_transform(x)
        y_scaled = self.y_scaler.fit_transform(y.values.reshape(-1, 1))

        self.model = ElasticNet(
            alpha=0.1,              
            l1_ratio=0.5,         
            max_iter=10000,        
            tol=1e-4,              
            fit_intercept=True,
            random_state=42)


        self.model.fit(x_scaled, y_scaled.ravel())

        if self.model is None:
            raise ValueError("Model training failed.")

        predictions_next_month = []
        temp_df = df.copy()
        last_date = temp_df.index[-1]
        start_prediction_date = last_date + timedelta(days=1)
        pred_year, pred_month = start_prediction_date.year, start_prediction_date.month
        days_in_pred_month = monthrange(pred_year, pred_month)[1]

        historical_monthly_avgs = df.groupby(df.index.month)['total_price'].mean().to_dict()
        overall_mean_fallback = df['total_price'].mean()

        for i in range(days_in_pred_month):
            current_pred_date = last_date + timedelta(days=i + 1)
            next_data_features = {}

            # Monthly average feature
            month_being_predicted = current_pred_date.month
            next_data_features['monthly_avg'] = historical_monthly_avgs.get(month_being_predicted, overall_mean_fallback)

            # Day-of-week one-hot encoding
            for d in range(7):
                next_data_features[f'day_of_week_{d}'] = 1 if current_pred_date.weekday() == d else 0

            # Lag features
            lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
            for lag in lags:
                try:
                    lag_value = temp_df['total_price'].iloc[-lag]
                except IndexError:
                    lag_value = 0
                next_data_features[f'lag_{lag}'] = lag_value

            next_data_row_df = pd.DataFrame([next_data_features], index=[current_pred_date])

            # Ensure all feature columns are present and aligned
            for col in self.feature_names:
                if col not in next_data_row_df:
                    next_data_row_df[col] = 0
            next_data_row_df = next_data_row_df[self.feature_names]

            # Scale, predict, and inverse transform
            next_data_scaled = self.x_scaler.transform(next_data_row_df)
            predicted_scaled = self.model.predict(next_data_scaled)
            predicted_value = self.y_scaler.inverse_transform(predicted_scaled.reshape(-1, 1))[0, 0]

            if predicted_value < 10:
                predicted_value = 0

            predictions_next_month.append(predicted_value)

            # Append prediction to temp_df for future lag use
            row_to_append = next_data_row_df.copy()
            row_to_append['total_price'] = predicted_value
            temp_df = pd.concat([temp_df, row_to_append], axis=0)

        predicted_sum = sum(predictions_next_month)
        predicted_average = predicted_sum / len(predictions_next_month) if predictions_next_month else 0

        return predictions_next_month, predicted_sum, predicted_average


In [91]:
# Ensure df has a DatetimeIndex
df.index = pd.to_datetime(df.index)
trainpred = Training_Prediction()
l, sum_, avg = trainpred.traintest(df)


In [92]:
old_mean

710.5037347294939

In [93]:
new_mean

573.2367364746947

In [82]:
mae_score

NameError: name 'mae_score' is not defined

In [83]:
mean_change = old_mean - new_mean

In [94]:
mean_change

137.2669982547992

In [95]:
l

[627.1586716055131,
 575.6193024952318,
 586.3408115182967,
 567.6423709957098,
 586.4822352116042,
 581.4565822570638,
 549.7948613177845,
 689.0752236132383,
 538.0444394822646,
 577.7632756346278,
 420.6695457921283,
 394.06911654586474,
 427.2215712405382,
 437.91367313166256,
 538.5459469329132,
 316.0318593388928,
 405.8963225664837,
 427.58968614318655,
 414.96928308924316,
 316.84198155469284,
 402.9853102655935,
 501.02840091066906,
 409.6748304926822,
 316.87668999830646,
 428.1732654351237,
 316.7554212033877,
 317.485834762398,
 346.0564801771027,
 559.774634659279,
 400.92846505415116,
 401.49199726127335]

In [96]:
sum_

14380.358090686908

In [97]:
avg

463.8825190544164

In [98]:
import dill
with open("preprocessing.pkl", "wb") as f:
    dill.dump(preprocessor, f)

In [99]:
with open("training_prediction.pkl", "wb") as f:
    dill.dump(trainpred, f)

In [100]:
import joblib

In [108]:
preprocessor = DataPreprocessor()
joblib.dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']

In [109]:
import pandas as pd
import sklearn
import joblib

print(f"Pandas version: {pd.__version__}")
print(f"Sklearn version: {sklearn.__version__}")
print(f"Joblib version: {joblib.__version__}")


Pandas version: 2.2.3
Sklearn version: 1.6.1
Joblib version: 1.4.2
