In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.regularizers import l1_l2, l2
from keras.optimizers import Adam
from keras.initializers import HeNormal
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('veda_mc.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,amount
0,0,"Jul 30, 2023",1.0
1,1,"Aug 01, 2023",583.0
2,2,"Aug 01, 2023",93.0
3,3,"Aug 04, 2023",454.0
4,4,"Aug 09, 2023",5500.0


In [30]:
df = df.drop(columns=['Unnamed: 0'])

In [31]:
df = df.rename(columns = {'date': 'date',
          'amount': 'price'})

In [32]:
df.shape

(1198, 2)

In [33]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

class DataPreprocessor:
    def __init__(self):
        self.encoder = None
        self.feature_names = None
    
    @staticmethod
    def preprocess(df):
        df['date'] = pd.to_datetime(df['date'])
        date_range = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
        df = (pd.DataFrame(date_range, columns=['date'])
              .merge(df, on='date', how='left')
              .fillna({'price': 0}))
        df = df.groupby('date')['price'].sum().rename('total_price').reset_index()
        
        # Monthly cumulative average calculation
        df['year_month'] = df['date'].dt.to_period('M')
        df['day'] = df['date'].dt.day
        df['monthly_avg'] = df.groupby('year_month')['total_price'].expanding().mean().reset_index(level=0, drop=True)
        df.drop(columns=['year_month', 'day'], inplace=True)
        
        df['day_of_week'] = df['date'].dt.dayofweek
        encoder = OneHotEncoder(sparse_output=False)
        encoded_days = encoder.fit_transform(df[['day_of_week']])
        encoded_df = pd.DataFrame(encoded_days, 
                                  columns=[f'day_of_week_{i}' for i in range(encoded_days.shape[1])])
        df = pd.concat([df, encoded_df], axis=1).drop(columns=['day_of_week'])
        
        lag_values = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
        for lag in lag_values:
            df[f'lag_{lag}'] = df['total_price'].rolling(lag).mean().shift(1)
        
        df = df.dropna().sort_values('date').set_index('date')
        
        # Compute old mean before outlier removal
        old_mean = df['total_price'].mean()
        
        # Outlier removal using IQR
        filtered_df = df[df['total_price'] > 0]
        Q1 = filtered_df['total_price'].quantile(0.25)
        Q3 = filtered_df['total_price'].quantile(0.75)
        IQR = Q3 - Q1
        df.loc[df['total_price'] > Q3 + 1.5*IQR, 'total_price'] = Q3 + 1.5*IQR
        
        # Compute new mean after outlier removal
        new_mean = df['total_price'].mean()
        
        return df, old_mean, new_mean

preprocessor = DataPreprocessor()
df, old_mean, new_mean = preprocessor.preprocess(df)


class DataPreprocessor:
    def __init__(self):
        self.encoder = None
        self.feature_names = None
    
    @staticmethod
    def training(self, df):
        x = df.drop(columns=['total_price'])
        y = df['total_price']
        scale_x = StandardScaler()
        scale_y = StandardScaler()
        x_scaled = scale_x.fit_transform(x)
        y_scaled = scale_y.fit_transform(y.values.reshape(-1,1))
        x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.25, random_state=42)
        model = ElasticNet(
            alpha=0.00001,
            l1_ratio=0.75,
            max_iter=200000,
            tol=0.0001,
            fit_intercept=True,
            random_state=42
        )
        model.fit(x_train, y_train)


In [53]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from calendar import monthrange
from datetime import timedelta

class Training_Prediction:
    def __init__(self):
        self.x_scaler = StandardScaler()
        self.y_scaler = StandardScaler()
        self.model = None
        self.feature_names = None

    def traintest(self, df):
        x = df.drop(columns=['total_price'])
        y = df['total_price']
        self.feature_names = x.columns.tolist()
        x_scaled = self.x_scaler.fit_transform(x)
        y_scaled = self.y_scaler.fit_transform(y.values.reshape(-1, 1))
        
        self.model = ElasticNet(
            alpha=0.00001,
            l1_ratio=0.75,
            max_iter=200000,
            tol=0.0001,
            fit_intercept=True,
            random_state=42
        )
        self.model.fit(x_scaled, y_scaled)

        print("Model training complete.")
        
        if self.model is None:
            raise ValueError("Model is not trained yet. Call traintest(df) first.")
        
        l = []
        last_date = df.index[-1]
        next_day = last_date + timedelta(days=1)
        year, month = next_day.year, next_day.month
        days_in_month = monthrange(year, month)[1]

        mean_change = df['total_price'].diff().mean()

        for i in range(days_in_month):
            next_day = df.index[-1] + timedelta(days=1)
            next_data = {}
            current_month = next_day.month
            monthly_data = df[df.index.month == current_month]
            next_data['monthly_avg'] = monthly_data['total_price'].mean()
            
            for d in range(7):
                next_data[f'day_of_week_{d}'] = 1 if next_day.weekday() == d else 0
            
            lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
            for lag in lags:
                lag_value = df['total_price'].shift(lag).iloc[-1]
                next_data[f'lag_{lag}'] = lag_value
            
            next_data_df = pd.DataFrame([next_data], index=[next_day])
            for col in self.feature_names:
                if col not in next_data_df:
                    next_data_df[col] = 0  # Fill missing features with 0
            
            next_data_df = next_data_df[self.feature_names]
            next_data_scaled = self.x_scaler.transform(next_data_df)
            predicted_value = self.model.predict(next_data_scaled)
            predicted_value = self.y_scaler.inverse_transform(predicted_value.reshape(-1, 1))
            
            if predicted_value[0] < 10:
                predicted_value[0] = 0 
            
            next_data_df['total_price'] = predicted_value[0]
            df = pd.concat([df, next_data_df], axis=0)
            l.append((predicted_value[0], next_day.weekday()))

        sum_ = sum(item[0] for item in l) + days_in_month * mean_change
        average = sum_ / len(l) if len(l) > 0 else 0        
        return l, sum_, average

trainpred = Training_Prediction()
l, sum_, avg= trainpred.traintest(df)

Model training complete.


In [54]:
old_mean

710.5037347294939

In [55]:
new_mean

573.2367364746947

In [56]:
mae_score

505.74682524656305

In [57]:
mean_change = old_mean - new_mean

In [58]:
mean_change

137.2669982547992

l = []
from calendar import monthrange
from datetime import timedelta

last_date = df.index[-1]
next_day = last_date + timedelta(days=1)
year, month = next_day.year, next_day.month 
days_in_month = monthrange(year, month)[1]
from datetime import timedelta
for i in range(days_in_month):
    last_date = df.index[-1]
    next_day = last_date + timedelta(days=1)
    next_data = {}
    current_month = next_day.month
    monthly_data = df[df.index.month == current_month]
    next_data['monthly_avg'] = monthly_data['total_price'].mean()
    next_data['day_of_week_1'] = 1 if next_day.weekday() == 0 else 0
    next_data['day_of_week_2'] = 1 if next_day.weekday() == 1 else 0
    next_data['day_of_week_3'] = 1 if next_day.weekday() == 2 else 0
    next_data['day_of_week_4'] = 1 if next_day.weekday() == 3 else 0
    next_data['day_of_week_5'] = 1 if next_day.weekday() == 4 else 0
    next_data['day_of_week_6'] = 1 if next_day.weekday() == 5 else 0
    next_data['day_of_week_7'] = 1 if next_day.weekday() == 6 else 0
    lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
    for lag in lags:
        lag_value = df['total_price'].shift(lag).iloc[-1]
        next_data[f'lag_{lag}'] = lag_value
    next_data_df = pd.DataFrame([next_data], index=[next_day])
    next_data_scaled = scale_x.transform(next_data_df)
    predicted_value = model.predict(next_data_scaled)
    predicted_value = scale_y.inverse_transform(predicted_value.reshape(-1, 1))
    if predicted_value[0] < 10:
        predicted_value[0] = 0

    next_data_df['total_price'] = predicted_value[0]
    df = pd.concat([df, next_data_df], axis=0)
    l.append((predicted_value[0], next_day.weekday()))
sum_ = sum([item[0] for item in l]) + days_in_month * mean_change
average = sum_ / len(l) if len(l) > 0 else 0

In [59]:
l

[(array([1331.85006296]), 5),
 (array([0.]), 6),
 (array([1880.38382504]), 0),
 (array([0.]), 1),
 (array([1409.6175796]), 2),
 (array([474.86815696]), 3),
 (array([680.65743786]), 4),
 (array([846.88638086]), 5),
 (array([0.]), 6),
 (array([796.2489132]), 0),
 (array([0.]), 1),
 (array([841.74094121]), 2),
 (array([680.78003287]), 3),
 (array([323.89279763]), 4),
 (array([1248.53375571]), 5),
 (array([0.]), 6),
 (array([164.9322375]), 0),
 (array([0.]), 1),
 (array([557.28879238]), 2),
 (array([273.33322158]), 3),
 (array([0.]), 4),
 (array([290.17655641]), 5),
 (array([0.]), 6),
 (array([627.16697182]), 0),
 (array([0.]), 1),
 (array([571.70014143]), 2),
 (array([0.]), 3),
 (array([0.]), 4),
 (array([398.40159618]), 5),
 (array([0.]), 6),
 (array([889.63653105]), 0)]

In [None]:
sum_

In [None]:
average

In [None]:
from sklean.metr

In [50]:
import dill
with open("preprocessing.pkl", "wb") as f:
    dill.dump(preprocessor, f)

In [51]:
with open("training_prediction.pkl", "wb") as f:
    dill.dump(trainpred, f)