In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.regularizers import l1_l2, l2
from keras.optimizers import Adam
from keras.initializers import HeNormal
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

import sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')

In [210]:
df = pd.read_csv('veda_mc.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,amount
0,0,"Jul 30, 2023",1.0
1,1,"Aug 01, 2023",583.0
2,2,"Aug 01, 2023",93.0
3,3,"Aug 04, 2023",454.0
4,4,"Aug 09, 2023",5500.0


In [211]:
df = df.drop(columns=['Unnamed: 0'])

In [212]:
df = df.rename(columns = {'date': 'date',
          'amount': 'price'})

In [213]:
df['date'] = pd.to_datetime(df['date'])
date_range = pd.date_range(start=df['date'].min(), end=df['date'].max(), freq='D')
full_df = pd.DataFrame(date_range, columns=['date'])
df = pd.merge(full_df, df, on='date', how='left')
df['price'] = df['price'].fillna(0)

In [214]:
df = df.groupby('date').agg({
    'price': 'sum'
}).rename(columns={'price': 'total_price'})
df.reset_index(inplace=True)

In [215]:
df1 = df.copy()
df1['date'] = pd.to_datetime(df1['date'])

In [216]:
import pandas as pd

def calculate_monthly_avg(df):
    df['date'] = pd.to_datetime(df['date'])
    df['monthly_avg'] = None
    monthly_spending = {}

    for idx, row in df.iterrows():
        current_month = row['date'].month
        current_year = row['date'].year
        month_key = (current_year, current_month)

        if month_key not in monthly_spending:
            monthly_spending[month_key] = []

        monthly_spending[month_key].append(row['total_price'])
        monthly_avg = sum(monthly_spending[month_key]) / len(monthly_spending[month_key])
        
        df.at[idx, 'monthly_avg'] = monthly_avg

calculate_monthly_avg(df)


In [217]:
def extract_date_features(df, date_col):
    df[date_col] = pd.to_datetime(df[date_col])
    df['day_of_week'] = df[date_col].dt.dayofweek    
    return df
df = extract_date_features(df, 'date')
encoder = OneHotEncoder(sparse_output=False)
encoded_days = encoder.fit_transform(df[['day_of_week']])
encoded_df = pd.DataFrame(encoded_days, columns=[f'day_of_week_{i}' for i in range(1, 8)])
df = pd.concat([df, encoded_df], axis=1)
df = df.drop(columns=['day_of_week'])


In [218]:
lag_values = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
for lag in lag_values:
    df[f'lag_{lag}'] = sum([df['total_price'].shift(i) for i in range(1, lag+1)]) / lag

In [219]:
df.dropna(inplace=True)

In [220]:
df.sort_values('date', inplace=True)
df.set_index('date', inplace=True)

In [221]:
df['total_price'] = df['total_price'].apply(lambda x: min(x, 500))

In [222]:
l = []
for i in range(30):
    x = df.drop(columns=['total_price'])
    y = df['total_price']
    scale_x = StandardScaler()
    scale_y = StandardScaler()
    x_scaled = scale_x.fit_transform(x)
    y_scaled = scale_y.fit_transform(y.values.reshape(-1,1))
    x_train, x_test, y_train, y_test = train_test_split(x_scaled, y_scaled, test_size=0.2, random_state=42)
    degree = 4
    poly = PolynomialFeatures(degree=degree)
    x_poly = poly.fit_transform(x_train)
    model = Lasso(alpha=0.00275, max_iter=100, tol=0.0001, fit_intercept=True, warm_start=False, selection='random')
    model.fit(x_poly, y_train)
    x_test_poly = poly.transform(x_test)
    y_pred = model.predict(x_test_poly)
    y_pred_original = scale_y.inverse_transform(y_pred.reshape(-1,1))
    y_test_original = scale_y.inverse_transform(y_test.reshape(-1,1))
    from datetime import timedelta
    last_date = df.index[-1]
    next_day = last_date + timedelta(days=1)
    next_data = {}
    current_month = next_day.month
    monthly_data = df[df.index.month == current_month]
    next_data['monthly_avg'] = monthly_data['total_price'].mean()
    next_data['day_of_week_1'] = 1 if next_day.weekday() == 0 else 0
    next_data['day_of_week_2'] = 1 if next_day.weekday() == 1 else 0
    next_data['day_of_week_3'] = 1 if next_day.weekday() == 2 else 0
    next_data['day_of_week_4'] = 1 if next_day.weekday() == 3 else 0
    next_data['day_of_week_5'] = 1 if next_day.weekday() == 4 else 0
    next_data['day_of_week_6'] = 1 if next_day.weekday() == 5 else 0
    next_data['day_of_week_7'] = 1 if next_day.weekday() == 6 else 0
    lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
    for lag in lags:
        lag_value = df['total_price'].shift(lag).iloc[-1]
        next_data[f'lag_{lag}'] = lag_value
    next_data_df = pd.DataFrame([next_data], index=[next_day])
    next_data_scaled = scale_x.transform(next_data_df)
    next_data_scaled = poly.transform(next_data_scaled)
    predicted_value = model.predict(next_data_scaled)
    predicted_value = scale_y.inverse_transform(predicted_value.reshape(-1, 1))
    if predicted_value[0] < 0:
        predicted_value[0] = 0

    next_data_df['total_price'] = predicted_value[0]

    # Append the prediction to the dataframe
    df = pd.concat([df, next_data_df], axis=0)

    # Store the prediction and the day of the week
    l.append((predicted_value[0], next_day.weekday()))


In [223]:
l

[(array([343.82943976]), 5),
 (array([50.14272754]), 6),
 (array([157.66226229]), 0),
 (array([200.29161717]), 1),
 (array([602.6320279]), 2),
 (array([102.31631301]), 3),
 (array([5.07119223]), 4),
 (array([130.35925643]), 5),
 (array([0.]), 6),
 (array([52.46864262]), 0),
 (array([127.87278845]), 1),
 (array([458.40376561]), 2),
 (array([141.57271994]), 3),
 (array([238.33304877]), 4),
 (array([271.79312445]), 5),
 (array([0.]), 6),
 (array([214.31715173]), 0),
 (array([89.09055553]), 1),
 (array([473.04731975]), 2),
 (array([149.26480471]), 3),
 (array([199.08270196]), 4),
 (array([68.18557703]), 5),
 (array([77.90150539]), 6),
 (array([395.48105745]), 0),
 (array([187.99008811]), 1),
 (array([158.28292638]), 2),
 (array([139.13254534]), 3),
 (array([152.28062056]), 4),
 (array([0.]), 5),
 (array([164.22610957]), 6)]

In [224]:
sum_ = sum([item[0] for item in l])
average = sum_ / len(l) if len(l) > 0 else 0

In [225]:
sum_

array([5351.03188967])

In [226]:
average

array([178.36772966])

In [227]:
121

121