In [54]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [55]:
# Workbook location, worksheet name and the columns the analysis relies on.
excel_file = 'Invoice.xlsx'
sheet_name = '1'
columns = ['Invoice Date', 'Vendor ID', 'Invoice Amount']

# Read just those columns, then discard every row that has a missing value.
df = pd.read_excel(
    excel_file,
    sheet_name=sheet_name,
    usecols=columns,
).dropna()

print(df)

    Invoice Date  Vendor ID  Invoice Amount
0     2022-06-15      65670         1732.67
1     2022-08-14      65670         2045.03
2     2022-10-13      65670         1398.03
3     2022-12-12      65670         1316.33
4     2023-02-10      65670         1755.29
..           ...        ...             ...
858   2023-11-12      65769         1308.83
859   2023-12-12      65769         1469.97
860   2024-01-11      65769         1956.70
861   2024-02-10      65769         2221.93
862   2024-03-11      65769         1410.39

[863 rows x 3 columns]


In [56]:
# Preprocess dates

def date_to_num(date_obj):
    """Convert a calendar date to a 1-based day number counted from 2000-01-01.

    2000-01-01 maps to 1, 2000-01-02 to 2, and so on.

    Parameters
    ----------
    date_obj : pandas.Timestamp, numpy.datetime64, datetime.date/datetime or str
        Any date-like value that ``pd.Timestamp`` can parse. A time-of-day
        component, if present, is ignored (whole days only).

    Returns
    -------
    int
        Number of days since 2000-01-01, plus one.

    Raises
    ------
    TypeError
        If ``date_obj`` is a plain number. ``pd.Timestamp`` would silently read
        it as nanoseconds since 1970, so an already-encoded day number is
        rejected instead of being converted a second time.
    """
    if isinstance(date_obj, (int, float, np.integer, np.floating)):
        raise TypeError(
            f"date_to_num expects a date-like value, got number {date_obj!r}"
        )
    base_date = pd.Timestamp('2000-01-01')
    # Going through pd.Timestamp gives a Timedelta (which has .days) for every
    # supported input type, not only for pandas Timestamps.
    delta_days = (pd.Timestamp(date_obj) - base_date).days
    return delta_days + 1

def date_from_num(days_count):
    """Convert a 1-based day number (2000-01-01 == 1) to a 'DD-Mon-YYYY' string.

    Parameters
    ----------
    days_count : int or float
        Day number counted from 2000-01-01. Integer-valued floats (for example
        values that went through a scaler's ``inverse_transform``) are accepted
        and snapped to the nearest whole day.

    Returns
    -------
    str
        The date formatted with ``'%d-%b-%Y'``, e.g. ``'15-Jun-2022'``.

    Raises
    ------
    ValueError
        If ``days_count`` is not within 1e-6 of a whole number of days.
    """
    whole_days = int(round(days_count))
    # Tolerate floating-point noise, but refuse genuinely fractional day counts
    # rather than silently picking a neighbouring date.
    if abs(days_count - whole_days) > 1e-6:
        raise ValueError(f"days_count must be a whole number of days, got {days_count!r}")
    base_date = np.datetime64('2000-01-01')
    target_date = base_date + np.timedelta64(whole_days - 1, 'D')
    # A day-resolution datetime64 converts to datetime.date, which has strftime.
    formatted_date = target_date.astype(object).strftime('%d-%b-%Y')
    return formatted_date

In [57]:
# Encode invoice dates as day numbers (2000-01-01 -> 1) so they can be used as
# a numeric feature.
# The column is overwritten in place, so guard on its dtype: once it already
# holds numbers, running this cell again must not try to re-encode them.
if not pd.api.types.is_numeric_dtype(df['Invoice Date']):
    df['Invoice Date'] = df['Invoice Date'].apply(date_to_num)

print(df)

     Invoice Date  Vendor ID  Invoice Amount
0            8202      65670         1732.67
1            8262      65670         2045.03
2            8322      65670         1398.03
3            8382      65670         1316.33
4            8442      65670         1755.29
..            ...        ...             ...
858          8717      65769         1308.83
859          8747      65769         1469.97
860          8777      65769         1956.70
861          8807      65769         2221.93
862          8837      65769         1410.39

[863 rows x 3 columns]


In [58]:
# Preview the first rows of df as a quick sanity check.
print(df.head())

   Invoice Date  Vendor ID  Invoice Amount
0          8202      65670         1732.67
1          8262      65670         2045.03
2          8322      65670         1398.03
3          8382      65670         1316.33
4          8442      65670         1755.29


In [68]:
# Find potential vendor IDs

# Inclusive window to scan: April 2024.
start_date = np.datetime64('2024-04-01')
end_date = np.datetime64('2024-04-30')

# Look-back distances in days. A vendor qualifies when it has an invoice dated
# exactly this many days before any day in the window.
# NOTE(review): presumably these are the invoicing cycle lengths -- confirm.
lookback_days = (15, 30, 45, 60)

# Day numbers for every day in the window, using the same encoding as the
# 'Invoice Date' column (2000-01-01 -> 1).
epoch = np.datetime64('2000-01-01')
window_dates = np.arange(start_date, end_date + np.timedelta64(1, 'D'), dtype='datetime64[D]')
window_day_counts = ((window_dates - epoch).astype(int) + 1).tolist()

# All invoice day numbers that sit one look-back distance before a window day.
candidate_days = sorted({day - offset for day in window_day_counts for offset in lookback_days})

# A single vectorised membership test replaces the per-day, per-offset filters.
potential_vendors = set(df.loc[df['Invoice Date'].isin(candidate_days), 'Vendor ID'])

In [69]:
# Show the vendor IDs collected in potential_vendors (a set, so the display
# order carries no meaning).
print(potential_vendors)

{65671, 65673, 65674, 65675, 65677, 65679, 65681, 65683, 65684, 65685, 65689, 65691, 65693, 65694, 65695, 65696, 65699, 65703, 65705, 65707, 65710, 65712, 65714, 65716, 65718, 65720, 65722, 65723, 65724, 65726, 65727, 65728, 65731, 65733, 65735, 65736, 65737, 65739, 65740, 65743, 65744, 65747, 65750, 65752, 65760, 65761, 65764, 65765, 65766, 65767, 65768, 65769}


In [77]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardise every column of df with a one-step pipeline.
# NOTE(review): the scaler is fitted on the whole data set before the
# train/test split, and it also scales the 'Vendor ID' identifier and the
# target column. Consider fitting on the training rows only.
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])

# fit_transform returns a bare array. Label it from df itself so the column
# names always match the array's column order, and keep df's index so scaled
# rows can be traced back to the original invoices.
df_scaled = pd.DataFrame(
    pipeline.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [82]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last one is the target.
*feature_names, target_name = columns
X = df_scaled[feature_names]
y = df_scaled[target_name]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each partition.
for caption, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, part.shape)

X_train shape: (690, 2)
X_test shape: (173, 2)
y_train shape: (690,)
y_test shape: (173,)


In [83]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the training rows.
# random_state is fixed because the tree builder permutes features at each
# split; without a seed, equally good splits can be chosen differently from
# run to run and the reported RMSE would not be reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

In [108]:
from sklearn.metrics import mean_squared_error

# Predictions for the held-out rows. Build the frame with the target column
# name and X_test's index up front: the previous rename(columns={'0': ...})
# never matched, because the automatic column label is the integer 0, not '0'.
y_pred = pd.DataFrame(
    {columns[-1]: tree_reg.predict(X_test)},
    index=X_test.index,
)

# Same features next to the actual target (org) and the predicted target (res).
# Both frames share X_test's index, so concat lines the rows up directly.
combined_org = pd.concat([X_test, y_test], axis=1)
combined_res = pd.concat([X_test, y_pred], axis=1)

print(combined_org)
print(combined_res)

     Invoice Date  Vendor ID  Invoice Amount
715      0.188044   1.167199        1.627016
605     -0.994922   0.746764        1.622596
120      0.340210  -1.215267        0.463345
208      0.276399  -0.864905        0.390861
380      0.934148  -0.199216       -1.324560
..            ...        ...             ...
235      1.277748  -0.759796       -0.195896
333     -0.111379  -0.409433        0.155350
215      1.307199  -0.864905       -1.708048
796      1.164851   1.447489        1.230891
7        0.467833  -1.740811        1.123752

[173 rows x 3 columns]
     Invoice Date  Vendor ID  Invoice Amount
715      0.188044   1.167199       -0.104621
605     -0.994922   0.746764        1.108295
120      0.340210  -1.215267        1.546587
208      0.276399  -0.864905       -0.977923
380      0.934148  -0.199216        0.744575
..            ...        ...             ...
235      1.277748  -0.759796       -1.149763
333     -0.111379  -0.409433       -0.175260
215      1.307199  -0.864905   

In [111]:
# Undo the standardisation so actual and predicted amounts are back in their
# original units, and wrap the resulting arrays as labelled frames.
unscaled_org = pd.DataFrame(pipeline.inverse_transform(combined_org), columns=columns)
unscaled_pred = pd.DataFrame(pipeline.inverse_transform(combined_res), columns=columns)

print(unscaled_org)
print(unscaled_pred)

# Root-mean-squared error of the predicted invoice amounts.
actual_amount = unscaled_org['Invoice Amount']
predicted_amount = unscaled_pred['Invoice Amount']
tree_mse = mean_squared_error(actual_amount, predicted_amount)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

     Invoice Date  Vendor ID  Invoice Amount
0          8565.0    65753.0         2469.78
1          8324.0    65741.0         2468.15
2          8596.0    65685.0         2040.65
3          8583.0    65695.0         2013.92
4          8717.0    65714.0         1381.32
..            ...        ...             ...
168        8787.0    65698.0         1797.54
169        8504.0    65708.0         1927.07
170        8793.0    65695.0         1239.90
171        8764.0    65761.0         2323.70
172        8622.0    65670.0         2284.19

[173 rows x 3 columns]
     Invoice Date  Vendor ID  Invoice Amount
0          8565.0    65753.0         1831.20
1          8324.0    65741.0         2278.49
2          8596.0    65685.0         2440.12
3          8583.0    65695.0         1509.15
4          8717.0    65714.0         2144.36
..            ...        ...             ...
168        8787.0    65698.0         1445.78
169        8504.0    65708.0         1805.15
170        8793.0    65695.0   

506.07679332622297