## VNR Revenue Prediction using different ML Algorithms

In [0]:
import datetime
import json
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import seaborn as sns
import statsmodels.api as sm
from cycler import cycler
from matplotlib import rcParams
from pandas.tseries.offsets import DateOffset
from scipy import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.seasonal import seasonal_decompose
from xgboost import XGBRegressor

In [0]:
def upload_file(path):
    """Read the semicolon-delimited export at ``path`` into a date-indexed frame.

    The ``SNAPSHOT_PERIOD_END`` strings (format ``%d%b%Y:%H:%M:%S``) are parsed
    to ``datetime.date`` values, which become the sorted index named
    ``Timestamp``.  ``SUM_of_BR_BILLED_AMT`` is renamed to ``Revenue`` and
    divided by 1,000,000.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        All input columns except ``SNAPSHOT_PERIOD_END``, ordered by date.
    """
    raw = pd.read_csv(path, sep=";")
    period_ends = [
        datetime.datetime.strptime(stamp, "%d%b%Y:%H:%M:%S").date()
        for stamp in raw['SNAPSHOT_PERIOD_END']
    ]
    frame = (
        raw.assign(Timestamp=period_ends)
           .sort_values(by='Timestamp')
           .set_index('Timestamp')
           .drop(columns=['SNAPSHOT_PERIOD_END'])
           .rename(columns={"SUM_of_BR_BILLED_AMT": "Revenue"})
    )
    frame['Revenue'] = frame['Revenue'] / 1000000
    return frame

# Load the dataset through upload_file and preview up to the first 100 rows.
data= upload_file("./dataset2.csv")
data.head(100)

In [0]:
# Summary of the frame: index, columns, dtypes and non-null counts.
data.info()

In [0]:
# Count of missing values per column.
data.isna().sum()

Timestamp will be shifted by 3 months (concordance with Post_period)

In [0]:
# One panel per series, sharing the time axis: Revenue, BR count, organisation count.
sns.set(style="whitegrid")
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6), sharex=True)
panels = [
    ('Revenue', {'color': 'orange'}),
    ('COUNT_BR_NMBR', {}),
    ('COUNT_GC_ORG', {'color': 'black'}),
]
for axis, (column, style) in zip(axes, panels):
    sns.lineplot(ax=axis, data=data, x='Timestamp', y=column, linewidth=2, **style)


In [0]:
# (rows, columns) of the loaded frame.
data.shape

In [0]:
# Numeric columns of interest; data_numeric is an independent copy of that subset.
numeric_col = ['Revenue','COUNT_BR_NMBR','COUNT_GC_ORG','POST_PERIOD']
data_numeric = data[numeric_col].copy()

In [0]:
# Preview the numeric subset.
data_numeric.head()

In [0]:
# Preview the full frame (still includes the FY column at this point).
data.head()

In [0]:
# Bar chart of Revenue grouped by fiscal year (FY).
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data, x='FY', y='Revenue', ax=ax)
ax.grid()
ax.set_title('Revenue per FY', fontsize=10)
ax.set_ylabel('Revenue', fontsize=10)
ax.set_xlabel('FY', fontsize=10)

plt.show()

In [0]:
# Distribution of the number of BRs per period.
# The plot is univariate, so the y-axis is a density, not Revenue; the title
# and y-label were corrected accordingly.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot; kept here because the installed seaborn version is not
# visible from this notebook - confirm before upgrading.
plt.figure(figsize=(5,5))
sns.distplot(data['COUNT_BR_NMBR'])
plt.title('Distribution of Number of BR',fontsize=15)
plt.xlabel('COUNT_BR_NMBR',fontsize=14)
plt.ylabel('Density',fontsize=14)
plt.show()

In [0]:
# Distribution of the number of organisations per period.
# The plot is univariate, so the y-axis is a density, not Revenue; the title,
# y-label and the copy-pasted header comment were corrected accordingly.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot; kept here because the installed seaborn version is not
# visible from this notebook - confirm before upgrading.
plt.figure(figsize=(5,5))
sns.distplot(data['COUNT_GC_ORG'])
plt.title('Distribution of Number of organization',fontsize=15)
plt.xlabel('COUNT_GC_ORG',fontsize=14)
plt.ylabel('Density',fontsize=14)

plt.show()

# Time series decomposition
Time Series Decomposition is a technique to extract multiple types of variation from your dataset. There are three important components in the temporal data of a time series: seasonality, trend, and noise.

## Seasonality :
is a recurring movement that is present in your time series variable. For example, the temperature of a place will be higher in the summer months and lower in the winter months. You could compute average monthly temperatures and use this seasonality as a basis for forecasting future values.
## A trend :
can be a long-term upward or downward pattern. In a temperature time series, a trend could be present due to global warming. For example, on top of the summer/winter seasonality, you may well see a slight increase in average temperatures over time.
## Noise:
 is the part of the variability in a time series that can neither be explained by seasonality nor by a trend. When building models, you end up combining different components into a mathematical formula. Two parts of such a formula can be seasonality and trend. A model that combines both will never represent the values of temperature perfectly: an error will always remain. This is represented by the noise factor.

In [0]:
# Additive decomposition of Revenue with a 12-observation seasonal period.
additive_parts = seasonal_decompose(data['Revenue'], model='additive', period=12)
additive_parts.plot()
#plt.savefig('seasonal_decompose.png')
plt.show()

In [0]:
# Multiplicative decomposition of Revenue with a 12-observation seasonal period.
multiplicative_parts = seasonal_decompose(data['Revenue'], model='multiplicative', period=12)
multiplicative_parts.plot()
plt.show()

### One Hot Encoding

In [0]:
# Column dtypes, used to decide which columns need encoding.
data.dtypes

Here, `FY` is the only categorical column; it is one-hot encoded below.

In [0]:
# Categorical columns to one-hot encode; data_cat is an independent copy of them.
cat_col = ['FY']
data_cat = data[cat_col].copy()


In [0]:
# Replace each categorical column with one indicator column per distinct value.
data_cat = pd.get_dummies(data_cat,columns=cat_col)
data_cat.head()

In [0]:
# Append the one-hot columns to the main frame.  Indicator columns left over
# from an earlier run of this cell are dropped first, so re-running the cell
# does not create duplicate column names.
data = pd.concat([data.drop(columns=data_cat.columns, errors='ignore'), data_cat],axis=1)

In [0]:
# Shape after appending the one-hot columns.
data.shape

In [0]:
# Remove the original categorical columns now that they are one-hot encoded.
# Rebinding (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run is a no-op rather than a KeyError.
data = data.drop(columns=cat_col, errors='ignore')

In [0]:
# Shape after removing the original categorical columns.
data.shape

#### Data Normalization

In [0]:
# Numeric columns used for the correlation matrix below.
# NOTE(review): same list as numeric_col defined earlier - consider keeping a single name.
num_col = ['Revenue','COUNT_BR_NMBR','COUNT_GC_ORG','POST_PERIOD']

In [0]:
#minmax_scale = MinMaxScaler(feature_range=(0, 1))
#def normalization(data,col):
  #for i in col:
  #  arr = data[i]
  #  arr = np.array(arr)
  #  data[i] = minmax_scale.fit_transform(arr.reshape(len(arr),1))
  #return data



In [0]:
# Normalisation is currently disabled (call left commented out); the frame is shown unscaled.
#data = normalization(data.copy(),num_col)
data.head()

In [0]:
# inverse transform 

#def invert_normalization(data,col):
  #for i in col:
    #arr = data[i]
    #print(data[i])
    #arr = np.array(arr)
    #data[i] = minmax_scale.inverse_transform(arr.reshape(len(arr),1))
    #print(data[i])
  #return data

#test_data= invert_normalization(data, num_col)
#test_data.head(12)

**Finding Correlation between features**

In [0]:
# Pairwise correlation of the numeric columns, annotated and saved to disk.
fig, ax = plt.subplots(figsize=(15, 8))
corr = data[num_col].corr()
sns.heatmap(corr, vmax=1.0, annot=True, ax=ax)
ax.set_title('Correlation Matrix', fontsize=16)
fig.savefig('correlation_matrix.png')
plt.show()

# Building the models and use of regression metrics (predicting in a continuous range)

**Splitting data into train and test data**

## Prepare data frame for time-series split
- Set the data frame index to the timestamp if it is not already.
- Sort the data frame by time: this is essential before a time-series split.
- Prepare the feature data frame (X) and the target series (Y).

In [0]:
from sklearn.model_selection import TimeSeriesSplit


# Five ordered folds over the rows; features are every column except Revenue.
tss = TimeSeriesSplit(n_splits=5)
X = data.drop(columns=['Revenue'])
Y = data['Revenue']

# Print each fold's row positions to show how TimeSeriesSplit divides the data.
# The *_train / *_test names are overwritten on every pass, so after the loop
# they hold the last fold.
for fold, (train_index, test_index) in enumerate(tss.split(X)):
    print(f"Fold: {fold}")
    print("TRAIN indices:", train_index, "\n", "TEST indices:", test_index)
    print("\n")
    X_train, y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], Y.iloc[test_index]


## Split train test sets for both features and targets
time-series class (tss) class returns two arrays to mark train and test sets

In a time-series split, test sets are always “younger” (later) than their training sets. Nothing is shuffled: the folds differ only in the size of the training window, which grows with each split.
Split train test sets for both features and targets
time-series class (tss) class returns two arrays to mark train and test sets

In [0]:
# X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.30, random_state=50)

## 1. Linear Regression Model

In [0]:
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

regressor = LinearRegression()

# Only the model fitted on the final (largest) split is used by the cells
# below, so fit once on that split instead of refitting on every fold and
# discarding all but the last result.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

# The target is min-max scaled for fitting and mapped back on predict.
lr = TransformedTargetRegressor(regressor=regressor,
                                transformer=MinMaxScaler()
                                ).fit(X_train, y_train)




## Metrics using Time Series Cross-Validation

In [0]:

# Run the time-series cross-validation once and read every metric from the
# same set of fits (previously each metric triggered its own CV run).
lr_cv = cross_validate(lr, X, Y, cv=tss,
                       scoring={'MSE': 'neg_mean_squared_error',
                                'MAE': 'neg_mean_absolute_error',
                                'R2': 'r2'})
# scikit-learn reports errors as negated scores; flip the sign back.
linear_regression_MSE = -lr_cv['test_MSE']
linear_regression_RMSE = np.sqrt(linear_regression_MSE)
linear_regression_MAE = -lr_cv['test_MAE']
linear_regression_R2 = lr_cv['test_R2']

# The default score of a regressor is R2, so reuse the values computed above.
scores = linear_regression_R2
print("Mean cross-validation score: %.2f" % scores.mean())

print(f"RMSE: {linear_regression_RMSE.mean()} (+/- {linear_regression_RMSE.std()})")
print(f"\nMSE: {linear_regression_MSE.mean()} (+/- {linear_regression_MSE.std()})")
print(f"\nMAE: {linear_regression_MAE.mean()} (+/- {linear_regression_MAE.std()})")
print(f"\nR2: {linear_regression_R2.mean()} (+/- {linear_regression_R2.std()})")

In [0]:
# Predict on the last test fold and line the predictions up against the actual values.
y_pred = lr.predict(X_test)
lr_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
lr_df.head()


In [0]:
# Linear regression: predicted vs. actual Revenue on the last test fold.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Comparison between actual and predicted values', fontsize=16)
ax.plot(lr.predict(X_test), label="prediction", linewidth=3.0, color='blue')
ax.plot(y_test.values, label="real_values", linewidth=3.0, color='red')
ax.legend(loc="best")
plt.show()

# 2. Random Forest Regressor

In [0]:

# Fixed seed so the forest (and every number derived from it) is reproducible
# across kernel restarts.
regressor = RandomForestRegressor(random_state=42)

# Only the model fitted on the final (largest) split is used by the cells
# below, so fit once on that split instead of refitting on every fold and
# discarding all but the last result.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

# The target is min-max scaled for fitting and mapped back on predict.
rf = TransformedTargetRegressor(regressor=regressor,
                                transformer=MinMaxScaler()
                                ).fit(X_train, y_train)

In [0]:
# Run the time-series cross-validation once and read every metric from the
# same set of fits.  Separate CV runs per metric refit the forest each time,
# so RMSE and MSE could come from different models.
rf_cv = cross_validate(rf, X, Y, cv=tss,
                       scoring={'MSE': 'neg_mean_squared_error',
                                'MAE': 'neg_mean_absolute_error',
                                'R2': 'r2'})
# scikit-learn reports errors as negated scores; flip the sign back.
random_forest_MSE = -rf_cv['test_MSE']
random_forest_RMSE = np.sqrt(random_forest_MSE)
random_forest_MAE = -rf_cv['test_MAE']
random_forest_R2 = rf_cv['test_R2']

print(f"RMSE: {random_forest_RMSE.mean()} (+/- {random_forest_RMSE.std()})")
print(f"\nMSE: {random_forest_MSE.mean()} (+/- {random_forest_MSE.std()})")
print(f"\nMAE: {random_forest_MAE.mean()} (+/- {random_forest_MAE.std()})")
print(f"\nR2: {random_forest_R2.mean()} (+/- {random_forest_R2.std()})")

In [0]:
# Predict on the last test fold and line the predictions up against the actual values.
y_pred = rf.predict(X_test)
rf_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
rf_df.head()

In [0]:
# Random forest: predicted vs. actual Revenue on the last test fold.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Comparison between actual and predicted values', fontsize=16)
ax.plot(rf.predict(X_test), label="prediction", linewidth=3.0, color='blue')
ax.plot(y_test.values, label="real_values", linewidth=3.0, color='red')
ax.legend(loc="best")
# plt.savefig('rf_real_pred.png')
plt.show()

# 3.   KNeighborsRegressor (KNN)

In [0]:
regressor = KNeighborsRegressor(n_neighbors = 1,weights = 'uniform')

# Only the model fitted on the final (largest) split is used by the cells
# below, so fit once on that split instead of refitting on every fold and
# discarding all but the last result.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

# The target is min-max scaled for fitting and mapped back on predict.
knn = TransformedTargetRegressor(regressor=regressor,
                                 transformer=MinMaxScaler()
                                 ).fit(X_train, y_train)

In [0]:
# Run the time-series cross-validation once and read every metric from the
# same set of fits (previously each metric triggered its own CV run).
knn_cv = cross_validate(knn, X, Y, cv=tss,
                        scoring={'MSE': 'neg_mean_squared_error',
                                 'MAE': 'neg_mean_absolute_error',
                                 'R2': 'r2'})
# scikit-learn reports errors as negated scores; flip the sign back.
knn_MSE = -knn_cv['test_MSE']
knn_RMSE = np.sqrt(knn_MSE)
knn_MAE = -knn_cv['test_MAE']
knn_R2 = knn_cv['test_R2']

print(f"RMSE: {knn_RMSE.mean()} (+/- {knn_RMSE.std()})")
print(f"\nMSE: {knn_MSE.mean()} (+/- {knn_MSE.std()})")
print(f"\nMAE: {knn_MAE.mean()} (+/- {knn_MAE.std()})")
print(f"\nR2: {knn_R2.mean()} (+/- {knn_R2.std()})")

In [0]:
# Predict on the last test fold and line the predictions up against the actual values.
y_pred = knn.predict(X_test)
knn_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
knn_df.head()

In [0]:
# KNN: predicted vs. actual Revenue on the last test fold.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Comparison between actual and predicted values', fontsize=16)
ax.plot(knn.predict(X_test), label="prediction", linewidth=3.0, color='blue')
ax.plot(y_test.values, label="real_values", linewidth=3.0, color='red')
ax.legend(loc="best")
plt.show()

# 4. XGBoost Regressor

In [0]:
regressor = XGBRegressor()

# Only the model fitted on the final (largest) split is used by the cells
# below, so fit once on that split instead of refitting on every fold and
# discarding all but the last result.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

# The target is min-max scaled for fitting and mapped back on predict.
xgbr = TransformedTargetRegressor(regressor=regressor,
                                  transformer=MinMaxScaler()
                                  ).fit(X_train, y_train)
    

In [0]:
# Run the time-series cross-validation once and read every metric from the
# same set of fits (previously each metric triggered its own CV run).
xgbr_cv = cross_validate(xgbr, X, Y, cv=tss,
                         scoring={'MSE': 'neg_mean_squared_error',
                                  'MAE': 'neg_mean_absolute_error',
                                  'R2': 'r2'})
# scikit-learn reports errors as negated scores; flip the sign back.
xgbr_MSE = -xgbr_cv['test_MSE']
xgbr_RMSE = np.sqrt(xgbr_MSE)
xgbr_MAE = -xgbr_cv['test_MAE']
xgbr_R2 = xgbr_cv['test_R2']

print(f"RMSE: {xgbr_RMSE.mean()} (+/- {xgbr_RMSE.std()})")
print(f"\nMSE: {xgbr_MSE.mean()} (+/- {xgbr_MSE.std()})")
print(f"\nMAE: {xgbr_MAE.mean()} (+/- {xgbr_MAE.std()})")
print(f"\nR2: {xgbr_R2.mean()} (+/- {xgbr_R2.std()})")


In [0]:
# Predict on the last test fold and line the predictions up against the actual values.
y_pred = xgbr.predict(X_test)
xgb_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
xgb_df.head()

In [0]:
# XGBoost: predicted vs. actual Revenue on the last test fold.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Comparison between actual and predicted values', fontsize=10)
ax.plot(xgbr.predict(X_test), label="prediction", linewidth=3.0, color='blue')
ax.plot(y_test.values, label="real_values", linewidth=3.0, color='red')
ax.legend(loc="best")

plt.show()

In [0]:

# 5. Decision Tree Regressor
# Fixed seed so the tree (and every number derived from it) is reproducible
# across kernel restarts.
regressor = DecisionTreeRegressor(random_state=42)

# Only the model fitted on the final (largest) split is used by the cells
# below, so fit once on that split instead of refitting on every fold and
# discarding all but the last result.
train_index, test_index = list(tss.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

# The target is min-max scaled for fitting and mapped back on predict.
dt = TransformedTargetRegressor(regressor=regressor,
                                transformer=MinMaxScaler()
                                ).fit(X_train, y_train)

In [0]:
# Run the time-series cross-validation once and read every metric from the
# same set of fits.  Separate CV runs per metric refit the tree each time,
# so RMSE and MSE could come from different models.
dt_cv = cross_validate(dt, X, Y, cv=tss,
                       scoring={'MSE': 'neg_mean_squared_error',
                                'MAE': 'neg_mean_absolute_error',
                                'R2': 'r2'})
# scikit-learn reports errors as negated scores; flip the sign back.
dt_MSE = -dt_cv['test_MSE']
dt_RMSE = np.sqrt(dt_MSE)
dt_MAE = -dt_cv['test_MAE']
dt_R2 = dt_cv['test_R2']

print(f"RMSE: {dt_RMSE.mean()} (+/- {dt_RMSE.std()})")
print(f"\nMSE: {dt_MSE.mean()} (+/- {dt_MSE.std()})")
print(f"\nMAE: {dt_MAE.mean()} (+/- {dt_MAE.std()})")
print(f"\nR2: {dt_R2.mean()} (+/- {dt_R2.std()})")


In [0]:
# Predict on the last test fold and line the predictions up against the actual values.
y_pred = dt.predict(X_test)
dt_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
dt_df.head()

In [0]:
# Decision tree: predicted vs. actual Revenue on the last test fold.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Comparison between actual and predicted values', fontsize=10)
ax.plot(dt.predict(X_test), label="prediction", linewidth=3.0, color='blue')
ax.plot(y_test.values, label="real_values", linewidth=3.0, color='red')
ax.legend(loc="best")

plt.show()

In [0]:
# Per-model fold scores in the order (MAE, MSE, RMSE, R2).
cv_results = {
    'linear_regression': (linear_regression_MAE, linear_regression_MSE,
                          linear_regression_RMSE, linear_regression_R2),
    'random_forest': (random_forest_MAE, random_forest_MSE,
                      random_forest_RMSE, random_forest_R2),
    'knn': (knn_MAE, knn_MSE, knn_RMSE, knn_R2),
    'xgbr': (xgbr_MAE, xgbr_MSE, xgbr_RMSE, xgbr_R2),
    'dt': (dt_MAE, dt_MSE, dt_RMSE, dt_R2),
}

# One column per metric, holding each model's mean across folds.
error_metrics = {'model': list(cv_results)}
for position, metric in enumerate(('MAE', 'MSE', 'RMSE', 'R2')):
    error_metrics[metric] = [fold_scores[position].mean()
                             for fold_scores in cv_results.values()]

In [0]:
# Tabulate the mean cross-validation metrics, one row per model.
acc_df = pd.DataFrame(error_metrics)
acc_df

In [0]:
# 2x2 grid with one panel per metric, comparing the models.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8))

for axis, metric in zip(axes.flat, ('MAE', 'MSE', 'RMSE', 'R2')):
    sns.lineplot(ax=axis, x='model', y=metric, data=acc_df)

plt.show()

The model with the lowest average MSE, MAE and RMSE is the random forest

# How to predict the next Period with XGBoost

Using Prophet to predict the Number of BRs and the company Number.
Predict the next 12 periods using the prophet results and XGboost ML process.

Applying Prophet to BRs Number and predict 12 future periods

In [0]:
from fbprophet import Prophet,diagnostics
from sklearn.preprocessing import MinMaxScaler
pred_periods = 12

# Same file as the initial load.  The path previously differed in letter case
# ("./Dataset2.csv"), which fails on case-sensitive file systems.
df = upload_file("./dataset2.csv")

# Prophet expects exactly two columns: 'ds' (date) and 'y' (value to forecast).
# Here y is the BR count; every other column is dropped.
df_pro = (
    df.reset_index()
      .rename(columns={"Timestamp": "ds", "COUNT_BR_NMBR": "y"})
      .drop(['FY', 'Revenue', 'COUNT_GC_ORG', 'POST_PERIOD'], axis=1)
)
df_pro.tail()

In [0]:
# Fit Prophet on the BR-count series and forecast pred_periods further steps.
# interval_width=0.95 sets the width of the yhat_lower / yhat_upper band.
m = Prophet(interval_width=0.95)
m.fit(df_pro)
# freq='M' asks for month-end dates; the frame also contains the history rows.
future = m.make_future_dataframe(periods=pred_periods,freq='M')
forecast = m.predict(future)
# Show only the newly forecast rows.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(pred_periods)


To avoid possible negative values, we clip the predictions of a regular model: fit the usual model, then clamp negative predictions up to 0. Note that the model above is fitted with Prophet's default (additive) seasonality; for strictly positive series, multiplicative seasonality (`seasonality_mode='multiplicative'`) is usually the better choice and is worth trying here.

In [0]:
# Clamp negative point forecasts and interval bounds to zero, then plot.
bounded_cols = ['yhat', 'yhat_lower', 'yhat_upper']
forecast[bounded_cols] = forecast[bounded_cols].clip(lower=0.0)
fig = m.plot(forecast)

In [0]:
# Keep only the forecast rows.  The horizon comes from pred_periods, so it stays
# in step with make_future_dataframe instead of repeating the literal 12.
future_brs_nb = forecast[['ds','yhat']].tail(pred_periods)


Applying Prophet to Organization account Number and predict 12 future periods

In [0]:
# Same file as the initial load.  The path previously differed in letter case
# ("./Dataset2.csv"), which fails on case-sensitive file systems.
df = upload_file("./dataset2.csv")

# Prophet expects exactly two columns: 'ds' (date) and 'y' (value to forecast).
# Here y is the organisation count; every other column is dropped.
df_pro = (
    df.reset_index()
      .rename(columns={"Timestamp": "ds", "COUNT_GC_ORG": "y"})
      .drop(['FY', 'Revenue', 'COUNT_BR_NMBR', 'POST_PERIOD'], axis=1)
)

df_pro.tail()

In [0]:
# Fit Prophet on the organisation-count series and forecast pred_periods further steps.
# NOTE: m, future and forecast are rebound here, replacing the BR-count objects.
m = Prophet(interval_width=0.95)
m.fit(df_pro)
# freq='M' asks for month-end dates; the frame also contains the history rows.
future = m.make_future_dataframe(periods=pred_periods,freq='M')
forecast = m.predict(future)
# Show only the newly forecast rows.
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(pred_periods)

Same approach: to avoid possible negative values generated by Prophet, we clip the predictions of a regular model, i.e. fit the usual model and then clamp negative predictions up to 0. Note that the model above is fitted with Prophet's default (additive) seasonality; for strictly positive series, multiplicative seasonality (`seasonality_mode='multiplicative'`) is usually the better choice and is worth trying here.

In [0]:
# Clamp negative point forecasts and interval bounds to zero, then plot.
bounded_cols = ['yhat', 'yhat_lower', 'yhat_upper']
forecast[bounded_cols] = forecast[bounded_cols].clip(lower=0.0)
fig = m.plot(forecast)

In [0]:
# Keep only the forecast rows.  The horizon comes from pred_periods, so it stays
# in step with make_future_dataframe instead of repeating the literal 12.
future_org_nb = forecast[['ds','yhat']].tail(pred_periods)

## Merging DataFrames
Pull all predicted variables (except Revenue) together

In [0]:
# Rename Prophet's output columns back to the feature names used by the regression models.
future_org_nb = future_org_nb.rename(columns={"ds":"Timestamp" ,"yhat": "COUNT_GC_ORG" })
future_brs_nb = future_brs_nb.rename(columns={"ds":"Timestamp" ,"yhat": "COUNT_BR_NMBR" })

In [0]:

# Join the two forecasts on their shared dates: one row per future period.
brs_forecast = future_brs_nb[['Timestamp', 'COUNT_BR_NMBR']]
result = future_org_nb.merge(brs_forecast, on='Timestamp')
result.head(12)

In [0]:
def create_pred_Dframe (df):
    """Add a ``POST_PERIOD`` column to a forecast frame when it is missing.

    The previous body read ``df['POST_PERIOD']`` unconditionally, which raised
    ``KeyError`` for a frame without that column, and discarded the result of
    ``.where`` so nothing was ever stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Timestamp`` column convertible to ``DatetimeIndex``.
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.  An existing ``POST_PERIOD`` column is left
        untouched.
    """
    if 'POST_PERIOD' not in df.columns:
        months = pd.DatetimeIndex(df['Timestamp']).month.to_numpy()
        # NOTE(review): assumes POST_PERIOD is the calendar month shifted back
        # by three (April -> 1 ... March -> 12), following the notebook remark
        # that timestamps are shifted by 3 months to match Post_period.
        # TODO confirm against the POST_PERIOD values in the training data.
        df['POST_PERIOD'] = (months - 4) % 12 + 1
    # TODO: the FY one-hot columns the models were trained on are not added
    # here; their names depend on the FY values in the training data.
    return df
# Pass the merged forecast frame through create_pred_Dframe and display what it returns.
create_pred_Dframe(result)

Make prediction

In [0]:
# Predict on the last test fold with the decision tree and compare with the actual values.
# NOTE(review): this repeats the earlier decision-tree evaluation on X_test; it
# does not use the forecast frame `result` built above - confirm whether a
# prediction on the future periods was intended here.
y_pred = dt.predict(X_test)
dt_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
dt_df.head()