In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Read the training transactions from the "Sheet2" worksheet of the workbook
# and preview the first five rows.
df = pd.read_excel(io="Sales_Transactions_X_Train_y_Train.xlsx", sheet_name="Sheet2")
df.head(n=5)

Unnamed: 0,Product_Code,Date,Value
0,P1,1/01/2020,11
1,P1,8/01/2020,12
2,P1,15/01/2020,10
3,P1,22/01/2020,8
4,P1,29/01/2020,13


In [17]:
# One-hot encode the two categorical columns. Note the date prefix is 'Date_',
# so the generated column names look like 'Date__<value>' (prefix + '_' separator).
procode = pd.get_dummies(df['Product_Code'], prefix='Product_Code')
dt = pd.get_dummies(df['Date'], prefix='Date_')

# Append the indicator columns and discard the raw categorical columns,
# leaving 'Value' plus the indicator columns in df.
df = pd.concat([df, procode, dt], axis=1).drop(columns=['Product_Code', 'Date'])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows. The seed is fixed so that Restart & Run All
# reproduces the same split, and therefore the same scores further down.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Target: the 'Value' column.
y_train = train['Value']
y_test = test['Value']

# Features: every remaining (one-hot) column.
X_train = train.drop(columns=['Value'])
X_test = test.drop(columns=['Value'])

In [19]:
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
import xgboost as xgb

# Fit the requested estimator once and also collect cross-validated predictions
def fit_ml_algo(algo, X_train, y_train, cv):
    """Fit ``algo`` on the training data and gather cross-validated predictions.

    Parameters
    ----------
    algo : estimator exposing ``fit`` and ``score``
        Fitted in place on the full training data.
    X_train, y_train :
        Training features and target; passed unchanged to ``fit``, ``score``
        and ``cross_val_predict``.
    cv :
        Forwarded as the ``cv`` argument of ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(train_pred, score)`` where ``train_pred`` is the output of
        ``model_selection.cross_val_predict`` and ``score`` is
        ``model.score(X_train, y_train) * 100`` rounded to two decimals.
        The score is computed on the same data the model was fitted on, so
        it is a training-set score, not a cross-validated one.
    """
    # Single pass: fit on everything, then score on that same data.
    fitted = algo.fit(X_train, y_train)
    train_score_pct = round(fitted.score(X_train, y_train) * 100, 2)

    # Out-of-fold predictions, computed in parallel on all available cores.
    cv_predictions = model_selection.cross_val_predict(
        algo, X_train, y_train, cv=cv, n_jobs=-1
    )

    return cv_predictions, train_score_pct



# Model comparison ------------------------------------------------------------------------
# Every estimator goes through fit_ml_algo with the same number of folds.
# Estimators with internal randomness get a fixed seed so that a re-run
# reproduces the same scores.
CV_FOLDS = 10
SEED = 42

# Random Forest Regressor ----------------------------------------------------------------

train_pred_rfor, acc_rfor = fit_ml_algo(RandomForestRegressor(random_state=SEED),
                                        X_train,
                                        y_train,
                                        CV_FOLDS)

# Linear Regression ---------------------------------------------------------------------

train_pred_lin, acc_lin = fit_ml_algo(LinearRegression(),
                                      X_train,
                                      y_train,
                                      CV_FOLDS)

# Decision Tree Regressor --------------------------------------------------------------------

train_pred_dtr, acc_dtr = fit_ml_algo(DecisionTreeRegressor(random_state=SEED),
                                      X_train,
                                      y_train,
                                      CV_FOLDS)

# Support Vector Regressor --------------------------------------------------------------------

train_pred_svr, acc_svr = fit_ml_algo(SVR(),
                                      X_train,
                                      y_train,
                                      CV_FOLDS)

# XGB Regressor -------------------------------------------------------------------

train_pred_xgb, acc_xgb = fit_ml_algo(xgb.XGBRegressor(),
                                      X_train,
                                      y_train,
                                      CV_FOLDS)

# Backward-compatible aliases. This cell used to store both the random-forest
# and the linear-regression predictions in `train_pred_log` (the second
# assignment overwrote the first) and the decision-tree predictions in
# `train_pred_knn`. The old names keep their final values in case they are
# referenced elsewhere.
train_pred_log = train_pred_lin
train_pred_knn = train_pred_dtr

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




In [20]:
# Score table -------------------------------------------------------------------------------
# Each value is the `score` returned by fit_ml_algo for that estimator, i.e.
# estimator.score() on the training data, times 100 (R^2 for scikit-learn
# regressors, which is why it can be negative). It is not a classification accuracy.

model_names = ['RandomForestRegressor', 'LinearRegression', 'DecisionTreeRegressor', 'SVR', 'XGB' ]
model_scores = [acc_rfor, acc_lin, acc_dtr, acc_svr, acc_xgb]

models = pd.DataFrame({'Model': model_names, 'Score': model_scores})

print("---Accuracy Scores---")
# Best score first.
models.sort_values(by='Score', ascending=False)

---Accuracy Scores---


Unnamed: 0,Model,Score
2,DecisionTreeRegressor,100.0
0,RandomForestRegressor,98.18
1,LinearRegression,91.05
4,XGB,27.03
3,SVR,-16.65


In [28]:
# Read the unlabelled feature file (first worksheet) and preview the first five rows.
# NOTE(review): this rebinds X_test, replacing the hold-out features produced by
# the train/test split cell; y_test from that split is not used in the visible notebook.
X_test = pd.read_excel(io="Sales_Transactions_X_Test.xlsx")
X_test.head(n=5)

Unnamed: 0,Product_Code,Date
0,P1,1/01/2020
1,P1,8/01/2020
2,P1,15/01/2020
3,P1,22/01/2020
4,P1,29/01/2020


In [29]:
# One-hot encode the submission features with the same prefixes used for the
# training frame, so the generated column names match those in X_train.
procode = pd.get_dummies(X_test.Product_Code, prefix='Product_Code')
dt = pd.get_dummies(X_test.Date, prefix='Date_')

X_test = pd.concat([X_test, procode, dt], axis=1)

X_test.drop(['Product_Code', 'Date'], axis=1, inplace=True)

# get_dummies only creates columns for values present in *this* file, so the
# column set/order is not guaranteed to match the training matrix. Align
# explicitly to the columns the model is fitted on: indicators missing here are
# added as all-zero columns, indicators unseen during training are dropped, and
# the order follows X_train. Without this, any difference becomes a silent
# positional mismatch (or a failure) at predict time.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)



In [31]:
# Convert the encoded feature frame into a plain NumPy array (a copy of the
# values; column labels are not carried over) and display it.
X_test = np.array(X_test, copy=True)
X_test

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [33]:
# Final model: a random forest fitted on the training split, then used to
# predict the submission features. The seed is fixed so the predictions
# (and the CSV written from them) are reproducible across runs.
rfr = RandomForestRegressor(random_state=42)

model = rfr.fit(X_train, y_train)

pred = model.predict(X_test)
pred



array([ 9.9, 11.4,  9.9, ...,  0. ,  0. ,  0.7])

In [34]:
# Transposed view of the predictions. The displayed shape of the result is
# (42172,), i.e. one-dimensional, so the values and their order are unchanged.
transpose_pred = pred.T
transpose_pred

array([ 9.9, 11.4,  9.9, ...,  0. ,  0. ,  0.7])

In [35]:
# Sanity check: dimensions of the prediction array (one entry per predicted row).
np.shape(transpose_pred)

(42172,)

In [36]:
# Wrap the predictions in a single-column DataFrame (default integer column
# label 0 and default row index) and display it.
predict = pd.DataFrame(transpose_pred)
predict

Unnamed: 0,0
0,9.9
1,11.4
2,9.9
3,9.6
4,9.9
5,11.1
6,12.5
7,16.9
8,7.4
9,12.7


In [37]:
# Persist the predictions next to the notebook. Default to_csv settings are
# used, so the row index is written alongside the single prediction column.
predict.to_csv(path_or_buf="check_result.csv")

## Conclusion

This would be more interesting if there were more features included, rather than just the same arrangement of products and dates. The model predicted a 2% decline. The next step is to find a way to make the date more relevant.