# Documentation of model results

I will begin by loading necessary packages and the necessary dataframes

In [1]:
# Import necessary packages

import pandas as pd
import numpy as np

In [2]:
# Load aggregated data with dummy variables

df = pd.read_csv(r'C:\Users\bronc\Downloads\Capstone 3\Aggregated_Dummy_Data')
df.head()

Unnamed: 0,Year_ID,QTR_ID,Product_Code,Price_Each,Sales,MSRP,Product_Line,Sales_PQ,1,2,...,2003,2004,2005,Classic Cars,Motorcycles,Planes,Ships,Trains,Trucks and Buses,Vintage Cars
0,2003,2,S10_1678,81.35,2765.9,95,Motorcycles,2871.0,0,1,...,1,0,0,0,1,0,0,0,0,0
1,2003,2,S10_1949,192.87,7329.06,214,Classic Cars,12613.73,0,1,...,1,0,0,1,0,0,0,0,0,0
2,2003,2,S10_2016,96.34,2793.86,118,Motorcycles,3896.49,0,1,...,1,0,0,0,1,0,0,0,0,0
3,2003,2,S10_4698,201.41,9264.86,193,Motorcycles,6065.55,0,1,...,1,0,0,0,1,0,0,0,0,0
4,2003,2,S10_4757,121.04,9403.04,136,Classic Cars,7208.0,0,1,...,1,0,0,1,0,0,0,0,0,0


In [3]:
# Load aggregated data

df_agg = pd.read_csv(r'C:\Users\bronc\Downloads\Capstone 3\Aggregated_Data.csv')
df_agg.head()

Unnamed: 0,Year_ID,QTR_ID,Product_Code,Price_Each,Sales,MSRP,Product_Line,Sales_PQ
0,2003,2,S10_1678,81.35,2765.9,95,Motorcycles,2871.0
1,2003,2,S10_1949,192.87,7329.06,214,Classic Cars,12613.73
2,2003,2,S10_2016,96.34,2793.86,118,Motorcycles,3896.49
3,2003,2,S10_4698,201.41,9264.86,193,Motorcycles,6065.55
4,2003,2,S10_4757,121.04,9403.04,136,Classic Cars,7208.0


In [4]:
# Load test data

df_final = pd.read_csv(r'C:\Users\bronc\Downloads\Capstone 3\Final_Test_Data')
df_final.head()

Unnamed: 0,Price_Each,Sales_PQ,MSRP,1,2,3,4,2003,2004,2005,Classic Cars,Motorcycles,Planes,Ships,Trains,Trucks and Buses,Vintage Cars,Sales
0,55.635,3940.23,95,0,1,0,0,0,0,1,0,1,0,0,0,0,0,2986.176933
1,146.703333,15186.28,214,0,1,0,0,0,0,1,1,0,0,0,0,0,0,15722.280593
2,90.05,7513.51,118,0,1,0,0,0,0,1,0,1,0,0,0,0,0,7313.26429
3,142.536667,9320.65,193,0,1,0,0,0,0,1,0,1,0,0,0,0,0,10640.32283
4,117.21,12263.51,136,0,1,0,0,0,0,1,1,0,0,0,0,0,0,10531.051761


### Scaling

For this part I'll be using sklearn's StandardScaler

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

Next I'll create the subset of our data that the model will be using and assign it to X

In [6]:
# Set up X
X = df.drop('Sales', axis=1)

In [7]:
# Verify X
X.head()

Unnamed: 0,Year_ID,QTR_ID,Product_Code,Price_Each,MSRP,Product_Line,Sales_PQ,1,2,3,...,2003,2004,2005,Classic Cars,Motorcycles,Planes,Ships,Trains,Trucks and Buses,Vintage Cars
0,2003,2,S10_1678,81.35,95,Motorcycles,2871.0,0,1,0,...,1,0,0,0,1,0,0,0,0,0
1,2003,2,S10_1949,192.87,214,Classic Cars,12613.73,0,1,0,...,1,0,0,1,0,0,0,0,0,0
2,2003,2,S10_2016,96.34,118,Motorcycles,3896.49,0,1,0,...,1,0,0,0,1,0,0,0,0,0
3,2003,2,S10_4698,201.41,193,Motorcycles,6065.55,0,1,0,...,1,0,0,0,1,0,0,0,0,0
4,2003,2,S10_4757,121.04,136,Classic Cars,7208.0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


Next we must drop the categorical variables so that we can scale

In [8]:
X.drop(columns = 'Year_ID', inplace = True)
X.drop(columns = 'QTR_ID', inplace = True)
X.drop(columns = 'Product_Line', inplace = True)
X.drop(columns = 'Product_Code', inplace = True)

Now that this is all set up its time to scale

In [9]:
scaler.fit(X)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [10]:
X_scaled = scaler.transform(X)

### Train Test Split

Next, its time to split the data for our model. First I need to define y as the Sales column

In [11]:
y = df[['Sales']]

Now to perform the actual split. I'll be using a 70/30 split of training to testing size

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 123)

Let's check that that worked by looking at the sizes of the training and test splits

In [13]:
X_train.shape

(606, 17)

In [14]:
X_test.shape

(261, 17)

In [15]:
261/(261+606)

0.30103806228373703

This rounds to 30% for our test split so conversely the train split is right around 70%

### Model

Next I'll implement the model that I discovered to be the most accurate at predicting product revenue

In [16]:
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [17]:
# Tune parameters and fit model to training data
Model2 = RandomForestRegressor(min_samples_leaf = 7, min_samples_split = 10, random_state = 123)
Model2.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=7,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=123, verbose=0, warm_start=False)

In [18]:
# Predict data on test set
y_pred = Model2.predict(X_test)

In [19]:
# Check r-squared
r2_score(y_test, y_pred)

0.824209546137172

### Model Implementation

Now that the model has been built its time to use it to predict future data

In [20]:
# Verify dataset
df_final.head()

Unnamed: 0,Price_Each,Sales_PQ,MSRP,1,2,3,4,2003,2004,2005,Classic Cars,Motorcycles,Planes,Ships,Trains,Trucks and Buses,Vintage Cars,Sales
0,55.635,3940.23,95,0,1,0,0,0,0,1,0,1,0,0,0,0,0,2986.176933
1,146.703333,15186.28,214,0,1,0,0,0,0,1,1,0,0,0,0,0,0,15722.280593
2,90.05,7513.51,118,0,1,0,0,0,0,1,0,1,0,0,0,0,0,7313.26429
3,142.536667,9320.65,193,0,1,0,0,0,0,1,0,1,0,0,0,0,0,10640.32283
4,117.21,12263.51,136,0,1,0,0,0,0,1,1,0,0,0,0,0,0,10531.051761


In [21]:
# Drop sales data as this will be the predicted column by the data
df_final.drop(columns = 'Sales', inplace = True)

In [22]:
# Scale data
scaler.fit(df_final)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
# Fit Scaler
df_final_scaled = scaler.transform(df_final)

In [24]:
# Predict data with Random Forest model
y_pred = Model2.predict(df_final_scaled)

In [25]:
# Ensure predictions make sense
y_pred

array([ 2986.17693331, 15722.28059309,  7313.26429038, 10599.27187141,
       10515.15726756,  7563.58767035,  6322.81663999,  7717.21753755,
       11962.27405916, 11827.77902696, 11098.62336745,  7420.51915837,
       15514.70816772,  4518.22470307,  2894.58882902,  8961.58641765,
        9750.93579309, 10042.69195306, 13653.86292716,  6779.70108323,
        2870.46456456,  7330.2469734 , 10506.18263377,  8640.16113807,
        7713.22024901,  7391.96956734,  2219.43207321, 12090.72146542,
        9297.17817199,  2662.33112376,  4928.3657547 ,  7122.92432097,
        3149.45488552, 12194.41816005,  6407.78820161,  2568.27092986,
        5601.70606185,  7393.0817318 , 10874.37609895, 15338.67709664,
        7635.87408606,  8424.18971533, 11932.38243309, 14640.68144182,
        4874.34087148, 10983.32287724,  6170.21512768, 15547.71388327,
        8605.49011481,  6171.54193135,  7110.01955097, 11412.96753709,
        7361.99895657, 11544.29710197,  7241.8938291 ,  5052.7448202 ,
      

In [26]:
# Add predicted data to final dataset
df_final['Sales'] = y_pred

In [27]:
# Verify few if any duplicate predicted values
df_final['Sales'].value_counts()

9681.902862     1
9943.152048     1
6779.701083     1
6171.541931     1
8961.586418     1
               ..
14640.681442    1
8605.490115     1
2568.270930     1
7110.019551     1
7420.519158     1
Name: Sales, Length: 109, dtype: int64

In [28]:
# Verify new column has been added
df_final.head()

Unnamed: 0,Price_Each,Sales_PQ,MSRP,1,2,3,4,2003,2004,2005,Classic Cars,Motorcycles,Planes,Ships,Trains,Trucks and Buses,Vintage Cars,Sales
0,55.635,3940.23,95,0,1,0,0,0,0,1,0,1,0,0,0,0,0,2986.176933
1,146.703333,15186.28,214,0,1,0,0,0,0,1,1,0,0,0,0,0,0,15722.280593
2,90.05,7513.51,118,0,1,0,0,0,0,1,0,1,0,0,0,0,0,7313.26429
3,142.536667,9320.65,193,0,1,0,0,0,0,1,0,1,0,0,0,0,0,10599.271871
4,117.21,12263.51,136,0,1,0,0,0,0,1,1,0,0,0,0,0,0,10515.157268


In [29]:
# Identify highest Sales figure and index
df_final.loc[df_final['Sales'].idxmax()].sort_values(ascending = False)

Sales               15722.280593
Sales_PQ            15186.280000
MSRP                  214.000000
Price_Each            146.703333
2                       1.000000
Classic Cars            1.000000
2005                    1.000000
3                       0.000000
4                       0.000000
2003                    0.000000
2004                    0.000000
1                       0.000000
Motorcycles             0.000000
Planes                  0.000000
Ships                   0.000000
Trains                  0.000000
Trucks and Buses        0.000000
Vintage Cars            0.000000
Name: 1, dtype: float64

In [30]:
# Use original aggregated dataframe to find product code
df_agg.iloc[759, :]

Year_ID                 2005
QTR_ID                     1
Product_Code        S10_1949
Price_Each        146.703333
Sales           15186.280000
MSRP                     214
Product_Line    Classic Cars
Sales_PQ        40436.340000
Name: 759, dtype: object

Here is the answer to our initial problem which was to find the product code that will generate the most revenue next quarter. The model provided one code, S10_1949, as the one with the maximum revenue. Therefore, I would recommend focusing the most marketing and production efforts on this product to maximize quarter revenue for 2005 Quarter 2

### Further Model Use

This model can easily be used to predict future data by following the process shown above with new data. For best results I would recommend loading as much historical data as is available to improve performance before running. This model performs well at predicting the top product by revenue as observed by the r-squared of 0.8242