# Scikit-Learn Linear Regression
Using SALES_VIEW from SAP Datasphere. This view has 6,291,450 records

## Install fedml aws library

In [1]:
pip install fedml-aws --force-reinstall

Processing ./fedml_aws-2.0.0-py3-none-any.whl
Collecting pyyaml
  Using cached PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (603 kB)
Collecting hdbcli
  Using cached hdbcli-2.12.13-cp34-abi3-manylinux1_x86_64.whl (11.7 MB)
Installing collected packages: pyyaml, hdbcli, fedml-aws
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 6.0
    Uninstalling PyYAML-6.0:
      Successfully uninstalled PyYAML-6.0
  Attempting uninstall: hdbcli
    Found existing installation: hdbcli 2.12.13
    Uninstalling hdbcli-2.12.13:
      Successfully uninstalled hdbcli-2.12.13
  Attempting uninstall: fedml-aws
    Found existing installation: fedml-aws 2.0.0
    Uninstalling fedml-aws-2.0.0:
      Successfully uninstalled fedml-aws-2.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
docker-co

## Import Libraries

In [2]:
from fedml_aws import DwcSagemaker
from fedml_aws import DbConnection
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # plotting

## Create DwcSagemaker instance to access libraries functions

In [3]:
dwcs = DwcSagemaker(prefix='<prefix>', bucket_name='<bucket_name>')

2022-03-21 16:59:21,778: fedml_aws.dwcsagemaker INFO: Bucket created in us-east-1


## Create DbConnection instance to get data from SAP Datasphere

Before running the following cell, you should have a config.json file in the same directory as this notebook with the specified values to allow you to access to SAP Datasphere.

You should also have the follow view `SALES_VIEW` created in your SAP Datasphere. To gather this data, please refer to https://eforexcel.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/

Please note the 2M records data was downloaded and duplicated 3 times to represent a large dataset in SAP Datasphere.

In [4]:
import json
with open('config.json', 'r') as f:
    config = json.load(f)

In [5]:
%%time
db = DbConnection()
train_data = db.execute_query('SELECT * FROM ' + config['schema'] +'.SALES_VIEW')
data = pd.DataFrame(train_data[0], columns=train_data[1])
data

CPU times: user 26.9 s, sys: 4.76 s, total: 31.7 s
Wall time: 35.1 s


Unnamed: 0,Region,Country,Order_ID,Item_Type,Sales_Channel,Order_Priority,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,Total_Profit
0,Sub-Saharan Africa,Guinea-Bissau,197647750,Beverages,Offline,C,7216,47.45,31.79,342399.20,229396.64,113002.56
1,Sub-Saharan Africa,Sudan,321990668,Beverages,Offline,C,3049,47.45,31.79,144675.05,96927.71,47747.34
2,Sub-Saharan Africa,Sudan,982767236,Beverages,Offline,C,1519,47.45,31.79,72076.55,48289.01,23787.54
3,Sub-Saharan Africa,Guinea-Bissau,897898280,Beverages,Offline,C,6909,47.45,31.79,327832.05,219637.11,108194.94
4,Sub-Saharan Africa,Sudan,458928811,Beverages,Offline,C,6088,47.45,31.79,288875.60,193537.52,95338.08
...,...,...,...,...,...,...,...,...,...,...,...,...
6291445,Middle East and North Africa,Saudi Arabia,157667815,Clothes,Offline,L,6265,109.28,35.84,684639.20,224537.60,460101.60
6291446,Australia and Oceania,Tuvalu,403025567,Clothes,Online,L,3076,109.28,35.84,336145.28,110243.84,225901.44
6291447,Asia,Brunei,299461829,Cereal,Offline,M,6265,205.70,117.11,1288710.50,733694.15,555016.35
6291448,Asia,Philippines,354972398,Vegetables,Offline,H,3076,154.06,90.93,473888.56,279700.68,194187.88


## Make sure there are no na or null columns

In [6]:
data.isna().any()

Region            False
Country           False
Order_ID          False
Item_Type         False
Sales_Channel     False
Order_Priority    False
Units_Sold        False
Unit_Price        False
Unit_Cost         False
Total_Revenue     False
Total_Cost        False
Total_Profit      False
dtype: bool

In [7]:
data.isnull().any()

Region            False
Country           False
Order_ID          False
Item_Type         False
Sales_Channel     False
Order_Priority    False
Units_Sold        False
Unit_Price        False
Unit_Cost         False
Total_Revenue     False
Total_Cost        False
Total_Profit      False
dtype: bool

In [8]:
data.columns

Index(['Region', 'Country', 'Order_ID', 'Item_Type', 'Sales_Channel',
       'Order_Priority', 'Units_Sold', 'Unit_Price', 'Unit_Cost',
       'Total_Revenue', 'Total_Cost', 'Total_Profit'],
      dtype='object')

## Correlation

In [9]:
# Create correlation matrix
corr_matrix = data.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [10]:
corr_matrix

Unnamed: 0,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,Total_Profit
Units_Sold,1.0,0.000807,0.000659,0.523055,0.471242,0.59829
Unit_Price,0.000807,1.0,0.986049,0.738524,0.753562,0.577423
Unit_Cost,0.000659,0.986049,1.0,0.728145,0.764154,0.505104
Total_Revenue,0.523055,0.738524,0.728145,1.0,0.987724,0.880793
Total_Cost,0.471242,0.753562,0.764154,0.987724,1.0,0.796014
Total_Profit,0.59829,0.577423,0.505104,0.880793,0.796014,1.0


In [11]:
print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [12]:
df = data.iloc[:,6:]
df

Unnamed: 0,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,Total_Profit
0,7216,47.45,31.79,342399.20,229396.64,113002.56
1,3049,47.45,31.79,144675.05,96927.71,47747.34
2,1519,47.45,31.79,72076.55,48289.01,23787.54
3,6909,47.45,31.79,327832.05,219637.11,108194.94
4,6088,47.45,31.79,288875.60,193537.52,95338.08
...,...,...,...,...,...,...
6291445,6265,109.28,35.84,684639.20,224537.60,460101.60
6291446,3076,109.28,35.84,336145.28,110243.84,225901.44
6291447,6265,205.70,117.11,1288710.50,733694.15,555016.35
6291448,3076,154.06,90.93,473888.56,279700.68,194187.88


In [13]:
for i in df.columns:
    print(df[i])

0          7216
1          3049
2          1519
3          6909
4          6088
           ... 
6291445    6265
6291446    3076
6291447    6265
6291448    3076
6291449    3076
Name: Units_Sold, Length: 6291450, dtype: int64
0           47.45
1           47.45
2           47.45
3           47.45
4           47.45
            ...  
6291445    109.28
6291446    109.28
6291447    205.70
6291448    154.06
6291449    255.28
Name: Unit_Price, Length: 6291450, dtype: float64
0           31.79
1           31.79
2           31.79
3           31.79
4           31.79
            ...  
6291445     35.84
6291446     35.84
6291447    117.11
6291448     90.93
6291449    159.42
Name: Unit_Cost, Length: 6291450, dtype: float64
0           342399.20
1           144675.05
2            72076.55
3           327832.05
4           288875.60
              ...    
6291445     684639.20
6291446     336145.28
6291447    1288710.50
6291448     473888.56
6291449     785241.28
Name: Total_Revenue, Length: 6291450, d

## Train SciKit Model

`train_data` is the data you want to train your model with.

In order to deploy a model to AWS using the Scikit-learn Sagemaker SDK, you must have a script that tells Sagemaker how to train and deploy the model. The path to the script is passed to the `train_sklearn_model` function in the `train_script` parameter.

`instance_type` specifies how much computing power we want AWS to allocate for our services.

In [14]:
clf = dwcs.train_sklearn_model(df,
                               train_script='sales_train.py',
                               instance_type='ml.c4.xlarge',
                              wait=True)

2022-03-21 17:00:46,955: fedml_aws.dwcsagemaker INFO: Training data uploaded
2022-03-21 17:00:47 Starting - Starting the training job...ProfilerReport-1647882047: InProgress
...
2022-03-21 17:01:17 Starting - Preparing the instances for training.........
2022-03-21 17:03:05 Downloading - Downloading input data...
2022-03-21 17:03:45 Training - Downloading the training image...
2022-03-21 17:04:10 Training - Training image download completed. Training in progress..[34m2022-03-21 17:04:13,248 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-03-21 17:04:13,250 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-03-21 17:04:13,261 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-03-21 17:04:13,739 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-03-21 17:04:13,761 sagemaker-training-toolkit INFO     No GPU

## Using the fedml_aws deploy function

In [15]:
predictor = dwcs.deploy(clf, initial_instance_count=1, instance_type="ml.c4.xlarge", endpoint_name='linear-v2')

---------!

In [16]:
predictor

'linear-v2'

## Using the fedml_aws predict function

In [42]:
test_data = data.tail(60)

In [43]:
df = test_data.iloc[:,6:-1]
df

Unnamed: 0,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost
6291390,3076,81.73,56.67,251401.48,174316.92
6291391,406,81.73,56.67,33182.38,23008.02
6291392,3076,109.28,35.84,336145.28,110243.84
6291393,406,437.2,263.33,177503.2,106911.98
6291394,6265,651.21,524.96,4079830.65,3288874.4
6291395,3076,437.2,263.33,1344827.2,810003.08
6291396,6265,668.27,502.54,4186711.55,3148413.1
6291397,6265,205.7,117.11,1288710.5,733694.15
6291398,6265,9.33,6.92,58452.45,43353.8
6291399,406,81.73,56.67,33182.38,23008.02


In [44]:
result = dwcs.predict(endpoint_name=predictor, 
             body=df.to_csv(header=False, index=False).encode('utf-8'), 
             content_type='text/csv')

In [45]:
type(result)

str

In [46]:
res = result.strip('][').split(', ')
res

['77084.56000000218',
 '10174.360000002947',
 '225901.44000000175',
 '70591.21999999917',
 '790956.2499999946',
 '534824.119999998',
 '1038298.4499999944',
 '555016.3499999999',
 '15098.650000002024',
 '10174.360000002947',
 '70591.21999999917',
 '6357.960000003297',
 '169610.6400000014',
 '98109.90000000157',
 '534824.119999998',
 '555016.3499999999',
 '358357.99999999785',
 '22386.8400000022',
 '25630.780000002145',
 '157000.90000000125',
 '98109.90000000157',
 '67286.37999999715',
 '6357.960000003297',
 '23223.199999999808',
 '10174.360000002947',
 '345452.1000000005',
 '194187.88000000134',
 '460101.60000000073',
 '25630.780000002145',
 '272502.8400000008',
 '555016.3499999999',
 '509785.4799999962',
 '35967.54000000156',
 '77084.56000000218',
 '98109.90000000157',
 '157000.90000000125',
 '272502.8400000008',
 '29816.64000000249',
 '67286.37999999715',
 '460101.60000000073',
 '534824.119999998',
 '534824.119999998',
 '35967.54000000156',
 '395509.45000000036',
 '70591.21999999917',

## Creating table in SAP Datasphere to write results back

In [47]:
dwc_data = df

In [48]:
dwc_data = dwc_data.assign(totalprofit = res)
dwc_data

Unnamed: 0,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,totalprofit
6291390,3076,81.73,56.67,251401.48,174316.92,77084.56000000218
6291391,406,81.73,56.67,33182.38,23008.02,10174.360000002947
6291392,3076,109.28,35.84,336145.28,110243.84,225901.44000000172
6291393,406,437.2,263.33,177503.2,106911.98,70591.21999999917
6291394,6265,651.21,524.96,4079830.65,3288874.4,790956.2499999946
6291395,3076,437.2,263.33,1344827.2,810003.08,534824.119999998
6291396,6265,668.27,502.54,4186711.55,3148413.1,1038298.4499999944
6291397,6265,205.7,117.11,1288710.5,733694.15,555016.3499999999
6291398,6265,9.33,6.92,58452.45,43353.8,15098.650000002024
6291399,406,81.73,56.67,33182.38,23008.02,10174.360000002947


In [49]:
dwc_data.insert(0,'ID',dwc_data.index, False)

In [50]:
dwc_data.columns

Index(['ID', 'Units_Sold', 'Unit_Price', 'Unit_Cost', 'Total_Revenue',
       'Total_Cost', 'totalprofit'],
      dtype='object')

In [51]:
dwc_data

Unnamed: 0,ID,Units_Sold,Unit_Price,Unit_Cost,Total_Revenue,Total_Cost,totalprofit
6291390,6291390,3076,81.73,56.67,251401.48,174316.92,77084.56000000218
6291391,6291391,406,81.73,56.67,33182.38,23008.02,10174.360000002947
6291392,6291392,3076,109.28,35.84,336145.28,110243.84,225901.44000000172
6291393,6291393,406,437.2,263.33,177503.2,106911.98,70591.21999999917
6291394,6291394,6265,651.21,524.96,4079830.65,3288874.4,790956.2499999946
6291395,6291395,3076,437.2,263.33,1344827.2,810003.08,534824.119999998
6291396,6291396,6265,668.27,502.54,4186711.55,3148413.1,1038298.4499999944
6291397,6291397,6265,205.7,117.11,1288710.5,733694.15,555016.3499999999
6291398,6291398,6265,9.33,6.92,58452.45,43353.8,15098.650000002024
6291399,6291399,406,81.73,56.67,33182.38,23008.02,10174.360000002947


In [52]:
dwc_data.dtypes

ID                 int64
Units_Sold         int64
Unit_Price       float64
Unit_Cost        float64
Total_Revenue    float64
Total_Cost       float64
totalprofit       object
dtype: object

In [53]:
dwc_data = dwc_data.astype({'totalprofit': 'float64'})
dwc_data.dtypes

ID                 int64
Units_Sold         int64
Unit_Price       float64
Unit_Cost        float64
Total_Revenue    float64
Total_Cost       float64
totalprofit      float64
dtype: object

In [55]:
# ['ID', 'Units_Sold', 'Unit_Price', 'Unit_Cost', 'Total_Revenue','Total_Cost', 'totalprofit']

db.create_table("CREATE TABLE Linear_Reg_Model (ID INTEGER PRIMARY KEY, Units_Sold INTEGER, Unit_Price FLOAT(2), Unit_Cost FLOAT(2), Total_Revenue FLOAT(2), Total_Cost FLOAT(2), totalprofit FLOAT(2))")

creating table...
CREATE TABLE Linear_Reg_Model (ID INTEGER PRIMARY KEY, Units_Sold INTEGER, Unit_Price FLOAT(2), Unit_Cost FLOAT(2), Total_Revenue FLOAT(2), Total_Cost FLOAT(2), totalprofit FLOAT(2), INSERTED_AT TIMESTAMP NOT NULL)


In [56]:
db.insert_into_table('Linear_Reg_Model', dwc_data)

inserting into table...
INSERT INTO Linear_Reg_Model (ID, Units_Sold, Unit_Price, Unit_Cost, Total_Revenue, Total_Cost, totalprofit, "INSERTED_AT") VALUES (6291390.0, 3076.0, 81.73, 56.67, 251401.48, 174316.92, 77084.56000000218, '2022-03-21 17:30:19')
INSERT INTO Linear_Reg_Model (ID, Units_Sold, Unit_Price, Unit_Cost, Total_Revenue, Total_Cost, totalprofit, "INSERTED_AT") VALUES (6291391.0, 406.0, 81.73, 56.67, 33182.38, 23008.02, 10174.360000002947, '2022-03-21 17:30:19')
INSERT INTO Linear_Reg_Model (ID, Units_Sold, Unit_Price, Unit_Cost, Total_Revenue, Total_Cost, totalprofit, "INSERTED_AT") VALUES (6291392.0, 3076.0, 109.28, 35.84, 336145.28, 110243.84, 225901.44000000175, '2022-03-21 17:30:19')
INSERT INTO Linear_Reg_Model (ID, Units_Sold, Unit_Price, Unit_Cost, Total_Revenue, Total_Cost, totalprofit, "INSERTED_AT") VALUES (6291393.0, 406.0, 437.2, 263.33, 177503.2, 106911.98, 70591.21999999917, '2022-03-21 17:30:19')
INSERT INTO Linear_Reg_Model (ID, Units_Sold, Unit_Price, Uni