# Ridge Regression Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
import re
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
df_train = pd.read_csv(r'C:\Users\court\Desktop\AdvRegPre\regression_JM3_DSFT\regression_JM3_DSFT\data\df-train_set.csv')
df_test = pd.read_csv(r'C:\Users\court\Desktop\AdvRegPre\regression_JM3_DSFT\regression_JM3_DSFT\data\df-test_set.csv')

In [3]:
#Getting the months and quarters for the dates for train data by converting the data to datetime format
df_gda = df_train[df_train.Commodities == 'APPLE GOLDEN DELICIOUS']
df_gda.date = pd.to_datetime(df_gda.Date) 
df_gda["Quarter"] = df_gda.date.dt.quarter 
df_gda["Month"] = df_gda.date.dt.month 
df_gda["Year"] = df_gda.date.dt.year

df_gda.head()

Unnamed: 0,Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg,Quarter,Month,Year
1,CAPE,M4183,1L,18.3,APPLE GOLDEN DELICIOUS,2020-09-09,150.0,170.0,51710.0,332,6075.6,822,8.51,3,9,2020
7,CAPE,JG110,2M,11.0,APPLE GOLDEN DELICIOUS,2020-04-14,50.0,50.0,16000.0,320,3520.0,0,4.55,2,4,2020
24,W.CAPE-BERGRIVER ETC,JE090,2S,9.0,APPLE GOLDEN DELICIOUS,2020-04-16,55.0,55.0,990.0,18,162.0,1506,6.11,2,4,2020
40,CAPE,M4183,1S,18.3,APPLE GOLDEN DELICIOUS,2020-05-04,80.0,120.0,32020.0,388,7100.4,443,4.51,2,5,2020
69,EASTERN CAPE,IA400,1S,400.0,APPLE GOLDEN DELICIOUS,2020-09-28,1800.0,1800.0,1800.0,1,400.0,2,4.5,3,9,2020


In [4]:
#Getting the months and quarters for the dates for test data
df_test.date = pd.to_datetime(df_test.Date)
df_test["Quarter"] = df_test.date.dt.quarter
df_test["Month"] = df_test.date.dt.month
df_test["Year_Month"] = df_test.date.dt.year

df_test.head()

Unnamed: 0,Index,Province,Container,Size_Grade,Weight_Kg,Commodities,Date,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Quarter,Month,Year_Month
0,1,W.CAPE-BERGRIVER ETC,EC120,1M,12.0,APPLE GOLDEN DELICIOUS,2020-07-09,128.0,136.0,5008.0,38,456.0,0,3,7,2020
1,2,W.CAPE-BERGRIVER ETC,M4183,1X,18.3,APPLE GOLDEN DELICIOUS,2020-01-20,220.0,220.0,1760.0,8,146.4,2,1,1,2020
2,3,W.CAPE-BERGRIVER ETC,EC120,1S,12.0,APPLE GOLDEN DELICIOUS,2020-08-19,120.0,120.0,720.0,6,72.0,45,3,8,2020
3,4,W.CAPE-BERGRIVER ETC,M4183,1M,18.3,APPLE GOLDEN DELICIOUS,2020-05-06,160.0,160.0,160.0,1,18.3,8,2,5,2020
4,5,W.CAPE-BERGRIVER ETC,M4183,1L,18.3,APPLE GOLDEN DELICIOUS,2020-05-04,140.0,160.0,14140.0,100,1830.0,19,2,5,2020


In [5]:
df_gda = df_gda.drop('Date', axis=1)
df_gda.head()

Unnamed: 0,Province,Container,Size_Grade,Weight_Kg,Commodities,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg,Quarter,Month,Year
1,CAPE,M4183,1L,18.3,APPLE GOLDEN DELICIOUS,150.0,170.0,51710.0,332,6075.6,822,8.51,3,9,2020
7,CAPE,JG110,2M,11.0,APPLE GOLDEN DELICIOUS,50.0,50.0,16000.0,320,3520.0,0,4.55,2,4,2020
24,W.CAPE-BERGRIVER ETC,JE090,2S,9.0,APPLE GOLDEN DELICIOUS,55.0,55.0,990.0,18,162.0,1506,6.11,2,4,2020
40,CAPE,M4183,1S,18.3,APPLE GOLDEN DELICIOUS,80.0,120.0,32020.0,388,7100.4,443,4.51,2,5,2020
69,EASTERN CAPE,IA400,1S,400.0,APPLE GOLDEN DELICIOUS,1800.0,1800.0,1800.0,1,400.0,2,4.5,3,9,2020


In [6]:
dummy_df = pd.get_dummies(df_gda, drop_first=True)
dummy_df.head()

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,avg_price_per_kg,Quarter,Month,...,Container_M9125,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
1,18.3,150.0,170.0,51710.0,332,6075.6,822,8.51,3,9,...,0,0,0,0,0,0,0,0,0,0
7,11.0,50.0,50.0,16000.0,320,3520.0,0,4.55,2,4,...,0,0,0,0,0,0,1,0,0,0
24,9.0,55.0,55.0,990.0,18,162.0,1506,6.11,2,4,...,0,0,0,0,0,0,0,1,0,0
40,18.3,80.0,120.0,32020.0,388,7100.4,443,4.51,2,5,...,0,0,1,0,0,0,0,0,0,0
69,400.0,1800.0,1800.0,1800.0,1,400.0,2,4.5,3,9,...,0,0,1,0,0,0,0,0,0,0


In [7]:
X = dummy_df.drop('avg_price_per_kg', axis=1)
y = dummy_df['avg_price_per_kg']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# Create scaler object
scaler = StandardScaler()

In [10]:
# Create scaled version of the predictors (there is no need to scale the response)
X_scaled = scaler.fit_transform(X)

In [11]:
# Convert the scaled predictor values into a dataframe
X_standardise = pd.DataFrame(X_scaled,columns=X.columns)
X_standardise.head()

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Quarter,Month,Year,...,Container_M9125,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
0,-0.222433,-0.065087,-0.105317,0.811807,0.510117,0.511073,0.57107,0.83716,1.262165,0.181119,...,-0.039233,-0.44198,-0.612085,-0.032026,-0.30986,-0.255934,-0.367265,-0.432837,-0.093731,-0.116187
1,-0.295704,-0.332855,-0.382175,-0.10395,0.471248,0.091837,-0.563874,-0.26401,-0.598952,0.181119,...,-0.039233,-0.44198,-0.612085,-0.032026,-0.30986,-0.255934,2.722828,-0.432837,-0.093731,-0.116187
2,-0.315779,-0.319467,-0.370639,-0.48887,-0.506948,-0.459029,1.515476,-0.26401,-0.598952,0.181119,...,-0.039233,-0.44198,-0.612085,-0.032026,-0.30986,-0.255934,-0.367265,2.310338,-0.093731,-0.116187
3,-0.222433,-0.252525,-0.220674,0.306871,0.691504,0.679187,0.047781,-0.26401,-0.226728,0.181119,...,-0.039233,-0.44198,1.63376,-0.032026,-0.30986,-0.255934,-0.367265,-0.432837,-0.093731,-0.116187
4,3.608756,4.353082,3.655338,-0.468098,-0.562012,-0.419986,-0.561112,0.83716,1.262165,0.181119,...,-0.039233,-0.44198,1.63376,-0.032026,-0.30986,-0.255934,-0.367265,-0.432837,-0.093731,-0.116187


In [12]:
# Import train/test splitting function from sklearn
from sklearn.model_selection import train_test_split

In [13]:
# Import the ridge regression module from sklearn
from sklearn.linear_model import Ridge

In [14]:
# Create ridge model
ridge = Ridge()

In [15]:
# Train the model
ridge.fit(X_train, y_train)

Ridge()

In [16]:
# Extract the model intercept value
b0 = float(ridge.intercept_)

In [17]:
# Extract the model coefficient value
coeff = pd.DataFrame(ridge.coef_, X.columns, columns=['Coefficient'])

In [18]:
print("Intercept:", float(b0))

Intercept: 2790.1964785262494


In [19]:
# Check out the coefficients
coeff

Unnamed: 0,Coefficient
Weight_Kg,-0.011814
Low_Price,0.000278
High_Price,0.00174
Sales_Total,7.1e-05
Total_Qty_Sold,-0.002282
Total_Kg_Sold,-0.000362
Stock_On_Hand,-0.00014
Quarter,-0.315406
Month,0.114916
Year,-1.377767


In [20]:
# Create model object
lm = LinearRegression()

In [21]:
# Train model
lm.fit(X_train, y_train)

LinearRegression()

In [22]:
# Check training accuracy

train_ridge = ridge.predict(X_train)

print('Training MSE')

print('Ridge :', metrics.mean_squared_error(y_train, train_ridge))

Training MSE
Ridge : 1.8774424733674406


In [23]:
import pickle

model_save_path = "dct_model.pkl"
with open(model_save_path,'wb') as file:
    pickle.dump(ridge,file)

In [24]:
df_test = df_test.drop('Date', axis=1)
df_test.head()

Unnamed: 0,Index,Province,Container,Size_Grade,Weight_Kg,Commodities,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Quarter,Month,Year_Month
0,1,W.CAPE-BERGRIVER ETC,EC120,1M,12.0,APPLE GOLDEN DELICIOUS,128.0,136.0,5008.0,38,456.0,0,3,7,2020
1,2,W.CAPE-BERGRIVER ETC,M4183,1X,18.3,APPLE GOLDEN DELICIOUS,220.0,220.0,1760.0,8,146.4,2,1,1,2020
2,3,W.CAPE-BERGRIVER ETC,EC120,1S,12.0,APPLE GOLDEN DELICIOUS,120.0,120.0,720.0,6,72.0,45,3,8,2020
3,4,W.CAPE-BERGRIVER ETC,M4183,1M,18.3,APPLE GOLDEN DELICIOUS,160.0,160.0,160.0,1,18.3,8,2,5,2020
4,5,W.CAPE-BERGRIVER ETC,M4183,1L,18.3,APPLE GOLDEN DELICIOUS,140.0,160.0,14140.0,100,1830.0,19,2,5,2020


In [25]:
test_ID = df_test['Index']
df_test = df_test.drop('Index', axis=1)

In [26]:
X_real = pd.get_dummies(df_test, drop_first=True)
X_real.head()

Unnamed: 0,Weight_Kg,Low_Price,High_Price,Sales_Total,Total_Qty_Sold,Total_Kg_Sold,Stock_On_Hand,Quarter,Month,Year_Month,...,Container_M9125,Size_Grade_1M,Size_Grade_1S,Size_Grade_1U,Size_Grade_1X,Size_Grade_2L,Size_Grade_2M,Size_Grade_2S,Size_Grade_2U,Size_Grade_2X
0,12.0,128.0,136.0,5008.0,38,456.0,0,3,7,2020,...,0,1,0,0,0,0,0,0,0,0
1,18.3,220.0,220.0,1760.0,8,146.4,2,1,1,2020,...,0,0,0,0,1,0,0,0,0,0
2,12.0,120.0,120.0,720.0,6,72.0,45,3,8,2020,...,0,0,1,0,0,0,0,0,0,0
3,18.3,160.0,160.0,160.0,1,18.3,8,2,5,2020,...,0,1,0,0,0,0,0,0,0,0
4,18.3,140.0,160.0,14140.0,100,1830.0,19,2,5,2020,...,0,0,0,0,0,0,0,0,0,0


In [27]:
prediction = ridge.predict(X_real)

In [28]:
# create submission dataframe for price of apples
submission = pd.DataFrame(
    {'Index': test_ID,
     'avg_price_per_kg': prediction
    })

In [29]:
submission.head()

Unnamed: 0,Index,avg_price_per_kg
0,1,8.703995
1,2,8.823244
2,3,8.361802
3,4,7.976387
4,5,8.474749


In [30]:
submission.describe()

Unnamed: 0,Index,avg_price_per_kg
count,685.0,685.0
mean,343.0,6.910237
std,197.886752,1.608847
min,1.0,-1.54776
25%,172.0,5.87029
50%,343.0,6.978449
75%,514.0,8.179626
max,685.0,10.869032


In [31]:
submission.to_csv("ridge.csv", index=False)