# Shipping Cost Prediction


### Reg No: IT21134180
### Name: Vihansa S.A.S

<hr/>

<ul>
    <li><b>Target Variable:</b> Shipping Cost </li>
    <li><b>Predictors:</b>category,sub_category,ship_mode,sales,quantity,profit,order_priority,product_id,market,state</li>
    <li><b>Objective:</b>Predict the shipping cost that is incured by a customer by considering few variables which affects its cost, such as payment priorities, etc.</li>
</ul>




In [46]:
#Importing required libraries
import seaborn as sns
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from datetime import datetime
from dateutil.parser import parse
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.datasets import make_regression
from sklearn.linear_model import ElasticNet
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform, randint
from sklearn import preprocessing

from sklearn.model_selection import RandomizedSearchCV

In [47]:
#Import the dataset 
dataset = pd.read_csv('../dataset/SuperStoreOrders.csv')

In [48]:
# Visualise the dataset 
dataset

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_name,segment,state,country,market,region,...,category,sub_category,product_name,sales,quantity,discount,profit,shipping_cost,order_priority,year
0,AG-2011-2040,1/1/2011,6/1/2011,Standard Class,Toby Braunhardt,Consumer,Constantine,Algeria,Africa,Africa,...,Office Supplies,Storage,"Tenex Lockers, Blue",408,2,0.0,106.1400,35.46,Medium,2011
1,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Office Supplies,Supplies,"Acme Trimmer, High Speed",120,3,0.1,36.0360,9.72,Medium,2011
2,HU-2011-1220,1/1/2011,5/1/2011,Second Class,Annie Thurman,Consumer,Budapest,Hungary,EMEA,EMEA,...,Office Supplies,Storage,"Tenex Box, Single Width",66,4,0.0,29.6400,8.17,High,2011
3,IT-2011-3647632,1/1/2011,5/1/2011,Second Class,Eugene Moren,Home Office,Stockholm,Sweden,EU,North,...,Office Supplies,Paper,"Enermax Note Cards, Premium",45,3,0.5,-26.0550,4.82,High,2011
4,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",114,5,0.1,37.7700,4.70,Medium,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51285,CA-2014-115427,31-12-2014,4/1/2015,Standard Class,Erica Bern,Corporate,California,United States,US,West,...,Office Supplies,Binders,"Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl",14,2,0.2,4.5188,0.89,Medium,2014
51286,MO-2014-2560,31-12-2014,5/1/2015,Standard Class,Liz Preis,Consumer,Souss-Massa-Draâ,Morocco,Africa,Africa,...,Office Supplies,Binders,"Wilson Jones Hole Reinforcements, Clear",4,1,0.0,0.4200,0.49,Medium,2014
51287,MX-2014-110527,31-12-2014,2/1/2015,Second Class,Charlotte Melton,Consumer,Managua,Nicaragua,LATAM,Central,...,Office Supplies,Labels,"Hon Color Coded Labels, 5000 Label Set",26,3,0.0,12.3600,0.35,Medium,2014
51288,MX-2014-114783,31-12-2014,6/1/2015,Standard Class,Tamara Dahlen,Consumer,Chihuahua,Mexico,LATAM,North,...,Office Supplies,Labels,"Hon Legal Exhibit Labels, Alphabetical",7,1,0.0,0.5600,0.20,Medium,2014


In [49]:
#Select columns in the dataset 
dataset = dataset[["product_id","category", "sub_category","market", "state","ship_mode","order_priority","quantity","profit","sales","discount", "shipping_cost"]]
print(dataset)

             product_id         category sub_category  market  \
0      OFF-TEN-10000025  Office Supplies      Storage  Africa   
1       OFF-SU-10000618  Office Supplies     Supplies    APAC   
2      OFF-TEN-10001585  Office Supplies      Storage    EMEA   
3       OFF-PA-10001492  Office Supplies        Paper      EU   
4       FUR-FU-10003447        Furniture  Furnishings    APAC   
...                 ...              ...          ...     ...   
51285   OFF-BI-10002103  Office Supplies      Binders      US   
51286  OFF-WIL-10001069  Office Supplies      Binders  Africa   
51287   OFF-LA-10004182  Office Supplies       Labels   LATAM   
51288   OFF-LA-10000413  Office Supplies       Labels   LATAM   
51289   OFF-FA-10003472  Office Supplies    Fasteners      US   

                  state       ship_mode order_priority  quantity    profit  \
0           Constantine  Standard Class         Medium         2  106.1400   
1       New South Wales  Standard Class         Medium         

In [50]:
#Checking for Missing values 
dataset.isnull() #It is difficult to identify missing values using this method since the dataset is large.
dataset.isnull().values.any() #Using this command we can scan the entire dataset and get a verification whether there are missing values or not 

False

In [51]:
#Getting the number of unique values in the ship_mode variables of each feature seperately.
dataset['ship_mode'].value_counts(normalize=False)

ship_mode
Standard Class    30775
Second Class      10309
First Class        7505
Same Day           2701
Name: count, dtype: int64

In [52]:
#Get the count of variables that has unique values in the dataset 
categorical_counts = dataset.select_dtypes(include=['object']).nunique()
print(categorical_counts)

product_id        10292
category              3
sub_category         17
market                7
state              1094
ship_mode             4
order_priority        4
sales              2246
dtype: int64


In [53]:
#Prevent sales value been read as string due to the , in between the numbers : Remove , in sales 
dataset.loc[:, 'sales'] = dataset['sales'].astype(str).str.replace(',', '')

# Convert 'sales' column to numerical value
dataset.loc[:, 'sales'] = pd.to_numeric(dataset['sales'])


In [54]:
# Initialise label encoder
label_encoder = LabelEncoder() 
  
# Encode labels in the dataset: Categorical variables conversion to number format 
dataset.loc[:, 'product_id'] = label_encoder.fit_transform(dataset['product_id'])
dataset.loc[:, 'category'] = label_encoder.fit_transform(dataset['category'])
dataset.loc[:, 'sub_category'] = label_encoder.fit_transform(dataset['sub_category'])
dataset.loc[:, 'market'] = label_encoder.fit_transform(dataset['market'])
dataset.loc[:, 'state'] = label_encoder.fit_transform(dataset['state'])
dataset.loc[:, 'ship_mode'] = label_encoder.fit_transform(dataset['ship_mode'])
dataset.loc[:, 'order_priority'] = label_encoder.fit_transform(dataset['order_priority'])



In [55]:
#Print the data types of the dataset
print(dataset.dtypes)

product_id         object
category           object
sub_category       object
market             object
state              object
ship_mode          object
order_priority     object
quantity            int64
profit            float64
sales              object
discount          float64
shipping_cost     float64
dtype: object


In [56]:
#Normalisation
profit_column = dataset['profit']
normalized_profit = preprocessing.normalize([profit_column])
dataset.loc[:,'profit'] = normalized_profit[0]

profit_column = dataset['shipping_cost']
normalized_profit = preprocessing.normalize([profit_column])
dataset.loc[:,'shipping_cost'] = normalized_profit[0]

profit_column = dataset['sales']
normalized_profit = preprocessing.normalize([profit_column])
dataset.loc[:,'sales'] = normalized_profit[0]

In [57]:
#Visualise dataset after normalisation
dataset

Unnamed: 0,product_id,category,sub_category,market,state,ship_mode,order_priority,quantity,profit,sales,discount,shipping_cost
0,7846,1,14,1,255,3,3,2,0.002651,0.003298,0.0,0.002482
1,7464,1,15,0,702,3,3,3,0.000900,0.00097,0.1,0.000680
2,7858,1,14,3,175,2,1,4,0.000740,0.000533,0.0,0.000572
3,6206,1,12,4,939,2,1,3,-0.000651,0.000364,0.5,0.000337
4,1492,0,9,0,702,3,3,5,0.000944,0.000921,0.1,0.000329
...,...,...,...,...,...,...,...,...,...,...,...,...
51285,3659,1,3,6,192,3,3,2,0.000113,0.000113,0.2,0.000062
51286,7877,1,3,1,923,3,3,1,0.000010,0.000032,0.0,0.000034
51287,5889,1,10,5,599,2,3,3,0.000309,0.00021,0.0,0.000025
51288,5529,1,10,5,230,3,3,1,0.000014,0.000057,0.0,0.000014


In [58]:
#Split the dataset as X and Y 
x = dataset.drop(['shipping_cost'],axis=1)
y = dataset['shipping_cost'] #value that is expected to predict 

In [59]:
#Visualise X
x

Unnamed: 0,product_id,category,sub_category,market,state,ship_mode,order_priority,quantity,profit,sales,discount
0,7846,1,14,1,255,3,3,2,0.002651,0.003298,0.0
1,7464,1,15,0,702,3,3,3,0.000900,0.00097,0.1
2,7858,1,14,3,175,2,1,4,0.000740,0.000533,0.0
3,6206,1,12,4,939,2,1,3,-0.000651,0.000364,0.5
4,1492,0,9,0,702,3,3,5,0.000944,0.000921,0.1
...,...,...,...,...,...,...,...,...,...,...,...
51285,3659,1,3,6,192,3,3,2,0.000113,0.000113,0.2
51286,7877,1,3,1,923,3,3,1,0.000010,0.000032,0.0
51287,5889,1,10,5,599,2,3,3,0.000309,0.00021,0.0
51288,5529,1,10,5,230,3,3,1,0.000014,0.000057,0.0


In [60]:
#Visualise y : Shipping cost column values that is expected to be predicted 
y

0        0.002482
1        0.000680
2        0.000572
3        0.000337
4        0.000329
           ...   
51285    0.000062
51286    0.000034
51287    0.000025
51288    0.000014
51289    0.000012
Name: shipping_cost, Length: 51290, dtype: float64

In [61]:
#Split train and test dataset 
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2,train_size=0.8,random_state=1,shuffle=True)


In [62]:
print('x train')
print(x_train)



x train
      product_id category sub_category market state ship_mode order_priority  \
48142       1035        0            4      3    50         3              3   
46551      10050        2           13      6   703         0              1   
13524       7854        1           14      2   819         3              3   
32461       4805        1            8      5   540         0              1   
23857       5971        1           10      1   499         2              1   
...          ...      ...          ...    ...   ...       ...            ...   
50057       6940        1           14      5   854         2              1   
32511       9878        2           13      6  1050         3              3   
5192        5456        1           15      3   452         3              1   
12172       5711        1           10      0   924         3              3   
33003       6566        1           12      0   820         3              3   

       quantity    profit     s

In [63]:
print('X test')
print (x_test)



X test
      product_id category sub_category market state ship_mode order_priority  \
14799       9165        2           11      0   702         2              3   
13808       8752        2            6      5   964         3              3   
1113        1584        0            9      6   332         3              3   
25534        893        0            5      0   463         3              2   
8472        6949        1           14      4   722         3              3   
...          ...      ...          ...    ...   ...       ...            ...   
12538       8714        2            6      0  1056         2              3   
34586       7331        1            2      1   826         2              0   
45279       4930        1            8      5   371         3              2   
10220       8708        2            6      0   897         3              3   
10615       3946        1            3      4   722         3              1   

       quantity    profit     sa

In [64]:
print('y train')
print(y_train)



y train
48142    0.001272
46551    0.000302
13524    0.000207
32461    0.000757
23857    0.000313
           ...   
50057    0.000041
32511    0.000900
5192     0.000092
12172    0.000060
33003    0.000066
Name: shipping_cost, Length: 41032, dtype: float64


In [65]:
print('y test')
print (y_test)

y test
14799    0.002879
13808    0.002000
1113     0.000166
25534    0.008176
8472     0.000679
           ...   
12538    0.002095
34586    0.000215
45279    0.000084
10220    0.001345
10615    0.000897
Name: shipping_cost, Length: 10258, dtype: float64


In [66]:
#Linear Regression without regularisation

reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_train)

# Print coefficients
# print("Coefficients:", reg.coef_)

# Print intercept
print("Intercept:", reg.intercept_)

# Print R-squared
print("R-squared:", r2_score(y_train, y_pred))

# Print Mean Squared Error (MSE)
print("Mean Squared Error:", mean_squared_error(y_train, y_pred))

# More detailed information about the model
print("Model details:", reg)


Intercept: 0.0021231647864835457
R-squared: 0.655971996001409
Mean Squared Error: 5.575408851647874e-06
Model details: LinearRegression()


In [67]:
clf = Ridge(alpha=10.0)

# Define the grid of hyperparameters to search
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']
print("Best alpha:", best_alpha)

# Use the best model
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(x_train)

# Print R-squared
print("R-squared:", r2_score(y_train, y_pred))

# Print Mean Squared Error (MSE)
print("Mean Squared Error:", mean_squared_error(y_train, y_pred))

# More detailed information about the best model
print("Best model details:", best_clf)

Best alpha: 0.001
R-squared: 0.655969264129847
Mean Squared Error: 5.575453125081172e-06
Best model details: Ridge(alpha=0.001)


In [68]:
regr = ElasticNet(alpha=0.01,random_state=0)

# Define the grid of hyperparameters to search
param_grid = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
              'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=regr, param_grid=param_grid, cv=5, scoring='r2')
grid_search.fit(x_train, y_train)

# Get the best hyperparameters
best_alpha = grid_search.best_params_['alpha']
best_l1_ratio = grid_search.best_params_['l1_ratio']
print("Best alpha:", best_alpha)
print("Best l1_ratio:", best_l1_ratio)

# Use the best model
best_regr = grid_search.best_estimator_
y_pred = best_regr.predict(x_train)

# Print R-squared
print("R-squared:", r2_score(y_train, y_pred))

# Print Mean Squared Error (MSE)
print("Mean Squared Error:", mean_squared_error(y_train, y_pred))

# More detailed information about the best model
print("Best model details:", best_regr)

Best alpha: 0.001
Best l1_ratio: 0.1
R-squared: 0.12200520266375547
Mean Squared Error: 1.4229015975075467e-05
Best model details: ElasticNet(alpha=0.001, l1_ratio=0.1, random_state=0)


In [69]:
#Lasso Regression
regr = linear_model.Lasso(alpha=0.1)
regr.fit(x_train, y_train)
y_pred=regr.predict(x_train)

# Define the hyperparameter distribution for Randomized Search
param_dist = {
    'alpha': uniform(0.001, 10.0),
    'fit_intercept': [True, False],
    'precompute': [True, False],
    'copy_X': [True, False],
    'max_iter': randint(100, 1000),
    'warm_start': [True, False],
    'positive': [True, False],
    'random_state': [None, 0, 42],
    'selection': ['cyclic', 'random']
}

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=regr, param_distributions=param_dist, n_iter=100, cv=3, scoring='r2', random_state=42)
random_search.fit(x_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
# print("Best hyperparameters:", best_params)

# Use the best model
best_regr = random_search.best_estimator_
y_pred = best_regr.predict(x_train)

# Print R-squared
print("R-squared:", r2_score(y_train, y_pred))

# Print Mean Squared Error (MSE)
print("Mean Squared Error:", mean_squared_error(y_train, y_pred))

# More detailed information about the best model
print("Best model details:", best_regr)

R-squared: 0.00017001679277084403
Mean Squared Error: 1.62035092310083e-05
Best model details: Lasso(alpha=0.18175363615520868, copy_X=False, max_iter=915, random_state=42,
      warm_start=True)
