GOAL : 

1.   Predict the fastest and the average shipment duration of goods/products, counted from the date of the order.
2.   Build a late-delivery-risk classification model that labels each order as high risk (1) or low risk (0).



1) Importing necessary Libraries and packages

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import keras
from keras.layers import Dense
from keras.models import Sequential

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the dataset; the first row of the CSV holds the column names.
data = pd.read_csv(
    '../input/dataco-smart-supply-chain-for-big-data-analysis/DataCoSupplyChainDataset.csv',
    header=0,
    encoding='unicode_escape',
)
data.head()

2) Exploratory Data Analysis [EDA]

In [None]:
def data_info(data):
    """Print a short structural summary of a DataFrame.

    Reports the number of columns, the number of rows, the total number of
    cells, and how many columns have a non-object dtype ("numerical") versus
    an object dtype ("categorical"). Nothing is returned.
    """
    row_count, column_count = data.shape
    print('1) Number of columns are : ', column_count)
    print('2) Number of rows are : ', row_count)
    print('3) Total number of data-points :', data.size)

    # A column counts as categorical only when its dtype is object ('O');
    # every other column is counted as numerical.
    object_flags = [data[name].dtypes == 'O' for name in data.columns]
    categorical_count = sum(1 for flag in object_flags if flag)
    numerical_count = len(object_flags) - categorical_count
    print('4) Count of Numerical Features :', numerical_count)
    print('5) Count of Categorical Features :', categorical_count)
# Print the structural summary of the loaded dataset.
data_info(data)

3) Missing value identification

In [None]:
def features_with_missing_values(data):
    """Print one line for every column of ``data`` that has missing values.

    Columns without missing values are skipped. Nothing is returned.
    """
    for column in data.columns:
        missing_count = data[column].isna().sum()
        if missing_count <= 0:
            continue
        print('The Feature ', column, ' has ' + str(missing_count) + ' missing values')
            
# List the columns of the loaded dataset that contain missing values.
features_with_missing_values(data)

4) Statistical Analysis

In [None]:
# Pearson correlation between the numerical features in the dataset.
# The numeric/boolean columns are selected explicitly: from pandas 2.0 on,
# DataFrame.corr no longer drops non-numeric columns by default and raises
# when the frame still contains string columns.
data.select_dtypes(include=['number', 'bool']).corr(method='pearson')

In [None]:
# Display pandas' descriptive statistics for the dataset.
data.describe()

6) Feature Engineering

In [None]:
# Parse the raw order and shipping timestamp columns with pd.to_datetime and
# store the results in the new `order_date` and `shipping_date` columns.
data['order_date'] = pd.to_datetime(data['order date (DateOrders)'])
data['shipping_date']=pd.to_datetime(data['shipping date (DateOrders)'])

7) Feature Selection

In [None]:
# Columns kept for modelling, followed by the two parsed date columns.
new_dataset_features = [
    'Type',
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Late_delivery_risk',
    'Benefit per order',
    'Sales per customer',
    'Latitude',
    'Longitude',
    'Shipping Mode',
    'Order Status',
    'Order Region',
    'Order Country',
    'Order City',
    'Market',
    'Delivery Status',
    'order_date',
    'shipping_date',
]
len(new_dataset_features)

In [None]:
# Keep only the selected columns for the working dataset.
new_data = data.loc[:, new_dataset_features]
new_data.head()

In [None]:
# The two datetime columns are left out of the modelling table.
model_data = new_data.drop(columns=['order_date', 'shipping_date'])
model_data.head()

In [None]:
# One-hot encode the categorical variables in the data with pd.get_dummies,
# then show the shape of the encoded table.
model_data = pd.get_dummies(model_data)
model_data.shape

*  Target features : Days for shipping (real), Days for shipment (scheduled)
*  Problem type : Multi-output Regression (both target features are predicted together)




In [None]:
# Separate the two shipment-duration targets (y) from the remaining
# columns of the processed dataset, which become the predictors (x).
y = model_data[['Days for shipping (real)', 'Days for shipment (scheduled)']]
x = model_data.drop(columns=list(y.columns))
x.shape, y.shape

In [None]:
# Split the data into train and test sets, holding out 20% of the rows for
# testing (test_size=0.20).
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

Applying Multioutput Regression with Decision Tree Regressor

In [None]:
# Fit a decision tree regressor on the training split, then predict both
# target columns for the test split.
model = DecisionTreeRegressor().fit(x_train, y_train)
pred = model.predict(x_test)

Evaluating the Regression model

In [None]:
def metrics(y_test,pred):
    """Print the R-squared, mean squared error and mean absolute error.

    Args:
        y_test: True target values.
        pred: Predicted target values, in the same layout as ``y_test``.

    The three scores are printed; nothing is returned.
    """
    r_squared = r2_score(y_test, pred)
    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    print('The r-squared score of the model is ', r_squared)
    print('The mean squared error is', mse)
    # This value is the mean absolute error, not an accuracy score, so the
    # message names it accordingly.
    print('The mean absolute error is', mae)

In [None]:
# Print the regression metrics for the test-split predictions.
metrics(y_test,pred)

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to 'tree.dot' in Graphviz format.
# `class_names` is not passed: it only applies to classifiers, and a
# DecisionTreeRegressor has no `classes_` attribute, so reading it raised
# AttributeError. The column names are passed as a plain list.
# NOTE: with `out_file` set, export_graphviz returns None, so `dot_data` is None.
dot_data = export_graphviz(model, out_file='tree.dot', feature_names=list(x.columns))

In [None]:
#from sklearn.model_selection import cross_validate
#cv_results = cross_validate(model,x,y,cv=4)
#cv_results

In [None]:
# Wrap the predicted array in a DataFrame and give its first two columns
# (positions 0 and 1) readable names.
Prediction = pd.DataFrame(pred)
prediction = Prediction.rename(
    columns=dict(enumerate(['Fastest_shipment', 'Avg_shipment']))
)
prediction.head()

Late-delivery risk is closely related to the predicted fastest shipment duration (Fastest_shipment) and average shipment duration (Avg_shipment), both measured in days. If the fastest shipment duration is greater than the average shipment duration, the risk of late delivery is high, because the scheduled number of days is exceeded. Conversely, if the average shipment duration is greater than the fastest shipment duration, there is a high probability that the goods reach the customer on time or early.

In [None]:
# Statistical modelling: risk is 0 when the average shipment duration is at
# least the fastest shipment duration, and 1 otherwise.
prediction['risk'] = (
    ~(prediction['Avg_shipment'] >= prediction['Fastest_shipment'])
).astype(int)

In [None]:
# Preview the predictions together with the derived `risk` column.
prediction.head()

In [None]:
# l: risk labels derived from the predicted shipment durations.
# m: the Late_delivery_risk column of the test split.
l = prediction['risk']
m = x_test['Late_delivery_risk']
# Show both shapes side by side.
l.shape,m.shape

In [None]:
def evaluation_risk_factor(l,m):
    """Evaluate the rule-based late-delivery-risk labels against the actual ones.

    Prints the accuracy and a classification report, then draws the confusion
    matrix as an annotated heatmap on a new subplot. Nothing is returned.

    Args:
        l: Predicted risk labels (the notebook passes ``prediction['risk']``).
        m: Actual risk labels (the notebook passes
            ``x_test['Late_delivery_risk']``).
    """
    # scikit-learn metrics take (y_true, y_pred) in that order, so the actual
    # labels ``m`` are passed first. Accuracy is symmetric, but the
    # classification report and the confusion matrix are not.
    print('The accuracy of the risk predictor model is ', accuracy_score(m, l))
    print('Some of the key classification metrics are :')
    print(classification_report(m, l))

    ax = plt.subplot()
    # Rows of the matrix are the true labels and columns the predicted labels,
    # which is what the axis titles below state.
    sns.heatmap(confusion_matrix(m, l), annot=True, ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion matrix for Risk Delivery classfication')

In [None]:
# Compare the predicted late-delivery-risk values (l) with the actual ones (m).
evaluation_risk_factor(l,m)

In [None]:
# Count plot of the predicted risk labels, drawn with the "darkgrid" style.
# NOTE: `ax` receives the return value of set_title(), not the countplot axes.
sns.set(style="darkgrid")
ax=sns.countplot(x="risk",data=prediction).set_title('Predicted Late delivery risks')

In [None]:
# Count plot of the Late_delivery_risk column in the test split.
# NOTE: `bx` receives the return value of set_title(), not the countplot axes.
bx=sns.countplot(x='Late_delivery_risk',data=x_test).set_title('Actual Late_delivery_risk')

In [None]:
# Persist the trained model to disk with pickle. The context manager makes
# sure the file handle is flushed and closed, even if pickling fails.
filename = 'Shipping_duration_estimator.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)