In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy.stats as spstats


In [2]:
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [3]:
df = pd.read_csv("C://DataScience//Regis24/unique_orders.csv")
df.drop(columns = ['ORDER_ID', 'CUSTOMER_ID', 'delay_duration','NR_ITEMS_UNIQUE', 'NR_ITEMS',
                   'total_orders', 'paytime', 'rank','payment_duration' ], inplace=True)


In [4]:
# Monday is 0 and Sunday is 6

df['ORDER_DATE'] = pd.to_datetime(df['ORDER_DATE'])
df['ORDER_DATE_year'] = df['ORDER_DATE'].dt.year#2
df['ORDER_DATE_month'] = df['ORDER_DATE'].dt.month#3
df['ORDER_DATE_day'] = df['ORDER_DATE'].dt.day#2
df['ORDER_DATE_day_of_week'] = df['ORDER_DATE'].dt.dayofweek
df['ORDER_DATE_is_month_start'] = df['ORDER_DATE'].dt.is_month_start
df['ORDER_DATE_is_month_end'] = df['ORDER_DATE'].dt.is_month_end
df['ORDER_DATE_is_weekend'] = np.where(df['ORDER_DATE_day_of_week'].isin([5,6]), 1, 0)
df['ORDER_DATE_is_month_end'] = df['ORDER_DATE_is_month_end'].astype(str)
df['ORDER_DATE_is_month_start'] = df['ORDER_DATE_is_month_start'].astype(str)


In [5]:
df.drop(columns = ['PAYMENT_DATE','ORDER_DATE'], inplace=True)

In [6]:
# drop these rows having cart value above 1000
# from dataFrame
df = df.drop(df[df['CART_VALUE'] > 800].index)
df.shape


(1681198, 14)

In [7]:
df['CART_VALUE']= round(df['CART_VALUE'])
len(df['CART_VALUE'].unique())

801

In [8]:
df.shape

(1681198, 14)

### SPLIT DATA INTO TRAIN AND TEST

In [9]:
#Splitting the dataset to Test and Train
from sklearn.model_selection import train_test_split

X = df.drop(['delay_category'], axis= 1)
y = df['delay_category']
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.2)
X_train = X_train.copy()            

In [10]:
### Normalize the pay_duration, age and Cart_value

In [11]:
y_train = pd.DataFrame(y_train)

In [12]:
from sklearn import preprocessing
min_max = preprocessing.MinMaxScaler()

In [13]:
# copy the data
X_train_min_max_scaled = X_train.copy()
  
# apply normalization techniques by Column 
column = 'AGE'
X_train_min_max_scaled[column] = (X_train_min_max_scaled[column] - X_train_min_max_scaled[column].min()) / (X_train_min_max_scaled[column].max() - X_train_min_max_scaled[column].min())    

column = 'CART_VALUE'
X_train_min_max_scaled[column] = (X_train_min_max_scaled[column] - X_train_min_max_scaled[column].min()) / (X_train_min_max_scaled[column].max() - X_train_min_max_scaled[column].min())    


In [14]:
# Test data same steps:
X_train = X_train_min_max_scaled
X_train.shape

(1344958, 13)

In [15]:
from sklearn.preprocessing import LabelEncoder
# Instantiate LabelEncoder
le = LabelEncoder() 

In [16]:
cols = ['ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week' ]

# Encode labels of multiple columns at once
X_train[cols] = X_train[cols].apply(le.fit_transform)


In [17]:
dummies_X_Train = pd.get_dummies(X_train[['ORDER_DATE_is_month_start',
        'ORDER_DATE_is_month_end', 'ORDER_DATE_is_weekend','SHOP', 
        'PAYMENT_INFO', 'GENDER','Default_Flag']])

In [18]:
dummies_X_Train.shape

(1344958, 27)

In [19]:
dummies_X_Train.join(X_train[['AGE','CART_VALUE', 'ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week' ]])

Unnamed: 0,ORDER_DATE_is_weekend,Default_Flag,ORDER_DATE_is_month_start_False,ORDER_DATE_is_month_start_True,ORDER_DATE_is_month_end_False,ORDER_DATE_is_month_end_True,SHOP_Baby & Child,SHOP_Car Accessories,SHOP_DIY Superstore & Tools,SHOP_Electronics,SHOP_Fashion,SHOP_Food,SHOP_Furniture,SHOP_Jewellery,SHOP_Media,SHOP_Shoes,SHOP_Sports,SHOP_Toys,SHOP_Young Fashion,PAYMENT_INFO_CONTRACT_TERMINATION,PAYMENT_INFO_INKASSO_CLOSED,PAYMENT_INFO_INKASSO_PENDING,PAYMENT_INFO_PAID_FULL,PAYMENT_INFO_PAID_OVER,GENDER_f,GENDER_m,GENDER_u,AGE,CART_VALUE,ORDER_DATE_year,ORDER_DATE_month,ORDER_DATE_day,ORDER_DATE_day_of_week
1554985,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0.230769,0.06250,4,11,13,0
1191845,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0.256410,0.22125,4,6,16,4
804690,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.316239,0.35500,3,7,24,6
330517,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.538462,0.04125,2,4,28,1
437840,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.307692,0.08250,2,8,21,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950319,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0.264957,0.04500,3,11,17,2
441823,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.495726,0.15000,2,8,25,2
570552,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.435897,0.20750,3,0,29,2
1265554,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.658120,0.25750,4,6,6,1


In [20]:
X_train_final = dummies_X_Train.join(X_train[['AGE','CART_VALUE',
        'ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week' ]])

In [21]:
X_train_final.shape

(1344958, 33)

In [22]:
X_train_columnlist = X_train.columns
X_train.columns

Index(['PAYMENT_INFO', 'SHOP', 'AGE', 'CART_VALUE', 'GENDER', 'Default_Flag',
       'ORDER_DATE_year', 'ORDER_DATE_month', 'ORDER_DATE_day',
       'ORDER_DATE_day_of_week', 'ORDER_DATE_is_month_start',
       'ORDER_DATE_is_month_end', 'ORDER_DATE_is_weekend'],
      dtype='object')

In [23]:
# Encode labels of multiple columns at once

y_train = y_train.apply(le.fit_transform)

# Print head
y_train.head(3)

Unnamed: 0,delay_category
1554985,4
1191845,4
804690,0


# Test Data

In [24]:
# copy the data
# copy the data
X_test_min_max_scaled = X_test.copy()
  
# apply normalization techniques by Column 
column = 'AGE'
X_test_min_max_scaled[column] = (X_test_min_max_scaled[column] - X_test_min_max_scaled[column].min()) / (X_test_min_max_scaled[column].max() - X_test_min_max_scaled[column].min())    
column = 'CART_VALUE'
X_test_min_max_scaled[column] = (X_test_min_max_scaled[column] - X_test_min_max_scaled[column].min()) / (X_test_min_max_scaled[column].max() - X_test_min_max_scaled[column].min())    


In [25]:
# Test data same steps:
X_test = X_test_min_max_scaled
X_test.shape


(336240, 13)

In [26]:
cols = ['ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week' ]

# Encode labels of multiple columns at once
X_test[cols] = X_test[cols].apply(le.fit_transform)

In [27]:
dummies_X_Test = pd.get_dummies(X_test[['ORDER_DATE_is_month_start',
        'ORDER_DATE_is_month_end', 'ORDER_DATE_is_weekend','SHOP', 
        'PAYMENT_INFO', 'GENDER','Default_Flag']])

In [28]:
dummies_X_Train.shape

(1344958, 27)

In [29]:
dummies_X_Test.join(X_test[['AGE','CART_VALUE',
        'ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week'  ]])

Unnamed: 0,ORDER_DATE_is_weekend,Default_Flag,ORDER_DATE_is_month_start_False,ORDER_DATE_is_month_start_True,ORDER_DATE_is_month_end_False,ORDER_DATE_is_month_end_True,SHOP_Baby & Child,SHOP_Car Accessories,SHOP_DIY Superstore & Tools,SHOP_Electronics,SHOP_Fashion,SHOP_Food,SHOP_Furniture,SHOP_Jewellery,SHOP_Media,SHOP_Shoes,SHOP_Sports,SHOP_Toys,SHOP_Young Fashion,PAYMENT_INFO_CONTRACT_TERMINATION,PAYMENT_INFO_INKASSO_CLOSED,PAYMENT_INFO_INKASSO_PENDING,PAYMENT_INFO_PAID_FULL,PAYMENT_INFO_PAID_OVER,GENDER_f,GENDER_m,GENDER_u,AGE,CART_VALUE,ORDER_DATE_year,ORDER_DATE_month,ORDER_DATE_day,ORDER_DATE_day_of_week
1657786,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.426087,0.06375,5,1,8,1
1426112,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0.513043,0.08375,4,9,7,3
573365,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0.521739,0.10000,3,1,1,5
744536,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.330435,0.39875,3,6,3,3
177267,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.556522,0.11625,1,10,9,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1497957,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.460870,0.05750,4,10,16,1
209227,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0.347826,0.06375,1,11,10,0
179080,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0.356522,0.06875,1,10,11,6
1032146,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.513043,0.02125,4,1,17,1


In [30]:
X_test_final = dummies_X_Test.join(X_test[['AGE','CART_VALUE',
        'ORDER_DATE_year','ORDER_DATE_month','ORDER_DATE_day','ORDER_DATE_day_of_week'  ]])


In [31]:
y_test = pd.DataFrame(y_test)

In [32]:
# Encode labels of multiple columns at once

y_test = y_test.apply(le.fit_transform)

# Print head
y_test.head(3)

Unnamed: 0,delay_category
1657786,4
1426112,4
573365,4


In [33]:
list(le.classes_)

['1-14 days', '15-29 days', '30-59 days', '60+ days', 'On Time']

### Check the final X and y and train and test

In [34]:
print (X_test_final.shape)
print (X_train_final.shape)
print (y_test.shape)
print (y_train.shape)

(336240, 33)
(1344958, 33)
(336240, 1)
(1344958, 1)


## MODEL CREATION 

In [35]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=500,criterion='gini',random_state=4,max_depth=9)



In [36]:
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train_final,y_train.values.ravel() )  



RandomForestClassifier(max_depth=9, n_estimators=500, random_state=4)

In [37]:
y_pred=clf.predict(X_test_final)

In [38]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5345021413276231


In [54]:
from sklearn.metrics import classification_report


In [55]:
print (classification_report(y_test,y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.41      0.07      0.12    105682
           1       0.00      0.00      0.00     31477
           2       0.75      0.00      0.00     14662
           3       0.95      0.20      0.33      9543
           4       0.54      0.97      0.69    174876

    accuracy                           0.53    336240
   macro avg       0.53      0.25      0.23    336240
weighted avg       0.47      0.53      0.41    336240



  _warn_prf(average, modifier, msg_start, len(result))
