In [30]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score,mean_absolute_error

In [2]:
df = pd.read_csv('E:/GUVI/Project/shipping_ecommerce.csv')
df.head()

Unnamed: 0,Customer_care_calls,Customer_rating,Prior_purchases,Discount_offered,Weight_in_gms,Warehouse_block,Mode_of_Shipment,Product_importance,Gender,Class
0,5,4,2,10,5395,A,Ship,medium,M,1
1,4,3,2,6,5867,F,Ship,medium,F,0
2,3,4,2,2,5957,D,Ship,medium,M,0
3,3,1,2,27,2551,D,Ship,medium,M,1
4,7,5,4,9,1329,B,Ship,medium,M,1


In [3]:
df.shape

(10998, 10)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10998 entries, 0 to 10997
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Customer_care_calls  10998 non-null  int64 
 1   Customer_rating      10998 non-null  int64 
 2   Prior_purchases      10998 non-null  int64 
 3   Discount_offered     10998 non-null  int64 
 4   Weight_in_gms        10998 non-null  int64 
 5   Warehouse_block      10998 non-null  object
 6   Mode_of_Shipment     10998 non-null  object
 7   Product_importance   10998 non-null  object
 8   Gender               10998 non-null  object
 9   Class                10998 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 859.3+ KB


# Checking for Null Values

In [5]:
df.isnull().sum()

Customer_care_calls    0
Customer_rating        0
Prior_purchases        0
Discount_offered       0
Weight_in_gms          0
Warehouse_block        0
Mode_of_Shipment       0
Product_importance     0
Gender                 0
Class                  0
dtype: int64

In [6]:
df.isna().sum()

Customer_care_calls    0
Customer_rating        0
Prior_purchases        0
Discount_offered       0
Weight_in_gms          0
Warehouse_block        0
Mode_of_Shipment       0
Product_importance     0
Gender                 0
Class                  0
dtype: int64

# Checking for unique values to encode categorical data

In [7]:
for i in df.columns:
    print(i,df[i].nunique())

Customer_care_calls 6
Customer_rating 5
Prior_purchases 8
Discount_offered 65
Weight_in_gms 4034
Warehouse_block 5
Mode_of_Shipment 3
Product_importance 3
Gender 2
Class 2


In [8]:
df1 = df.copy()

In [9]:
df1.head()

Unnamed: 0,Customer_care_calls,Customer_rating,Prior_purchases,Discount_offered,Weight_in_gms,Warehouse_block,Mode_of_Shipment,Product_importance,Gender,Class
0,5,4,2,10,5395,A,Ship,medium,M,1
1,4,3,2,6,5867,F,Ship,medium,F,0
2,3,4,2,2,5957,D,Ship,medium,M,0
3,3,1,2,27,2551,D,Ship,medium,M,1
4,7,5,4,9,1329,B,Ship,medium,M,1


# Encoding the categorical data using LabelEncoder

In [10]:
le = LabelEncoder()
df1['Warehouse_block'] = le.fit_transform(df1['Warehouse_block'])
df1['Mode_of_Shipment'] =le.fit_transform(df1['Mode_of_Shipment'])
df1['Product_importance'] = le.fit_transform(df1['Product_importance'])
df1['Gender'] = le.fit_transform(df1['Gender'])

In [11]:
df1.head()

Unnamed: 0,Customer_care_calls,Customer_rating,Prior_purchases,Discount_offered,Weight_in_gms,Warehouse_block,Mode_of_Shipment,Product_importance,Gender,Class
0,5,4,2,10,5395,0,2,2,1,1
1,4,3,2,6,5867,4,2,2,0,0
2,3,4,2,2,5957,3,2,2,1,0
3,3,1,2,27,2551,3,2,2,1,1
4,7,5,4,9,1329,1,2,2,1,1


# Creating list of dtype 'int64' to check the distribution.

In [12]:
num_col = [x for x in df1.columns if df1[x].dtype == 'int64']

In [13]:
for i in num_col:
    print(i,df1[i].skew())

Customer_care_calls 0.39177325169075056
Customer_rating 0.004521408219225412
Prior_purchases 1.681914754497086
Discount_offered 1.7987871854055941
Weight_in_gms -0.24958549524039783
Class -0.39448713850496814


if skew value close to zero or equal to zero it is normally distributed if it is greater or lesser than 1 then it is positive or negatively skewed.

In [14]:
#usiing log transormation to correct the skewness
df1['Prior_purchases'] = np.log(df1['Prior_purchases'])
df1['Discount_offered'] = np.log(df1['Discount_offered'])

In [15]:
for i in num_col:
    print(i,df1[i].skew())

Customer_care_calls 0.39177325169075056
Customer_rating 0.004521408219225412
Prior_purchases 0.46284968814319644
Discount_offered 0.21823367846217892
Weight_in_gms -0.24958549524039783
Class -0.39448713850496814


# Assigning X,y with appropriate variables

In [17]:
X = df1.iloc[:,0:8]
y = df1.iloc[:,9:]

In [19]:
y

Unnamed: 0,Class
0,1
1,0
2,0
3,1
4,1
...,...
10993,1
10994,1
10995,1
10996,1


# Using StandardScaler to scale the values with mean of '0' and standard deviation of '1'. 

In [20]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

In [21]:
X_scaled

array([[ 0.82828724,  0.71418247, -1.32319831, ..., -1.56559075,
         0.6383826 ,  1.03565174],
       [-0.04779499,  0.00675395, -1.32319831, ...,  1.11796539,
         0.6383826 ,  1.03565174],
       [-0.92387723,  0.71418247, -1.32319831, ...,  0.44707635,
         0.6383826 ,  1.03565174],
       ...,
       [-0.92387723, -0.70067456,  2.91478614, ...,  1.11796539,
         0.6383826 ,  1.03565174],
       [-0.92387723, -1.40810307, -0.25552443, ...,  1.11796539,
        -2.00404564,  1.03565174],
       [-0.04779499,  0.00675395, -0.25552443, ..., -0.89470172,
         0.6383826 ,  1.03565174]])

# Using train_test_split to split the dataset

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=42)

In [24]:
X_train.shape, Y_test.shape

((7698, 8), (3300, 1))

# Using DecisionTreeClassifier for prediction of Target variable

In [32]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,Y_train)
predict = dtc.predict(X_train)
print('mae',mean_absolute_error(Y_train,predict))
print('r2',r2_score(Y_train,predict))

mae 0.00012990387113535984
r2 0.9994599902057459


In [35]:
Y_train['Predict'] = predict

In [36]:
Y_train.head(20)

Unnamed: 0,Class,Predict
9956,1,1
1507,1,1
6382,1,1
1559,1,1
5703,1,1
6075,1,1
3692,0,0
8990,1,1
4466,1,1
1836,0,0
