In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [None]:
# Colours used as the default seaborn/matplotlib colour cycle in this notebook.
custom_colors = ["#023e8a", "#0096c7","#90e0ef","#ff5400","#ffbd00"]
# sns.set_palette() returns None, so build the palette object first and then
# register it; customPalette now actually holds the palette.
customPalette = sns.color_palette(custom_colors)
sns.set_palette(customPalette)

In [None]:
# Load the dataset from "term.csv" (relative to the working directory) and preview the first rows.
df=pd.read_csv("term.csv")
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Drop every row that has at least one missing value, then re-check the per-column null counts.
df=df.dropna()
df.isnull().sum()

In [None]:
# Print the distinct values found in each of these columns, one column per line.
for column in ('Warehouse_block', 'Gender', 'Mode_of_Shipment',
               'Customer_rating', 'Reached.on.Time_Y.N', 'Customer_care_calls'):
    print(column, df[column].unique())

In [None]:
# Translate the binary delivery flag into a readable label.
def Reached_yn(i):
    """Return "On Time" when the flag equals 0, otherwise "Delayed"."""
    return "On Time" if i == 0 else "Delayed"
# Add a human-readable 'Reached' column derived from the 'Reached.on.Time_Y.N' flag, then display it.
df['Reached']=df['Reached.on.Time_Y.N'].apply(Reached_yn)
df['Reached']  

In [None]:
# Preview the first rows, now including the new 'Reached' column.
df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Pairwise relationship plots across the frame's numeric columns.
sns.pairplot(df)

In [None]:
# Median of every numeric column for orders with 5 or more customer-care calls.
# numeric_only=True is needed because the frame still holds text columns
# (e.g. Gender, Reached); pandas >= 2.0 raises a TypeError instead of skipping them.
df[df['Customer_care_calls']>=5].median(numeric_only=True)

Prior_purchases and Discount_offered change drastically beyond the 75th percentile. Let's analyze them further.

In [None]:
# Box plots of the two columns suspected of containing outliers, drawn on a 10x5 figure.
fig, ax = plt.subplots(figsize=(10,5))
boxplot = df.boxplot(column=['Prior_purchases','Discount_offered'], ax=ax)

Prior_purchases and Discount_offered both have some outliers.

In [None]:
# Inspect the upper tail of both columns at increasingly high quantiles.
outlier_cols = ['Prior_purchases','Discount_offered']
for q in (0.75, 0.80, 0.85, 0.90, 0.95, 0.99):
    print(df[outlier_cols].quantile(q))

In [None]:
# Number of rows at or above each threshold (summing a boolean mask counts the True entries).
print((df['Prior_purchases']>=10).sum())
print((df['Discount_offered']>=20).sum())

Since Prior_purchases has very few outliers, they can simply be removed from the data. Discount_offered, however, has around 15-20% outliers, so it needs some treatment.

We treat the 75th-percentile value of Discount_offered as the maximum
and replace any higher discount with it in the code below.

In [None]:
# Cap Discount_offered at 13: every value >= 13 becomes 13 (clip(upper=13) is equivalent).
# The result is assigned back to the column explicitly; the previous chained
# `df[col].mask(..., inplace=True)` acts on an intermediate Series and does not
# reliably update `df` (it is a no-op under pandas Copy-on-Write).
DISCOUNT_CAP = 13
df['Discount_offered'] = df['Discount_offered'].clip(upper=DISCOUNT_CAP)
# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(20,6))
# Restrict to numeric columns explicitly: the frame still holds text columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of skipping them.
corr=df.select_dtypes(include='number').corr()
sns.heatmap(corr,annot=True,cmap='BuPu', robust=True, center=0,
            square=True, linewidths=.5)

plt.title('Correlation between Fields', fontsize=20,font="Serif")
plt.show()

In [None]:
# Delayed vs. on-time orders: bar chart plus a table of absolute counts and percentages.
sns.countplot(x='Reached',data=df)
reached = df['Reached']
reached_counts = reached.value_counts()
reached_share = reached.value_counts(normalize=True) * 100
pd.concat([reached_counts, reached_share], axis=1, keys=('counts','percentage'))


Roughly 60% of the products are delayed.

In [None]:
# Which gender placed more orders: bar chart plus the number of order IDs per gender.
sns.countplot(data=df, x='Gender')
df.groupby('Gender')['ID'].count()

In [None]:
# Summary table per (shipment mode, delivery outcome):
#   Cost_of_the_Product -> mean product cost
#   Reached             -> number of orders in the group
#   Weight_in_gms       -> total shipped weight
# The earlier pivot_table listed 'Cost_of_the_Product' twice in its aggfunc dict, so the
# np.sum entry was silently overwritten by np.mean; the mean is what was actually shown
# and is what is kept here. Grouping directly also avoids using 'Reached' as both an
# index level and a value column inside pivot_table.
shipment = df[['Mode_of_Shipment','Reached','Cost_of_the_Product','Weight_in_gms']]
grouped = shipment.groupby(['Mode_of_Shipment','Reached'])
table = pd.DataFrame({
    'Cost_of_the_Product': grouped['Cost_of_the_Product'].mean(),
    'Reached': grouped.size(),
    'Weight_in_gms': grouped['Weight_in_gms'].sum(),
})

# Share of all orders / of total weight per group, rounded so the strings stay readable.
table['% Reached'] = (table['Reached'] / table['Reached'].sum() * 100).round(2).astype(str) + '%'
table['% Weight'] = (table['Weight_in_gms'] / table['Weight_in_gms'].sum() * 100).round(2).astype(str) + '%'
table

Ships cause the most delays, but ship is also the only mode delivering very heavy loads: it carries about 36.3 + 31.4 = 67.7% of the total weight and 7462 of the 10999 products.

In [None]:
# Number of orders per shipment mode, split by product importance.
sns.countplot(x='Mode_of_Shipment',hue='Product_importance',data=df)

From the plot above, it is clear that ships deliver important products in much larger quantities than flight or road.

In [None]:
# Bivariate analysis: product weight vs. cost, coloured by shipment mode.
fig, ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=df, x='Weight_in_gms', y='Cost_of_the_Product',
                hue='Mode_of_Shipment', ax=ax)

In [None]:
# Bivariate analysis: product weight vs. cost, coloured by product importance.
fig, ax = plt.subplots(figsize=(8,5))
sns.scatterplot(data=df, x='Weight_in_gms', y='Cost_of_the_Product',
                hue='Product_importance', ax=ax)

1. First scatter plot - a few outliers are clearly visible: products with a low cost but a very heavy weight. They occur in the ship delivery mode, with one outlier in the road delivery mode as well.
2. Second scatter plot - the outliers belong to medium-importance products.

In [None]:
# Rows that are cheap (cost <= 180) yet heavy (weight > 6200 g).
is_cheap = df['Cost_of_the_Product'] <= 180.0
is_heavy = df['Weight_in_gms'] > 6200.0
Outliers = df.loc[is_cheap & is_heavy]
Outliers

In [None]:
# Remove the cheap-but-heavy rows (cost <= 180 and weight > 6200 g) from the frame in place.
outlier_rows = df.index[(df['Cost_of_the_Product'] <= 180.0) & (df['Weight_in_gms'] > 6200.0)]
df.drop(index=outlier_rows, inplace=True)

In [None]:
# Display the frame after the outlier rows were dropped.
df

In [None]:
# Order counts per customer rating, split by delivery outcome; the PiYG palette applies to this plot only.
sns.countplot(x='Customer_rating',hue='Reached',data=df,palette='PiYG')

In [None]:
# Re-draw the pairwise plots on the current frame to look for any remaining outliers.
sns.pairplot(df)

In [None]:
# Re-apply the notebook's colour cycle.
custom_colors = ["#023e8a", "#0096c7","#90e0ef","#ff5400","#ffbd00"]
# sns.set_palette() returns None, so keep the palette object itself in customPalette
# and register it in a separate call.
customPalette = sns.color_palette(custom_colors)
sns.set_palette(customPalette)

In [None]:
# Orders per warehouse block, split by the raw delivery flag, plus the order count per block.
sns.countplot(data=df, x='Warehouse_block', hue='Reached.on.Time_Y.N')
df.groupby('Warehouse_block')['ID'].count()

In [None]:
def triple_plot(x, title,c):
    """Show a histogram with KDE, a boxplot and a violin plot of one variable.

    The three panels are stacked vertically and share the x-axis.

    Parameters
    ----------
    x : sequence of numbers (e.g. a pandas Series)
        Values to plot along the x-axis of every panel.
    title : str
        Figure-level title.
    c : matplotlib colour
        Colour used for all three panels.
    """
    fig, ax = plt.subplots(3,1,figsize=(20,12),sharex=True)
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot and
    # keeps the density-normalised histogram with a KDE overlay.
    sns.histplot(x=x, kde=True, stat="density", ax=ax[0], color=c)
    ax[0].set(xlabel=None)
    ax[0].set_title('Histogram + KDE')
    # Pass the values as x= explicitly: newer seaborn treats the first positional
    # argument as `data`, which would change orientation and break the shared x-axis.
    sns.boxplot(x=x, ax=ax[1], color=c)
    ax[1].set(xlabel=None)
    ax[1].set_title('Boxplot')
    sns.violinplot(x=x, ax=ax[2], color=c)
    ax[2].set(xlabel=None)
    ax[2].set_title('Violin plot')
    fig.suptitle(title, fontsize=25)
    plt.tight_layout(pad=3.0)
    plt.show()

In [None]:
# Distribution of Cost_of_the_Product, drawn in the first custom colour.
triple_plot(df["Cost_of_the_Product"],'Cost_of_the_Product',custom_colors[0])

In [None]:
# Binary encoding: Gender 'F' -> 0, 'M' -> 1
gender_codes = {'F': 0, 'M': 1}
df['Gender'] = df['Gender'].replace(gender_codes)

# One-hot encoding of the remaining categorical columns
# (each is replaced by one indicator column per category)
one_hot_columns = ['Mode_of_Shipment','Warehouse_block','Product_importance']
df = pd.get_dummies(df, columns=one_hot_columns)
df

In [None]:
# List the column names after encoding.
df.columns

In [None]:
# Model inputs: the numeric columns plus the one-hot indicator columns.
feature_columns = [
    'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
    'Prior_purchases', 'Gender', 'Discount_offered', 'Weight_in_gms',
    'Mode_of_Shipment_Flight', 'Mode_of_Shipment_Road', 'Mode_of_Shipment_Ship',
    'Warehouse_block_A', 'Warehouse_block_B', 'Warehouse_block_C',
    'Warehouse_block_D', 'Warehouse_block_F',
    'Product_importance_high', 'Product_importance_low', 'Product_importance_medium',
]
X = df[feature_columns]
X

In [None]:
# Target: the 'Reached.on.Time_Y.N' flag as a 1-D Series.
# scikit-learn estimators expect y with shape (n_samples,); a single-column
# DataFrame triggers a DataConversionWarning on every fit.
y=df['Reached.on.Time_Y.N']
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test =train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import metrics
# plot_roc_curve is no longer imported: it was unused here and was removed in
# scikit-learn 1.2, so importing it made the whole cell fail on current releases.
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Stochastic models share the seed already used for the SVM so results are repeatable.
dt = DecisionTreeClassifier(random_state=43)
ad = AdaBoostClassifier(random_state=43)
kn = KNeighborsClassifier()
lg = LogisticRegression()
svm= SVC(random_state = 43, C = 10, gamma = 0.1, kernel ='rbf')
rf= RandomForestClassifier(random_state=43)
xg = XGBClassifier(random_state=43)

# Flatten the targets once: works whether y is a Series or a one-column DataFrame
# and avoids the column-vector warning during fit.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)
y_all_1d = np.ravel(y)

# Fit each model on the training split and report hold-out accuracy,
# mean 5-fold cross-validation accuracy on the full data, and hold-out ROC AUC.
models = [ dt,ad, kn, svm,xg,rf,lg]
for model in models:
    model.fit(x_train, y_train_1d)
    y_pred = model.predict(x_test)
    # ROC AUC needs a continuous score; with hard 0/1 predictions the curve has a
    # single operating point. Use class-1 probabilities when available, otherwise
    # the decision function (SVC without probability=True).
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(x_test)[:, 1]
    else:
        y_score = model.decision_function(x_test)
    # Built-in round() works for both numpy scalars and plain Python floats.
    scores = round(cross_val_score(model, X, y_all_1d, cv=5).mean(), 3)
    accuracy = round(metrics.accuracy_score(y_test_1d, y_pred), 3)
    ROC = round(metrics.roc_auc_score(y_test_1d, y_score), 3)
    print(model, '\n', 'Accuracy:', accuracy,'\n', 'mean_CV_score:',scores, '\n' , 'ROC:', ROC,'\n')

In [None]:
# Raw feature-importance array of the fitted AdaBoost model (one value per input column).
ad.feature_importances_  

In [None]:
# Feature importances of the AdaBoost model, sorted ascending so the most
# important feature ends up at the top of the horizontal bar chart.
features=x_train.columns
importances = ad.feature_importances_
indices = np.argsort(importances)

# Use a fresh figure: plt.figure(1) re-selects figure number 1, so bars from an
# earlier plot could persist if that figure were still open.
fig, ax = plt.subplots()
ax.set_title('Feature Importances (AdaBoost)')
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels(features[indices])
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# Feature importances of the decision-tree model, sorted ascending so the most
# important feature ends up at the top of the horizontal bar chart.
features=x_train.columns
importances = dt.feature_importances_
indices = np.argsort(importances)

# Use a fresh figure: plt.figure(1) re-selects figure number 1, so bars from an
# earlier plot could persist if that figure were still open.
fig, ax = plt.subplots()
ax.set_title('Feature Importances (Decision Tree)')
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels(features[indices])
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# Feature importances of the XGBoost model, sorted ascending so the most
# important feature ends up at the top of the horizontal bar chart.
features=x_train.columns
importances = xg.feature_importances_
indices = np.argsort(importances)

# Use a fresh figure: plt.figure(1) re-selects figure number 1, so bars from an
# earlier plot could persist if that figure were still open.
fig, ax = plt.subplots()
ax.set_title('Feature Importances (XGBoost)')
ax.barh(range(len(indices)), importances[indices], color='b', align='center')
ax.set_yticks(range(len(indices)))
ax.set_yticklabels(features[indices])
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Feature columns used for PCA and clustering.
# The column names must be wrapped in a list: df['a', 'b', ...] builds a single
# tuple key and raises KeyError. .copy() gives an independent frame, so adding
# columns later does not act on a view of the original data.
df2=df[['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Mode_of_Shipment_Flight',
       'Mode_of_Shipment_Road', 'Mode_of_Shipment_Ship', 'Warehouse_block_A',
       'Warehouse_block_B', 'Warehouse_block_C', 'Warehouse_block_D',
       'Warehouse_block_F', 'Product_importance_high',
       'Product_importance_low', 'Product_importance_medium']].copy()
# From here on `df` refers to this feature-only frame (all other columns are dropped).
df=df2

In [None]:
from sklearn.preprocessing import StandardScaler

# Project the feature matrix onto its first two principal components.
# PCA is driven by variance, so the features are standardised first; otherwise
# large-valued columns such as Weight_in_gms dominate the components.
# Standardising here also makes the result independent of whether `df2` has
# already been scaled elsewhere.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(StandardScaler().fit_transform(df2))

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every feature to zero mean and unit variance.
# Note: `df2` is rebound to the scaler's array output from this point on.
scaler = StandardScaler()
scaler.fit(df2)
df2 = scaler.transform(df2)

In [None]:
from sklearn.cluster import KMeans
# Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow plot.
# random_state makes the curve repeatable between runs; n_init=10 is set explicitly
# because its default changed across scikit-learn releases (older ones warn when it is omitted).
wcss = []
for k in range(1,11):
    kmeans=KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=0)
    kmeans.fit(df2)
    wcss.append(kmeans.inertia_)

In [None]:
# Inertia values collected for k = 1..10.
wcss

In [None]:
# Elbow curve: inertia against the number of clusters (1..10).
fig, ax = plt.subplots()
ax.plot(range(1,11), wcss)
ax.set_title("Elbow")
plt.show()

In [None]:
# Final clustering with 6 clusters.
# Fit on the standardised matrix `df2` - the same data the elbow analysis used -
# rather than on the unscaled `df`. This also keeps a previously added 'label'
# column from being fed back in as a feature when the cell is re-run.
kmeans= KMeans(n_clusters=6, init="k-means++", n_init=10, random_state=0)
cluster=kmeans.fit_predict(df2)
cluster

In [None]:
# Number of rows assigned to each cluster.
pd.Series(cluster).value_counts()

In [None]:
# Store each row's cluster assignment in a new 'label' column.
df['label']=cluster

In [None]:
# Distinct cluster labels present in the frame.
df['label'].unique()

In [None]:
from mpl_toolkits import mplot3d


In [None]:
# 3-D scatter of Prior_purchases x Gender x Discount_offered, one colour per cluster.
# Every label present in df['label'] is drawn (previously only clusters 0-2 were
# plotted, although the model produces 6).
fig=plt.figure(figsize=(20,10))
ax = fig.add_subplot(111, projection='3d')

# The first three colours match the earlier hard-coded ones; the list is cycled
# if there are more clusters than colours.
cluster_colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown']
for position, cluster_id in enumerate(sorted(df['label'].unique())):
    members = df[df['label'] == cluster_id]
    ax.scatter(members['Prior_purchases'], members['Gender'], members['Discount_offered'],
               c=cluster_colors[position % len(cluster_colors)], s=60,
               label='Cluster {}'.format(cluster_id))

ax.set_xlabel('Prior_purchases')
ax.set_ylabel('Gender')
ax.set_zlabel('Discount_offered')
ax.legend()

ax.view_init(30, 185)

plt.show()