In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
import joblib

In [2]:
# Processed data after text classification and other calculations
df = pd.read_csv('processed_data_with_K_L_text_clf.csv')

In [3]:
# Preview the first five rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,shipping_weight,quantity,description,total_price,length,width,height,weight,vw,bckt,vol_w,desc_clf
0,0,365.0,10.0,"Notebook, Envelopes, Gift tags",2799.0,22.0,17.0,2.0,365.0,500.0,0,187.0,0
1,1,1152.0,1.0,Handloom Saree,4082.0,48.0,32.0,3.0,1000.0,2104.75,3,1152.0,1
2,2,1000.0,1.0,Handloom Saree,8197.0,44.0,38.0,2.0,1000.0,1000.0,1,836.0,1
3,3,405.0,1.0,Ladies Watch,1799.0,18.0,10.0,9.0,250.0,372.0,0,405.0,0
4,4,500.0,1.0,Man Watch,4000.0,18.0,10.0,9.0,500.0,500.0,0,405.0,0


In [4]:
# Discard the saved-index column and the text description before modelling.
# NOTE(review): the preview also lists an 'Unnamed: 0.1' column that is kept here
# and therefore ends up among the model features - confirm that is intended.
df = df.drop(columns=['Unnamed: 0', 'description'])

In [5]:
# Keep rows where at least one of height / length is present.
# The explicit .copy() gives the following cells (which assign to columns of this
# frame) an independent frame rather than a slice of the loaded data, which avoids
# pandas' SettingWithCopyWarning on those assignments.
# NOTE(review): rows with a missing length but a present height pass this filter,
# and length is not filled in any later visible cell - confirm `|` (not `&`) is intended.
df = df[df.height.notna() | df.length.notna()].copy()

In [6]:
# Missing heights default to 1.
df['height'] = df['height'].fillna(1)

In [7]:
# Missing quantities default to 1.
df['quantity'] = df['quantity'].fillna(1)

In [8]:
def outliers_vw(df, k=1.5):
    """Return the rows of ``df`` whose ``vw`` value is an IQR outlier.

    A row is an outlier when its ``vw`` lies strictly above ``Q3 + k * IQR``
    or strictly below ``Q1 - k * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``vw`` column. It is not modified.
    k : float, default 1.5
        Whisker multiplier. The default reproduces the original fixed
        1.5 * IQR fences.

    Returns
    -------
    pandas.DataFrame
        The outlier rows with their original index labels, so the result's
        ``.index`` can be passed directly to ``DataFrame.drop``.
    """
    values = df["vw"]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = k * (q3 - q1)
    # NaN compares False on both sides, so missing values are never flagged.
    is_outlier = (values > q3 + spread) | (values < q1 - spread)
    return df[is_outlier]

In [9]:
def outliers_w(df, k=1.5):
    """Return the rows of ``df`` whose ``width`` value is an IQR outlier.

    A row is an outlier when its ``width`` lies strictly above ``Q3 + k * IQR``
    or strictly below ``Q1 - k * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``width`` column. It is not modified.
    k : float, default 1.5
        Whisker multiplier. The default reproduces the original fixed
        1.5 * IQR fences.

    Returns
    -------
    pandas.DataFrame
        The outlier rows with their original index labels, so the result's
        ``.index`` can be passed directly to ``DataFrame.drop``.
    """
    values = df["width"]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = k * (q3 - q1)
    # NaN compares False on both sides, so missing values are never flagged.
    is_outlier = (values > q3 + spread) | (values < q1 - spread)
    return df[is_outlier]

In [10]:
# Remove width outliers first, then vw outliers computed on what remains
# (the second pass sees quartiles of the already-trimmed frame).
df = df.drop(index=outliers_w(df).index)
df = df.drop(index=outliers_vw(df).index)
# df2 is a second name for the same frame, not a copy: later changes made
# through df2 are visible through df as well.
df2 = df

In [11]:
# Missing weights default to 0.
df2['weight'] = df2['weight'].fillna(0)

In [12]:
# vw was only needed for outlier removal; drop it in place so the change is
# also seen through the `df` alias of this frame.
df2.drop(columns=['vw'], inplace=True)

In [13]:
# True if any cell in the frame holds a negative value.
df.lt(0).to_numpy().any()

False

In [14]:
# No negative values were observed, so we proceed without scaling the data. Applying scaling made no large difference in accuracy, so we use the plain provided data for now.

In [15]:
# The bucket label is the target; every remaining column is used as a feature.
Y = df2['bckt']
X = df2.drop(columns=['bckt'])

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [17]:
# Random forest: 50 entropy-split trees, depth capped at 20, seeded for repeatability.
# max_features="sqrt" is exactly what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and later removed, so it fails on current releases.
rf = RandomForestClassifier(
    n_estimators=50,
    criterion="entropy",
    max_depth=20,
    max_features="sqrt",
    random_state=42,
)
rf.fit(X_train, Y_train)

RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=50,
                       random_state=42)

In [18]:
# Hold-out score of the random forest, as a percentage.
100 * rf.score(X_test, Y_test)

88.71508379888267

In [19]:
# Distance-weighted k-NN over the 100 nearest neighbours, using 10 parallel jobs.
knn = KNN(
    n_neighbors=100,
    weights='distance',
    n_jobs=10,
)

In [20]:
# Fit k-NN on the training split.
knn.fit(X=X_train, y=Y_train)

KNeighborsClassifier(n_jobs=10, n_neighbors=100, weights='distance')

In [21]:
# Hold-out score of k-NN, as a percentage.
100 * knn.score(X_test, Y_test)

86.87150837988827

In [22]:
# AdaBoost with the library's default weak learner, 50 boosting rounds, seeded.
# The former `base_estimator=None` and `algorithm='SAMME.R'` arguments only restated
# defaults (the fitted estimator's repr does not list them). Newer scikit-learn
# releases renamed `base_estimator` to `estimator` and dropped 'SAMME.R', so passing
# them breaks on upgrade; omitting them keeps the same model on the original version.
ADBClf = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)

In [23]:
# Fit AdaBoost on the training split.
ADBClf.fit(X=X_train, y=Y_train)

AdaBoostClassifier(learning_rate=1, random_state=42)

In [24]:
# Hold-out score of AdaBoost, as a percentage.
100 * ADBClf.score(X_test, Y_test)

87.17178770949721

In [25]:
# Gradient boosting with library defaults. It is seeded like the other models in
# this notebook so that the reported score and the persisted ensemble (which
# includes this estimator) can be reproduced on a re-run.
GDBClf = GradientBoostingClassifier(random_state=42)

In [26]:
# Fit gradient boosting on the training split.
GDBClf.fit(X=X_train, y=Y_train)

GradientBoostingClassifier()

In [27]:
# Hold-out score of gradient boosting, as a percentage.
100 * GDBClf.score(X_test, Y_test)

88.40083798882682

In [28]:
# Soft-voting ensemble of the random forest and gradient boosting models,
# with the forest's predicted probabilities weighted twice as heavily.
ensemble = VotingClassifier(
    estimators=[('RF', rf), ('GDB', GDBClf)],
    voting='soft',
    weights=[2, 1],
)
ensemble.fit(X_train, Y_train)

In [29]:
# Hold-out score of the voting ensemble, as a percentage.
100 * ensemble.score(X_test, Y_test)

89.0572625698324

##### After comparing scores of different algorithms, we are going with Random Forest and GradientBoosting for our ensemble model, though there is no huge difference in accuracy.

##### As we can see, our model accuracy is ~89.06%, which is considered good. Further tuning and optimization can be done based on future data and analysis. We could go without the ensemble approach as well, but we observed that in some scenarios different algorithms predicted different inputs accurately, so we have used an ensemble.

##### It is purely a business decision, as there is no huge impact on the accuracy of the model.

In [30]:
# Persist the fitted voting ensemble to 'model.joblib' (relative path).
# NOTE(review): joblib files are pickle-based - load them only from trusted sources,
# and presumably with the same scikit-learn version used here; confirm it is pinned.
joblib.dump(ensemble, 'model.joblib')

['model.joblib']