In [52]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [53]:
df = pd.read_csv("Europe Sales Records.csv")
df.head()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Europe,Czech Republic,Beverages,Offline,C,9/12/2011,478051030,9/29/2011,4778,47.45,31.79,226716.1,151892.62,74823.48
1,Europe,Bosnia and Herzegovina,Clothes,Online,M,10/14/2013,919133651,11/4/2013,927,109.28,35.84,101302.56,33223.68,68078.88
2,Europe,Austria,Cereal,Offline,C,8/13/2014,987410676,9/6/2014,5616,205.7,117.11,1155211.2,657689.76,497521.44
3,Europe,Bulgaria,Office Supplies,Online,L,10/31/2010,672330081,11/29/2010,6266,651.21,524.96,4080481.86,3289399.36,791082.5
4,Europe,Estonia,Fruits,Online,L,9/28/2016,579463422,11/1/2016,4958,9.33,6.92,46258.14,34309.36,11948.78


In [54]:
df.shape


(1330, 14)

In [55]:
df.columns

Index(['Region', 'Country', 'Item Type', 'Sales Channel', 'Order Priority',
       'Order Date', 'Order ID', 'Ship Date', 'Units Sold', 'Unit Price',
       'Unit Cost', 'Total Revenue', 'Total Cost', 'Total Profit'],
      dtype='object')

In [56]:
df.isnull().sum()

Region            0
Country           0
Item Type         0
Sales Channel     0
Order Priority    0
Order Date        0
Order ID          0
Ship Date         0
Units Sold        0
Unit Price        0
Unit Cost         0
Total Revenue     0
Total Cost        0
Total Profit      0
dtype: int64

In [57]:
df["Order Date"] = pd.to_datetime(df["Order Date"])
df["Ship Date"] = pd.to_datetime(df["Ship Date"])
df.head()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
0,Europe,Czech Republic,Beverages,Offline,C,2011-09-12,478051030,2011-09-29,4778,47.45,31.79,226716.1,151892.62,74823.48
1,Europe,Bosnia and Herzegovina,Clothes,Online,M,2013-10-14,919133651,2013-11-04,927,109.28,35.84,101302.56,33223.68,68078.88
2,Europe,Austria,Cereal,Offline,C,2014-08-13,987410676,2014-09-06,5616,205.7,117.11,1155211.2,657689.76,497521.44
3,Europe,Bulgaria,Office Supplies,Online,L,2010-10-31,672330081,2010-11-29,6266,651.21,524.96,4080481.86,3289399.36,791082.5
4,Europe,Estonia,Fruits,Online,L,2016-09-28,579463422,2016-11-01,4958,9.33,6.92,46258.14,34309.36,11948.78


In [58]:
df["Ordered Days"] = (df["Ship Date"]-df["Order Date"]).dt.days
df.head()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit,Ordered Days
0,Europe,Czech Republic,Beverages,Offline,C,2011-09-12,478051030,2011-09-29,4778,47.45,31.79,226716.1,151892.62,74823.48,17
1,Europe,Bosnia and Herzegovina,Clothes,Online,M,2013-10-14,919133651,2013-11-04,927,109.28,35.84,101302.56,33223.68,68078.88,21
2,Europe,Austria,Cereal,Offline,C,2014-08-13,987410676,2014-09-06,5616,205.7,117.11,1155211.2,657689.76,497521.44,24
3,Europe,Bulgaria,Office Supplies,Online,L,2010-10-31,672330081,2010-11-29,6266,651.21,524.96,4080481.86,3289399.36,791082.5,29
4,Europe,Estonia,Fruits,Online,L,2016-09-28,579463422,2016-11-01,4958,9.33,6.92,46258.14,34309.36,11948.78,34


In [59]:
df.loc[df['Total Profit'] >100000,'Result'] =1
df.loc[df['Total Profit'] <100000,'Result'] =0
df.tail()

Unnamed: 0,Region,Country,Item Type,Sales Channel,Order Priority,Order Date,Order ID,Ship Date,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit,Ordered Days,Result
1325,Europe,Norway,Personal Care,Offline,M,2014-01-14,634033286,2014-01-15,3394,81.73,56.67,277391.62,192337.98,85053.64,1,0.0
1326,Europe,Ukraine,Cereal,Offline,L,2014-04-14,559183347,2014-05-21,3633,205.7,117.11,747308.1,425460.63,321847.47,37,1.0
1327,Europe,Armenia,Meat,Offline,M,2015-11-09,781416594,2015-12-23,7390,421.89,364.69,3117767.1,2695059.1,422708.0,44,1.0
1328,Europe,Denmark,Clothes,Offline,H,2012-05-09,713357150,2012-06-03,7088,109.28,35.84,774576.64,254033.92,520542.72,25,1.0
1329,Europe,Finland,Clothes,Online,L,2014-04-22,906794202,2014-05-11,9410,109.28,35.84,1028324.8,337254.4,691070.4,19,1.0


In [60]:
df['Region'].unique()

array(['Europe'], dtype=object)

In [61]:
len(df['Country'].unique())

48

In [62]:
len(df['Item Type'].unique())

12

In [63]:
df_train = df.drop(columns = ['Region', 'Order Date', 'Order ID', 'Ship Date','Result','Ordered Days'],axis=1)
df_test = df['Result']


In [64]:
x_train,x_test,y_train,y_test = train_test_split(df_train,df_test, test_size = 0.2, random_state=42)

In [65]:
x_train.head()

Unnamed: 0,Country,Item Type,Sales Channel,Order Priority,Units Sold,Unit Price,Unit Cost,Total Revenue,Total Cost,Total Profit
844,Latvia,Office Supplies,Online,L,4375,651.21,524.96,2849043.75,2296700.0,552343.75
170,Slovenia,Household,Online,C,6992,668.27,502.54,4672543.84,3513759.68,1158784.16
942,Estonia,Beverages,Offline,L,8222,47.45,31.79,390133.9,261377.38,128756.52
461,United Kingdom,Personal Care,Offline,L,4509,81.73,56.67,368520.57,255525.03,112995.54
209,Netherlands,Baby Food,Offline,C,4077,255.28,159.42,1040776.56,649955.34,390821.22


In [70]:
from sklearn import set_config
set_config(display='diagram')

In [77]:
ohe = OneHotEncoder()
ohe_cols=[0,1,2,3]
trf1 = ColumnTransformer([('ohe_cat',ohe,ohe_cols)],remainder = 'passthrough')

In [78]:
trf1

In [79]:
trf2 = DecisionTreeClassifier()

In [80]:
pipe = Pipeline([('trf1',trf1),('trf2',trf2)])
pipe

In [81]:
pipe.fit(x_train,y_train)

In [87]:
 predicated=pipe.predict(x_test)

In [88]:
from sklearn.metrics import accuracy_score
accuracy_score(predicated,y_test)

1.0

In [91]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(predicated,y_test)
cm

array([[ 76,   0],
       [  0, 190]])