# Orders dataset

## Loading the data

In [3]:
import pandas as pd

# Read the three order CSV exports, in file order, into df1..df3
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Project Informa/orders/FirstMillionLines.csv",
        "./Project Informa/orders/SecondMillionLines.csv",
        "./Project Informa/orders/ThirdMillionLines.csv",
    )
)

FileNotFoundError: [Errno 2] File b'./Project Informa/orders/FirstMillionLines.csv' does not exist: b'./Project Informa/orders/FirstMillionLines.csv'

In [None]:
# Print (rows, columns) of each loaded frame to spot inconsistent dimensions
for part in (df1, df2, df3):
    print("Shape:", part.shape)

The number of columns is consistent across the three files; the number of rows is not. However, that's fine.

In [None]:
# Stack the three frames row-wise into one table.
# An outer merge on all shared columns (the previous approach) is a join, not a
# stack: it collapses rows that are identical across files and may reorder rows.
# pd.concat keeps every row in file order; ignore_index=True gives a fresh
# 0..n-1 index, which the label-based truncate() further down relies on.
orders = pd.concat([df1, df2, df3], ignore_index=True)

In [None]:
# Show orders info.
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print("Shape:", ...) only appended a misleading "Shape: None" line.
orders.info()

# Show orders head
orders.head()

## Cleaning the Data

Checking for missing data

In [None]:
# Count the null entries in every column of the combined orders table
print("Missing data: ")
print(orders.isna().sum(), "\n")

Not much data is missing: only 155 product numbers. The missing count for reason of return is high only because those orders weren't returns.

Make smaller set for quick exploration

In [None]:
# Keep only the rows labelled 200001 through 300000 (both ends included) as an
# independent copy, so exploration runs faster than on the full table
orders_small = orders.loc[200001:300000].copy()
orders_small.shape

## Look at data types and change if necessary

In [None]:
# Cast the date-like columns to datetime64[ns], then print the resulting dtypes
for date_col in ('DoB', 'Order date'):
    orders_small[date_col] = orders_small[date_col].astype('datetime64[ns]')
orders_small.info()

In [None]:
# Frequency of each distinct value in 'Gender', with missing values counted too
orders_small.loc[:, 'Gender'].value_counts(dropna=False)

In [None]:
# Encode gender as digit strings in a new column: '0' for male, '1' for female,
# '-1' for the backslash-space placeholder value. The cast to int happens in the next cell.
orders_small['Gender int'] = orders_small['Gender'].replace(
    {'Man': '0', 'Woman': '1', '\\ ': '-1'}
)
orders_small['Gender int'].value_counts(dropna=False)

In [None]:
# Convert the digit strings in 'Gender int' to real 64-bit integers
gender_codes = orders_small['Gender int'].astype('int64')
orders_small['Gender int'] = gender_codes
orders_small.info()

## Map category to numbers

In [None]:
# Start the numeric category column as a plain copy of 'Category'
orders_small['Cat. number'] = orders_small['Category']
# Distinct categories, in order of first appearance
cat = orders_small['Category'].unique()

In [None]:
# Replace every category with an integer code.
# pd.factorize numbers the distinct values 0, 1, 2, ... in order of first
# appearance (the same order as .unique()) in a single pass, instead of running
# one full-column replace() per category and going through digit strings.
# Missing values, if any, receive the code -1.
cat_codes, _ = pd.factorize(orders_small['Category'])
orders_small['Cat. number'] = cat_codes

### Do the same for sub category

In [None]:
# Start the numeric sub-category column as a plain copy of 'Sub category'
orders_small['Sub cat. number'] = orders_small['Sub category']
# Distinct sub categories, in order of first appearance
subC = orders_small['Sub category'].unique()

In [None]:
# Replace every sub category with an integer code.
# pd.factorize numbers the distinct values 0, 1, 2, ... in order of first
# appearance (the same order as .unique()) in a single pass, instead of running
# one full-column replace() per sub category and going through digit strings.
# Missing values, if any, receive the code -1.
sub_cat_codes, _ = pd.factorize(orders_small['Sub category'])
orders_small['Sub cat. number'] = sub_cat_codes

### Do the same again for place of residence

In [None]:
# Start the numeric place-of-residence column as a plain copy of 'PoR'
orders_small['PoR number'] = orders_small['PoR']
# Distinct places of residence, in order of first appearance
por = orders_small['PoR'].unique()

In [None]:
# Replace every place of residence with an integer code.
# pd.factorize numbers the distinct values 0, 1, 2, ... in order of first
# appearance (the same order as .unique()) in a single pass, instead of running
# one full-column replace() per place and going through digit strings.
# Missing values, if any, receive the code -1.
por_codes, _ = pd.factorize(orders_small['PoR'])
orders_small['PoR number'] = por_codes

### And month of the year

In [None]:
# Month number (1-12) of each order, taken from the datetime 'Order date' column
orders_small['Month'] = orders_small['Order date'].dt.month

## And birthyear...

In [None]:
# Calendar year of each customer's date of birth, taken from the datetime 'DoB' column
orders_small['Birthyear'] = orders_small['DoB'].dt.year

In [None]:
# Cast the derived columns to int64 so they can appear on the correlation heatmap
for numeric_col in ('Cat. number', 'Sub cat. number', 'PoR number', 'Month', 'Birthyear'):
    orders_small[numeric_col] = orders_small[numeric_col].astype('int64')
orders_small.info()

In [None]:
# Eyeball the first five rows to confirm the new columns look right
orders_small.head(n=5)

Now that all columns have usable types and the data is cleaned, we can start exploring.

## Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt

# Expected vs. actual delivery time; marker colour and marker size both encode the rating
x = orders_small['Expected delivery time']
y = orders_small['Actual delivery time']
color = orders_small['Rating']
scale = color * 50

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x, y, c=color, s=scale, alpha=0.3, edgecolors='none')
ax.grid(True)

# Title and axis labels
ax.set_title('Ratings and Delivery Time')
ax.set_xlabel('Expected delivery time', fontsize=12)
ax.set_ylabel('Actual delivery time', fontsize=12)

plt.show()

As can be seen from the straight line going from (0,0) to about (48,48), most orders arrive at around the expected delivery time. There seems to be no correlation between late orders and ratings.

In [None]:
# Load seaborn and switch on its default plot styling
import seaborn as sns

sns.set()

In [None]:
# selected_feature_names=['Gender', 'Product number', 'Count', 'Price', 'Expected delivery time', 'Actual delivery time', 'Rating']
# sns.pairplot(orders_small, vars=selected_feature_names, height=5)

In [None]:
# # Zoom in
# sns.pairplot(orders_small, x_vars=['Rating', 'Actual delivery time'], y_vars=['Product number'], height=7)

## Predict what products new customers may buy based on their age, gender and possibly place of residence.

In [None]:
# selected_feature_names=['Gender', 'Product number', 'Birthyear']
# sns.pairplot(orders_small, vars=selected_feature_names, height=5)

Focus on age and product number 

In [None]:
# Scatter of birth year against product number for every order in the sample
x = orders_small['Birthyear']
y = orders_small['Product number']

color = 'blue'
scale = 50

fig, ax = plt.subplots()
ax.scatter(x, y, c=color, s=scale, alpha=0.3, edgecolors='none')
ax.grid(True)
fig.set_size_inches(10, 10)

# Set labels. The y label used to be misspelled ('Product nummber') and the
# figure had no title, so it could not be read on its own.
ax.set_title('Birthyear v product number')
ax.set_xlabel(r'Birthyear', fontsize=12)
ax.set_ylabel(r'Product number', fontsize=12)

plt.show()

In [None]:
# Number of non-null birth years among orders in the 'Garden furniture' category
orders_small.loc[orders_small['Category'] == 'Garden furniture', 'Birthyear'].count()

In [None]:
# Birth year against product number, split by gender.
# The masks use the integer 'Gender int' column (0 = male, 1 = female). The raw
# 'Gender' column holds the strings 'Man'/'Woman', so comparing it with 0 or 1
# selected no rows and produced an empty plot.
is_male = orders_small['Gender int'] == 0
is_female = orders_small['Gender int'] == 1

# The title and y label of this figure both name the product number, and the
# next cell already plots 'Category', so the product number is plotted here.
x = orders_small.loc[is_male, 'Birthyear']
y = orders_small.loc[is_male, 'Product number']

x2 = orders_small.loc[is_female, 'Birthyear']
y2 = orders_small.loc[is_female, 'Product number']

scale = 20

fig, ax = plt.subplots()
ax.scatter(x, y, c='blue', s=scale, alpha=0.5, edgecolors='none')
ax.scatter(x2, y2, c='pink', s=scale, alpha=0.3, edgecolors='none')
ax.grid(True)
fig.set_size_inches(10, 10)

# Set labels (y label spelling fixed: 'nummber' -> 'number')
ax.set_title('Birthyear and gender v product number')
ax.set_xlabel(r'Birthyear', fontsize=12)
ax.set_ylabel(r'Product number', fontsize=12)

plt.show()

In [None]:
# Birth year against category, split by gender.
# The masks use the integer 'Gender int' column (0 = male, 1 = female). The raw
# 'Gender' column holds the strings 'Man'/'Woman', so comparing it with 0 or 1
# selected no rows and produced an empty plot.
is_male = orders_small['Gender int'] == 0
is_female = orders_small['Gender int'] == 1

x = orders_small.loc[is_male, 'Birthyear']
y = orders_small.loc[is_male, 'Category']

x2 = orders_small.loc[is_female, 'Birthyear']
y2 = orders_small.loc[is_female, 'Category']

scale = 20

fig, ax = plt.subplots()
ax.scatter(x, y, c='blue', s=scale, alpha=0.5, edgecolors='none')
ax.scatter(x2, y2, c='pink', s=scale, alpha=0.3, edgecolors='none')
ax.grid(True)
fig.set_size_inches(10, 10)

# Set labels (the title used to say 'product number' although categories are plotted)
ax.set_title('Birthyear and gender v category')
ax.set_xlabel(r'Birthyear', fontsize=12)
ax.set_ylabel(r'Category', fontsize=12)

plt.show()

In [None]:
# Birth year against sub category, split by gender.
# The masks use the integer 'Gender int' column (0 = male, 1 = female). The raw
# 'Gender' column holds the strings 'Man'/'Woman', so comparing it with 0 or 1
# selected no rows and produced an empty plot.
is_male = orders_small['Gender int'] == 0
is_female = orders_small['Gender int'] == 1

x = orders_small.loc[is_male, 'Birthyear']
y = orders_small.loc[is_male, 'Sub category']

x2 = orders_small.loc[is_female, 'Birthyear']
y2 = orders_small.loc[is_female, 'Sub category']

scale = 20

fig, ax = plt.subplots()
ax.scatter(x, y, c='blue', s=scale, alpha=0.5, edgecolors='none')
ax.scatter(x2, y2, c='pink', s=scale, alpha=0.3, edgecolors='none')
ax.grid(True)
fig.set_size_inches(10, 10)

# Set labels (the title used to say 'product number' although sub categories are plotted)
ax.set_title('Birthyear and gender v sub category')
ax.set_xlabel(r'Birthyear', fontsize=12)
ax.set_ylabel(r'Sub category', fontsize=12)

plt.show()

## Heatmap

In [None]:
# Correlation heatmap of the numeric columns.
# The frame also contains string and datetime columns; recent pandas versions
# raise when corr() meets those, so only numeric columns are passed on.
plt.figure(figsize=(20, 20))
cor = orders_small.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

# Predictive analysis

## Preparation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import numpy as np
from scipy.stats import sem

# Feature matrices and target vectors for the models.
# 'Gender int' (0 = male, 1 = female, -1 = placeholder) is used instead of the
# raw 'Gender' column: 'Gender' holds strings such as 'Man'/'Woman', which the
# scikit-learn estimators below cannot be fitted on.
orders_x2 = orders_small[['Birthyear', 'Gender int']]
orders_x = orders_small[['Birthyear', 'Gender int', 'Month']]
orders_y2 = orders_small['Sub cat. number']
orders_y = orders_small['Cat. number']
orders_y3 = orders_small['Count']

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every accuracy reported below, reproducible across kernel
# restarts (0 matches the seed the estimators in this notebook use).
x_train, x_test, y_train, y_test = train_test_split(
    orders_x, orders_y, test_size=0.2, random_state=0
)

In [None]:
# NumPy copies of the training features and labels used by every model below.
# NOTE: despite the "_n" suffix these arrays are NOT normalised. This cell used
# to call preprocessing.normalize(...) on both arrays but discarded the return
# values, so nothing was ever changed. Those dead calls are dropped because:
#   * normalising class labels would destroy them, and
#   * scaling only the training features while the models are scored on the
#     unscaled x_test would make the reported accuracies meaningless.
# If feature scaling is wanted, fit a scaler on x_train and apply the same
# transform to x_test before scoring.
x_train_n = np.array(x_train)
y_train_n = np.array(y_train)

In [None]:
# imports
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn import neighbors as n
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

## LinearSVC

In [None]:
# Linear support-vector classifier with balanced class weights, fitted on the training arrays
svc = LinearSVC(class_weight='balanced', max_iter=2000, random_state=0).fit(x_train_n, y_train_n)

# Predict the held-out rows and report the mean accuracy on them
y_pred = svc.predict(x_test)
print("Accuracy: ", svc.score(x_test, y_test))

In [None]:
# cross validation

def evaluate_cross_val(clf, x, y, k):
    """Cross-validate ``clf`` on ``x``/``y`` with ``k`` shuffled folds and print the scores.

    The folds come from ``KFold(k, shuffle=True, random_state=0)``. The array of
    per-fold scores is printed first, followed by their mean and standard error.
    Nothing is returned.
    """
    fold_scores = cross_val_score(clf, x, y, cv=KFold(k, shuffle=True, random_state=0))
    print(fold_scores)
    summary = "Mean score: {0:.3f} (+/-{1:.3f})"
    print(summary.format(np.mean(fold_scores), sem(fold_scores)))

evaluate_cross_val(svc, x_train_n, y_train_n, 5)

## Nearest Neighbors

In [None]:
# k-nearest-neighbours classifier with uniform weights, fitted on the training arrays
n_neighbors = 200 # value of 200 produces best results
clf = n.KNeighborsClassifier(n_neighbors, weights='uniform').fit(x_train_n, y_train_n)

# Predict the held-out rows and report the mean accuracy on them
y_pred = clf.predict(x_test)
print("Accuracy: ", clf.score(x_test, y_test))

In [None]:
# cross validation
# Reuses evaluate_cross_val from the LinearSVC section; redefining the identical
# function in every model section is unnecessary and invites the copies drifting apart.
evaluate_cross_val(clf, x_train_n, y_train_n, 5)

## Decision Trees

In [None]:
# Decision tree that only splits nodes holding at least 250 samples, fitted on the training arrays
tr = DecisionTreeClassifier(min_samples_split=250, random_state=0).fit(x_train_n, y_train_n)

# Predict the held-out rows and report the mean accuracy on them
y_pred = tr.predict(x_test)
print("Accuracy: ", tr.score(x_test, y_test))

In [None]:
# cross validation
# Reuses evaluate_cross_val from the LinearSVC section; redefining the identical
# function in every model section is unnecessary and invites the copies drifting apart.
evaluate_cross_val(tr, x_train_n, y_train_n, 5)

## Random Forest

In [None]:
# Random forest of 100 depth-1 trees, fitted on the training arrays
rf = RandomForestClassifier(max_depth=1, n_estimators=100, random_state=0).fit(x_train_n, y_train_n)

# Predict the held-out rows and report the mean accuracy on them
y_pred = rf.predict(x_test)
print("Accuracy: ", rf.score(x_test, y_test))

In [None]:
# cross validation
# Reuses evaluate_cross_val from the LinearSVC section; redefining the identical
# function in every model section is unnecessary and invites the copies drifting apart.
evaluate_cross_val(rf, x_train_n, y_train_n, 5)