# Import libraries

In [1]:
import numpy as np
import pandas as pd

import os
print(os.listdir("../input"))

['yoochoose-data']


# Load training datasets

In [2]:
date_format = '%Y-%m-%dT%H:%M:%S.%fZ'

# TODO: read timestamp as pandas.Timestamp object

# Load Datasets
clicks_df=pd.read_csv('../input/yoochoose-data/yoochoose-clicks.dat',
                      names=['session_id','timestamp','item_id','category'],
                      dtype={'category': str})
clicks_df['timestamp'] = pd.to_datetime(clicks_df['timestamp'])

display("Clicks Data",)
display(clicks_df.head())

buys_df = pd.read_csv('../input/yoochoose-data/yoochoose-buys.dat', 
                      names=['session_id', 'timestamp', 'item_id', 'price', 'quantity'],
                      )
buys_df['timestamp'] = pd.to_datetime(buys_df['timestamp'])

display("Buys Data",)
display(buys_df.head())

buys_df.info()

'Clicks Data'

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07 10:51:09.277,214536502,0
1,1,2014-04-07 10:54:09.868,214536500,0
2,1,2014-04-07 10:54:46.998,214536506,0
3,1,2014-04-07 10:57:00.306,214577561,0
4,2,2014-04-07 13:56:37.614,214662742,0


'Buys Data'

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06 18:44:58.314,214537888,12462,1
1,420374,2014-04-06 18:44:58.325,214537850,10471,1
2,281626,2014-04-06 09:40:13.032,214535653,1883,1
3,420368,2014-04-04 06:13:28.848,214530572,6073,1
4,420368,2014-04-04 06:13:28.858,214835025,2617,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150753 entries, 0 to 1150752
Data columns (total 5 columns):
session_id    1150753 non-null int64
timestamp     1150753 non-null datetime64[ns]
item_id       1150753 non-null int64
price         1150753 non-null int64
quantity      1150753 non-null int64
dtypes: datetime64[ns](1), int64(4)
memory usage: 43.9 MB


# Understanding category column

In [3]:
# Category – the context of the click. 
# The value "S" indicates a special offer,
# "0" indicates  a missing value, 
# a number between 1 to 12 indicates a real category identifier,
# any other number indicates a brand.
#   - if an item has been clicked in the context of a special offer then the value will be "S", 
#   - if the context was a brand 
#              eg. BOSCH, then the value will be an 8-10 digits number.
#   - If the item has been clicked under regular category, 
#              eg. sport, then the value will be a number between 1 to 12. 

## Inpect missing values

In [4]:
# Clicks data has some missing values for category
# clicks_df[clicks_df.category==0].head()
print("Number of items with missing category info:", 
      len(np.unique(clicks_df[clicks_df.category=="0"].item_id)))

# Buys data has some missing values for price AND quantity
# buys_df[buys_df.price==0].head()
print("Number of missing price and qty entries in buys data:", 
      len(buys_df[buys_df.price==0]), len(buys_df[buys_df.quantity==0]))


Number of items with missing category info: 44042
Number of missing price and qty entries in buys data: 610030 610030


# Some important numbers

In [5]:
print("The training dataset has", len(clicks_df), "clicks", 
      "from", len(np.unique(clicks_df.session_id)), "sessions")
print("There are ",len(buys_df), "purchases", 
      "from", len(np.unique(buys_df.session_id)), "sessions\n",
      "involving", len(np.unique(buys_df.item_id)), "unique items",
      "out of", len(np.unique(clicks_df.item_id)), "items in the whole training set.")

print("\nThis means that the sessions with/without purchases are highly imbalanced.")
print("Number of sessions with purchases", len(np.unique(buys_df.session_id)))
print("Number of sessions without purchases", len(np.unique(clicks_df.session_id)) - len(np.unique(buys_df.session_id)))
print("Difference:", len(np.unique(clicks_df.session_id)) - 2 * len(np.unique(buys_df.session_id)))

The training dataset has 33003944 clicks from 9249729 sessions
There are  1150753 purchases from 509696 sessions
 involving 19949 unique items out of 52739 items in the whole training set.

This means that the sessions with/without purchases are highly imbalanced.
Number of sessions with purchases 509696
Number of sessions without purchases 8740033
Difference: 8230337


# EDA

# Feature engineering

We need to extract features for 2 classifiers:
- Buy or not buy ? (Binary)
- What to buy ? (Multilabel)

# Random Forest Model

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# TODO: Handle class imbalance by downsampling non buy data from clicks_df
# How to do downsampling:-
# Take all entries from buys data
# Take same number of samples from clicks data but make sure that they belong to non buy category


# FIXME: This is a JUGAAD to prevent memory error
clicks_df = clicks_df.head(1000)
buys_df = buys_df.head(1000)

In [8]:
# temporary
clicks_df['buy'] = 0
buys_df['buy'] = 1

union_df = pd.concat([clicks_df, buys_df], ignore_index=True, sort=True).sort_values(by=['session_id','item_id'])
union_df['category'] = union_df['category'].fillna(method='ffill')
union_df.head()
union_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 1 to 1933
Data columns (total 7 columns):
buy           2000 non-null int64
category      2000 non-null object
item_id       2000 non-null int64
price         1000 non-null float64
quantity      1000 non-null float64
session_id    2000 non-null int64
timestamp     2000 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 125.0+ KB


In [9]:
from sklearn.model_selection import train_test_split

# TODO: one-hot encode categorical features
# for col in union_df.dtypes[union_df.dtypes == 'object'].index:
#     for_dummy = union_df.pop(col)
#     union_df = pd.concat([union_df, pd.get_dummies(for_dummy, prefix=col)], axis=1)

# FIXME: using sparse=TRUE prevents MEMORY ERROR. find RCA???
one_hot = pd.get_dummies(union_df['category'], sparse=True)
union_df = union_df.drop('category', axis=1)
union_df = union_df.join(one_hot)

one_hot = pd.get_dummies(union_df['timestamp'], sparse=True)
union_df = union_df.drop('timestamp', axis=1)
union_df = union_df.join(one_hot)

In [10]:
union_df = union_df.drop(['price', 'quantity'], axis=1)

X = union_df.drop(['buy'], axis=1)
y = union_df[['buy']]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((1500, 1997), (500, 1997), (1500, 1), (500, 1))

In [11]:
union_df.describe()

Unnamed: 0,buy,item_id,session_id,0,2014-04-01 04:59:39.603000,2014-04-01 05:26:12.539000,2014-04-01 05:26:32.156000,2014-04-01 06:44:20.441000,2014-04-01 07:05:17.304000,2014-04-01 07:07:10.597000,2014-04-01 07:07:10.891000,2014-04-01 07:07:46.076000,2014-04-01 07:09:03.999000,2014-04-01 07:09:04.009000,2014-04-01 07:09:04.037000,2014-04-01 07:10:27.592000,2014-04-01 07:11:25.761000,2014-04-01 07:12:42.091000,2014-04-01 07:17:35.492000,2014-04-01 07:17:46.212000,2014-04-01 07:33:07.173000,2014-04-01 07:40:28.688000,2014-04-01 07:41:46.683000,2014-04-01 07:41:54.239000,2014-04-01 07:42:09.535000,2014-04-01 07:42:16.709000,2014-04-01 07:43:16.726000,2014-04-01 07:59:37.475000,2014-04-01 08:11:18.595000,2014-04-01 08:21:28.613000,2014-04-01 08:21:49.381000,2014-04-01 08:31:39.655000,2014-04-01 08:35:03.227000,2014-04-01 08:35:45.863000,2014-04-01 08:36:04.146000,2014-04-01 08:36:06.076000,2014-04-01 08:36:34.683000,2014-04-01 08:37:18.399000,2014-04-01 08:37:31.053000,2014-04-01 08:38:17.266000,...,2014-04-07 17:53:29.459000,2014-04-07 17:58:58.549000,2014-04-07 18:04:21.435000,2014-04-07 18:12:30.007000,2014-04-07 18:12:35.794000,2014-04-07 18:13:27.129000,2014-04-07 18:19:27.211000,2014-04-07 18:29:25.611000,2014-04-07 18:30:02.454000,2014-04-07 18:31:04.993000,2014-04-07 18:31:45.755000,2014-04-07 18:33:23.598000,2014-04-07 18:37:27.522000,2014-04-07 18:46:31.323000,2014-04-07 18:52:22.928000,2014-04-07 18:56:00.868000,2014-04-07 18:56:19.738000,2014-04-07 18:56:42.948000,2014-04-07 18:56:54.404000,2014-04-07 18:57:39.065000,2014-04-07 18:59:58.414000,2014-04-07 19:01:58.299000,2014-04-07 19:17:17.815000,2014-04-07 19:17:17.816000,2014-04-07 19:17:17.824000,2014-04-07 19:19:29.810000,2014-04-07 19:22:14.701000,2014-04-07 19:44:27.340000,2014-04-07 19:44:27.396000,2014-04-07 19:59:03.434000,2014-04-07 19:59:03.500000,2014-04-07 20:06:14.524000,2014-04-07 20:06:14.580000,2014-04-07 20:06:21.951000,2014-04-07 20:12:31.438000,2014-04-07 20:17:39.691000,2014-04-07 20:17:39.701000,2014-04-07 20:17:39.710000,2014-04-07 20:17:39.720000,2014-04-07 20:33:54.828000
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.5,214735400.0,125812.3945,1.0,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,...,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.001,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005,0.0005
std,0.500125,105015.0,172233.328782,0.0,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,...,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.031615,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361,0.022361
min,0.0,214508300.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,214678800.0,147.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5,214753500.0,327.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,214826700.0,280661.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,214844400.0,490971.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train.values.ravel())

y_pred = rf.predict(x_test)




In [13]:
from sklearn.metrics import roc_curve, auc, accuracy_score
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.9771863117870723

In [14]:
accuracy_score(y_pred,y_test)

0.976