## Data Exploration and Basic Function Definition

In [31]:
import pandas as pd
from collections import Counter
import itertools

### dim_fashion_matchsets:
- coll_id: bundle ID
- item_list: multiple items in each bundle
    - separated by ";" : items
    - separated by "," : substitute items in a particular category

In [32]:
# load data
# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain string literal is an invalid escape sequence and triggers a warning on
# recent Python versions. The value passed to pandas is unchanged.
dim_fashion_matchsets = pd.read_table('dim_fashion_matchsets(new).txt', sep=r'\s+', names=['coll_id', 'item_list'])
dim_fashion_matchsets.head()

Unnamed: 0,coll_id,item_list
0,1,160870;3118604
1,2,1842610;2741506
2,3,"893028;993019,1375599,1913565,3036503;2849440;..."
3,4,2612866;1272124;2181942
4,5,3128145;2683359;855149


In [33]:
# list the distinct bundle IDs present in the match-set table
dim_fashion_matchsets['coll_id'].unique()

array([    1,     2,     3, ..., 23103, 23104, 23105])

### dim_items:
- item_id: individual product item ID
- cat_id: the category that product item belongs to
- terms: comma-separated term IDs (probably the words of the item's title/description)
- img_data: item image (image filename = item_id.jpg)

In [34]:
# load the item table
# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain string literal is an invalid escape sequence and triggers a warning on
# recent Python versions. The value passed to pandas is unchanged.
dim_items = pd.read_table('dim_items(new).txt', sep=r'\s+', names=['item_id', 'cat_id', 'terms', 'img_data'])
dim_items.head()

Unnamed: 0,item_id,cat_id,terms,img_data
0,29,155,"123950,53517,106068,59598,7503,171811,25618,14...",
1,49,228,"73035,33202,116593,48909,92233,181255,127004,3...",
2,59,284,"123950,38910,22837,5026,15459,47776,158346,101...",
3,109,461,"122071,35420,123950,27207,116593,24893,31897,1...",
4,119,368,"48909,125706,116593,179606,20819,158346,157222...",


In [35]:
# no img_data in this table: describe() reports a count of 0 non-missing values
dim_items['img_data'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: img_data, dtype: float64

In [36]:
# load the purchase history
# The separator is a regex, so it is written as a raw string: '\s' inside a
# plain string literal is an invalid escape sequence and triggers a warning on
# recent Python versions. The value passed to pandas is unchanged.
user_bought_history = pd.read_table('user_bought_history.txt', sep=r'\s+', names=['user_id', 'item_id', 'create_at'])
user_bought_history.head()

Unnamed: 0,user_id,item_id,create_at
0,1915871,8,20150417
1,4371603,8,20150418
2,8034236,8,20150516
3,6135829,8,20150405
4,11650079,8,20150404


In [37]:
# load the test item IDs; the file is read with pandas' default separator and
# its single column is named 'test_items_id'
test_items = pd.read_table('test_items(new).txt', names = ['test_items_id'])
test_items.head()

Unnamed: 0,test_items_id
0,1417
1,2227
2,3967
3,7237
4,8467


## Define functions

In [38]:
class item(object):
    """Container for everything collected about one product item.

    Only ``id`` is filled in at construction time. Every other attribute
    starts as its own empty list and is expected to be populated later by
    code outside this class.

    NOTE(review): the attribute meanings are inferred from their names only
    (matched items, replacement items, title terms, category, buyer objects,
    purchase dates, image data and three counters) - confirm against the
    code that fills them.
    """

    def __init__(self,ID):
        self.id = ID
        # List-valued attributes, in the order they are created on the instance.
        empty_list_fields = (
            'match',
            'replacement',
            'title',
            'category',
            'buyer',  # obj
            'buy_date',
            'img_data',
            'match_counter',
            'replace_counter',
            'also_buy_counter',
        )
        for field_name in empty_list_fields:
            # A new list per attribute, so no two attributes share storage.
            setattr(self, field_name, [])

In [39]:
class buyer(object):
    """A customer, identified by user ID.

    The constructor accepts ``user_bought_history`` and ``items`` but does not
    use them: ``self.items`` always starts as an empty list. Call
    ``get_buy_items`` to obtain the purchased item objects.
    """

    def __init__(self,user_id,user_bought_history,items):
        self.id = user_id
        self.items = []  # not populated here

    def get_buy_items(self,user_bought_history,items):
        """Return one item object per purchase whose item id is present in ``items``.

        Purchased ids that do not correspond to any object in ``items`` are
        skipped. The result follows the order (and repetitions) of the ids
        returned by ``get_item_id_from_user_history``.

        NOTE(review): ``get_item_id_from_user_history`` and ``get_item`` are
        defined outside this block; presumably the first returns the item ids
        this user bought and the second looks an item object up by id -
        confirm against their definitions.

        Parameters
        ----------
        user_bought_history : passed through to ``get_item_id_from_user_history``.
        items : re-iterable collection of objects exposing an ``id`` attribute
            (ids must be hashable).
        """
        bought_ids = get_item_id_from_user_history(user_bought_history,self.id)
        # Collect the known ids once. Rebuilding this collection inside the
        # filter for every purchased id made the lookup quadratic.
        known_ids = {candidate.id for candidate in items}
        return [get_item(items,i) for i in bought_ids if i in known_ids]

### Function "get_matchset":  return the match set of coll_id

In [40]:
# Given the product bundle ID, get the matched item list
def get_matchset(dim_fashion_matchsets,coll_id): # coll_id: bundle ID
    """Return the ';'-separated parts of the ``item_list`` of bundle ``coll_id``.

    The first row whose ``coll_id`` matches is used. Each returned string is
    one part of the bundle and may still contain ','-joined alternatives.
    Raises ``IndexError`` when no row matches.
    """
    is_bundle = dim_fashion_matchsets.coll_id == coll_id
    matching_lists = dim_fashion_matchsets.item_list[is_bundle].values
    first_item_list = matching_lists[0]
    return first_item_list.split(';')

In [72]:
# Check output when coll_id=11: one string per ';'-separated part, alternatives still comma-joined
get_matchset(dim_fashion_matchsets,11)

['1463018,230955', '1596334,1704853', '2226122,284814,36278,480281']

### Function "get_replace_matchset":  return the match set of coll_id, with each entry split into its replacement items

In [75]:
# Given the product bundle ID, get the matched item list with the replacement items split out
def get_replace_matchset(dim_fashion_matchsets,coll_id):
    """Return the match set of ``coll_id`` as a list of lists.

    Each string returned by ``get_matchset`` is split on ',' so that every
    inner list holds the interchangeable item IDs of one part of the bundle.
    """
    alternatives_per_slot = []
    for slot in get_matchset(dim_fashion_matchsets,coll_id):
        alternatives_per_slot.append(slot.split(','))
    return alternatives_per_slot

In [76]:
# Check output when coll_id=11: each part is now a list of alternative item IDs
get_replace_matchset(dim_fashion_matchsets,11)

[['1463018', '230955'],
 ['1596334', '1704853'],
 ['2226122', '284814', '36278', '480281']]

### Function "get_match_list": return all the matched combinations of coll_id

In [78]:
def get_match_list(dim_fashion_matchsets,coll_id):
    """Return every concrete combination of items for bundle ``coll_id``.

    Takes the Cartesian product of the per-part alternatives returned by
    ``get_replace_matchset``. The result is a list of tuples holding one
    item ID per part, in the order produced by ``itertools.product``.
    """
    alternatives_per_slot = get_replace_matchset(dim_fashion_matchsets,coll_id)
    return list(itertools.product(*alternatives_per_slot))

In [79]:
# Check output when coll_id=11: 2 * 2 * 4 = 16 combinations, one item per part
get_match_list(dim_fashion_matchsets,11)

[('1463018', '1596334', '2226122'),
 ('1463018', '1596334', '284814'),
 ('1463018', '1596334', '36278'),
 ('1463018', '1596334', '480281'),
 ('1463018', '1704853', '2226122'),
 ('1463018', '1704853', '284814'),
 ('1463018', '1704853', '36278'),
 ('1463018', '1704853', '480281'),
 ('230955', '1596334', '2226122'),
 ('230955', '1596334', '284814'),
 ('230955', '1596334', '36278'),
 ('230955', '1596334', '480281'),
 ('230955', '1704853', '2226122'),
 ('230955', '1704853', '284814'),
 ('230955', '1704853', '36278'),
 ('230955', '1704853', '480281')]

### Function get_category(dim_items,item_id): return the category ID (cat_id) given an item_id

In [81]:
def get_category(dim_items,item_id):
    """Return the ``cat_id`` of the first ``dim_items`` row whose ``item_id`` matches.

    Raises ``IndexError`` when no row matches.
    """
    is_item = dim_items.item_id == item_id
    matching_categories = dim_items.cat_id[is_item].values
    return matching_categories[0]

In [82]:
# Check item_id = 33547 (its category comes back as 368)
get_category(dim_items,33547)

368

### Function get_term_title(dim_items,item_id): return the title/description terms of this item

In [83]:
def get_term_title(dim_items,item_id):
    """Return the ``terms`` of ``item_id`` as a list of strings.

    The ``terms`` value of the first matching row is split on ','.
    Raises ``IndexError`` when no row matches.
    """
    is_item = dim_items.item_id == item_id
    raw_terms = dim_items.terms[is_item].values[0]
    return raw_terms.split(',')

In [84]:
# Check item_id = 33547: the terms come back as a list of term-ID strings
get_term_title(dim_items,33547)

['162272',
 '123950',
 '182506',
 '116593',
 '48061',
 '52775',
 '216844',
 '56917',
 '32290',
 '31897',
 '71740',
 '24893',
 '146294',
 '205682',
 '131134',
 '130974',
 '20819']

### Function get_term_img_data(dim_items,item_id): return image data

In [85]:
def get_term_img_data(dim_items,item_id):
    """Return the ``img_data`` values of every row whose ``item_id`` matches.

    Unlike the single-value lookups in this notebook, this returns the whole
    ``.values`` array of the matching rows (empty when nothing matches), not
    just the first element.
    """
    is_item = dim_items.item_id == item_id
    matching_images = dim_items.img_data[is_item]
    return matching_images.values

In [86]:
# Check item_id = 33547: img_data is empty in this table, so the result is array([nan])
get_term_img_data(dim_items,33547)

array([nan])

### Function get_user_id(user_bought_history,item_id): return the buyers (user IDs) of this item

In [87]:
def get_user_id(user_bought_history,item_id):
    """Return the ``user_id`` of every purchase row for ``item_id`` as a list.

    One entry per matching row, in table order; a user who bought the item
    several times appears several times. Returns ``[]`` when nothing matches.
    """
    bought_item = user_bought_history.item_id == item_id
    buyer_ids = user_bought_history.user_id[bought_item].values
    return [uid for uid in buyer_ids]

In [91]:
# Check item_id = 33547 (first 10 buyers only; the same user can appear more than once)
get_user_id(user_bought_history,33547)[:10]

[7581276,
 7044613,
 924592,
 9061047,
 12443054,
 5427062,
 5427062,
 9368797,
 7593526,
 2153670]

### Function get_buy_date(user_bought_history,item_id): return the purchase dates of this item

In [92]:
def get_buy_date(user_bought_history,item_id):
    """Return the ``create_at`` value of every purchase row for ``item_id`` as a list.

    One entry per matching row, in table order. Returns ``[]`` when nothing
    matches.
    """
    bought_item = user_bought_history.item_id == item_id
    purchase_dates = user_bought_history.create_at[bought_item].values
    return [stamp for stamp in purchase_dates]

In [94]:
# Check item_id = 33547 (first 10 purchase dates, stored as integers such as 20150531)
get_buy_date(user_bought_history,33547)[:10]

[20150531,
 20150525,
 20150506,
 20150527,
 20150528,
 20150523,
 20150526,
 20150609,
 20150428,
 20150510]

In [97]:
# further split the buying date into year, month and day
def get_detail_buy_date(buy_date_list):
    """Split purchase dates into parallel lists of year, month and day strings.

    Each date is converted with ``str()`` and sliced by position: characters
    0-3 are the year, 4-5 the month and everything from position 6 on the day
    (e.g. ``20150531`` -> ``'2015'``, ``'05'``, ``'31'``).

    Parameters
    ----------
    buy_date_list : iterable of dates. The values are iterated directly
        rather than indexed by position, so lists, arrays, iterators and
        pandas Series with a non-default index are all handled.

    Returns
    -------
    tuple ``(year, month, day)`` of three lists of strings, each with one
    entry per input date, in input order.
    """
    year = []
    month = []
    day = []
    for raw_date in buy_date_list:
        date = str(raw_date)
        year.append(date[:4])
        month.append(date[4:6])
        day.append(date[6:])
    return year , month , day

In [99]:
# Check item_id = 33547: split its first 10 purchase dates into (years, months, days)
get_detail_buy_date(get_buy_date(user_bought_history,33547)[:10])

(['2015',
  '2015',
  '2015',
  '2015',
  '2015',
  '2015',
  '2015',
  '2015',
  '2015',
  '2015'],
 ['05', '05', '05', '05', '05', '05', '05', '06', '04', '05'],
 ['31', '25', '06', '27', '28', '23', '26', '09', '28', '10'])