# AI Recomender Project

In [21]:
import numpy as np
import pandas as pd
import sklearn
import gzip
import json
from tqdm import tqdm
import os
from collections import Counter
from datetime import datetime
import math
import scipy.sparse as sparse
from scipy.sparse import csr_matrix, csc_matrix
from scipy.sparse import lil_matrix
import copy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 

tqdm.pandas() #for progres_apply etc.

In [2]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
  #read gzipped content
  f=gzip.open(filename_gzipped_python_json,'r')
  
  #parse json
  parse_data = []
  for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
    line = line.decode('utf-8')
    line = line.replace('true','True') #difference json/python
    line = line.replace('false','False')
    parsed_result = eval(line) #load python nested datastructure
    parse_data.append(parsed_result)
    if read_max !=-1 and len(parse_data) > read_max:
      print(f'Break reading after {read_max} records')
      break
  print(f"Reading {len(parse_data)} rows.")

  #create dataframe
  df= pd.DataFrame.from_dict(parse_data)
  return df


# 1. Load Goodreads data

In [3]:
goodreads_path = './'
books = 'goodreads_books_comics_graphic.json.gz'
interactions = 'goodreads_interactions_comics_graphic.json.gz'
reviews = 'goodreads_reviews_comics_graphic.json.gz'

# 2. Clean data
Example of:
- Merging two files
- tqdm pd.progress_apply
- Example of non-destructive transforms, i.e. keep original data and make re-running cell works
- Parsing dates

In [4]:
#books
books_df = pd.read_json(goodreads_path + books, lines=True)
books_df = books_df[['book_id',	'title','authors',	'publisher',	'num_pages',	'publication_year']]
display(books_df.head(5))

Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0
2,13571772,Captain America: Winter Soldier (The Ultimate ...,"[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,"[{'author_id': '16209952', 'role': ''}, {'auth...",,,
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_...",DC Comics,272.0,1997.0


In [5]:
#get author names (authors metadata is an additional dowload from goodreads)
authors = 'goodreads_book_authors.json.gz'
authors_df =  pd.read_json(goodreads_path + authors, lines=True) #829.529 authors (also non-graphic and comics)
display(authors_df.head(5))

Unnamed: 0,average_rating,author_id,text_reviews_count,name,ratings_count
0,3.98,604031,7,Ronald J. Fields,49
1,4.08,626222,28716,Anita Diamant,546796
2,3.92,10333,5075,Barbara Hambly,122118
3,3.68,9212,36262,Jennifer Weiner,888522
4,3.82,149918,96,Nigel Pennick,1740


In [6]:
#merge, but inline for each row, since each book has many authors
author_id_to_name = {}
for idx, row in tqdm(authors_df.iterrows(), total=authors_df.shape[0]):
  author_id_to_name[row['author_id']] = row['name']
display(books_df.head(5))
#important: type of author is np.int64
books_df['author_name'] = books_df['authors'].apply(lambda authors_dct_lst: author_id_to_name.get(np.int64(authors_dct_lst[0]['author_id'])))
display(books_df.head(5))

100%|██████████| 829529/829529 [00:43<00:00, 19290.68it/s]


Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0
2,13571772,Captain America: Winter Soldier (The Ultimate ...,"[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,"[{'author_id': '16209952', 'role': ''}, {'auth...",,,
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_...",DC Comics,272.0,1997.0


Unnamed: 0,book_id,title,authors,publisher,num_pages,publication_year,author_name
0,25742454,The Switchblade Mamma,"[{'author_id': '8551671', 'role': ''}]",,,,Lindsey Schussman
1,30128855,Cruelle,"[{'author_id': '3274315', 'role': ''}]",Dargaud,,2016.0,Florence Dupre la Tour
2,13571772,Captain America: Winter Soldier (The Ultimate ...,"[{'author_id': '37450', 'role': ''}]",Hachette Partworks Ltd.,146.0,2012.0,Ed Brubaker
3,35452242,Bounty Hunter 4/3: My Life in Combat from Mari...,"[{'author_id': '16209952', 'role': ''}, {'auth...",,,,Jason Delgado
4,707611,"Superman Archives, Vol. 2","[{'author_id': '81563', 'role': ''}, {'author_...",DC Comics,272.0,1997.0,Jerry Siegel


In [7]:
#interactions
#wc -l interactions is 7.347.630 
#sample first 500.000 interactions
interactions_df = parse_json(goodreads_path + interactions)# , read_max=500000) #Note: RAM issue if loading with pd.read_json, no issue with parse_json 

7347630it [05:12, 23492.69it/s]


Reading 7347630 rows.


In [8]:
#1) parse date 
interactions_df_new = interactions_df[['user_id', 'book_id', 'rating', 'date_updated']]
format_str = '%a %b %d %H:%M:%S %z %Y' #see https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
#test: datetime_object = datetime.strptime('Fri Jun 21 10:25:05 -0700 2013', format_str) 
interactions_df_new['date_updated'] = interactions_df_new['date_updated'].progress_apply(lambda s: np.datetime64(datetime.strptime(s,format_str)))

#2) sort on user_id, then date
interactions_df_new = interactions_df_new.sort_values(by=['user_id', 'date_updated'], ascending=[True,True])
display(interactions_df_new)

  interactions_df_new['date_updated'] = interactions_df_new['date_updated'].progress_apply(lambda s: np.datetime64(datetime.strptime(s,format_str)))
100%|██████████| 7347630/7347630 [03:25<00:00, 35730.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df_new['date_updated'] = interactions_df_new['date_updated'].progress_apply(lambda s: np.datetime64(datetime.strptime(s,format_str)))


Unnamed: 0,user_id,book_id,rating,date_updated
1651325,00004584d524ec468619e81b176cc991,271199,4,2013-06-21 17:23:44
1651324,00004584d524ec468619e81b176cc991,287380,4,2013-06-21 17:24:05
1651322,00004584d524ec468619e81b176cc991,287381,4,2013-06-21 17:24:31
1651316,00004584d524ec468619e81b176cc991,287382,4,2013-06-21 17:25:05
1651314,00004584d524ec468619e81b176cc991,287388,3,2013-06-21 17:25:13
...,...,...,...,...
3225969,fffff8a718843c0e11dfd93fb41c1297,6606855,3,2017-03-01 01:37:24
3225968,fffff8a718843c0e11dfd93fb41c1297,29890569,3,2017-03-16 14:03:44
3225967,fffff8a718843c0e11dfd93fb41c1297,17256441,0,2017-07-07 02:50:46
3225973,fffff8a718843c0e11dfd93fb41c1297,29214708,4,2017-07-08 19:01:43


# 3. Preprocessing

In [11]:
def preprocess_classic(df, minsup=5):
    """
    Goal: - Remove reconsumption items
          - Remove items that have less than minsup interactions 
          - Remove users that have less than minsup interactions 
               
    :input df: Dataframe containing user_id, item_id and time
    """
    before = df.shape[0]
    #drop reconsumption items
    df = df.drop_duplicates(subset=["user_id","item_id"])
    print("After drop_duplicates (reconsumption items): {} -> {}".format(before,df.shape[0]))
    #compute item/user counts
    g1 = df.groupby('item_id', as_index=False)['user_id'].size()
    g1 = g1.rename({'size': 'count_item'}, axis='columns')
    g2 = df.groupby('user_id', as_index=False)['item_id'].size()
    g2 = g2.rename({'size': 'count_user'}, axis='columns')
    df = pd.merge(df, g1, how='left', on=['item_id'])
    df = pd.merge(df, g2, how='left', on=['user_id'])
    display(df.head(5))
    #drop items occurring less than minsup times
    before = df.shape[0]
    df = df[df['count_item'] >= 35]
    print("After dropping items with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
    before = df.shape[0]
    #drop users with less then minsup items in history
    df = df[df['count_user'] >= minsup]
    df = df[['user_id','item_id','datetime']]
    print("After dropping users with less than {} interactions: {} -> {}".format(minsup, before,df.shape[0]))
    return df

# just rating >= 3
#print(f"number of unique users: {interactions_df_new['user_id'].nunique()}")
#print(f"number of unique items: {interactions_df_new['book_id'].nunique()}")
#interactions_df_new = interactions_df_new[interactions_df_new["rating"] >= 3]
#print(f"number of unique users after rating < 3 removal: {interactions_df_new['user_id'].nunique()}")
#print(f"number of unique items after rating < 3 removal: {interactions_df_new['book_id'].nunique()}")

#print number of users and items
interactions_df_processed = interactions_df_new[['user_id','book_id','date_updated']]
interactions_df_processed = interactions_df_processed.rename(columns={"user_id": "user_id", "book_id": "item_id", "date_updated": "datetime"})
print(f"df size {interactions_df_processed.shape[0]}")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")
interactions_df_processed = preprocess_classic(interactions_df_processed)
interactions_df_processed.reset_index(drop=True, inplace=True)
display(interactions_df_processed.head(5))
print(f"df size {interactions_df_processed.shape[0]}")
print(f"number of unique users: {interactions_df_processed['user_id'].nunique()}")
print(f"number of unique items: {interactions_df_processed['item_id'].nunique()}")
interactions_df_processed.sort_values(by=['datetime'], inplace=True)
print("Sorting by date")
interactions_df_processed.reset_index(inplace=True, drop=True)
display(interactions_df_processed.head(5))
column = interactions_df_processed["datetime"]
print(f"Max date is {column.max()}, min is {column.min()} ")

df size 7347630
number of unique users: 342415
number of unique items: 89411
After drop_duplicates (reconsumption items): 7347630 -> 7347630


Unnamed: 0,user_id,item_id,datetime,count_item,count_user
0,00004584d524ec468619e81b176cc991,271199,2013-06-21 17:23:44,10102,24
1,00004584d524ec468619e81b176cc991,287380,2013-06-21 17:24:05,1628,24
2,00004584d524ec468619e81b176cc991,287381,2013-06-21 17:24:31,198,24
3,00004584d524ec468619e81b176cc991,287382,2013-06-21 17:25:05,247,24
4,00004584d524ec468619e81b176cc991,287388,2013-06-21 17:25:13,249,24


After dropping items with less than 5 interactions: 7347630 -> 6686728
After dropping users with less than 5 interactions: 6686728 -> 6355864


Unnamed: 0,user_id,item_id,datetime
0,00004584d524ec468619e81b176cc991,271199,2013-06-21 17:23:44
1,00004584d524ec468619e81b176cc991,287380,2013-06-21 17:24:05
2,00004584d524ec468619e81b176cc991,287381,2013-06-21 17:24:31
3,00004584d524ec468619e81b176cc991,287382,2013-06-21 17:25:05
4,00004584d524ec468619e81b176cc991,287388,2013-06-21 17:25:13


df size 6355864
number of unique users: 148304
number of unique items: 26431
Sorting by date


Unnamed: 0,user_id,item_id,datetime
0,a309c35c5c32f1edbdc5e6770848394a,15067,2006-12-26 15:25:43
1,12c1ea7e1c88a03d24f164fc576ef42c,53178,2007-02-02 07:14:38
2,12c1ea7e1c88a03d24f164fc576ef42c,53179,2007-02-02 07:14:53
3,12c1ea7e1c88a03d24f164fc576ef42c,39916,2007-02-02 07:17:54
4,12c1ea7e1c88a03d24f164fc576ef42c,25179,2007-02-02 07:18:04


Max date is 2017-11-05 21:19:42, min is 2006-12-26 15:25:43 


# 4. Create consecutive ID's
- Working with numpy types != python types
- Mapping ID's to consecutive integgers for matrix operations (and scipy sparse matrices, see https://docs.scipy.org/doc/scipy/reference/sparse.html) 

In [12]:
#change type book_id to numpy.int64
display(interactions_df_processed.dtypes)
interactions_df_processed['item_id'] = interactions_df_processed['item_id'].astype('int64')
display(interactions_df_processed.dtypes)

user_id             object
item_id             object
datetime    datetime64[ns]
dtype: object

user_id             object
item_id              int64
datetime    datetime64[ns]
dtype: object

In [13]:
dct = {}
def map_to_consecutive_id(uuid):
  if uuid in dct:
    return dct[uuid]
  else:
    id = len(dct)
    dct[uuid] = id
    return id

#1) convert user uuid to consecutive integer ID's 
interactions_df_processed['user_id_int'] = interactions_df_processed['user_id'].progress_apply(map_to_consecutive_id)

#2) convert book_id to to consecutive integer ID's 
dct.clear()
interactions_df_processed['item_id_int'] = interactions_df_processed['item_id'].progress_apply(map_to_consecutive_id)
display(interactions_df_processed.head(10))

column = interactions_df_processed['item_id_int'] 
max_item_id = column.max()

column = interactions_df_processed['user_id_int'] 
max_user_id = column.max()

100%|██████████| 6355864/6355864 [00:08<00:00, 733073.68it/s]
100%|██████████| 6355864/6355864 [00:09<00:00, 693112.21it/s]


Unnamed: 0,user_id,item_id,datetime,user_id_int,item_id_int
0,a309c35c5c32f1edbdc5e6770848394a,15067,2006-12-26 15:25:43,0,0
1,12c1ea7e1c88a03d24f164fc576ef42c,53178,2007-02-02 07:14:38,1,1
2,12c1ea7e1c88a03d24f164fc576ef42c,53179,2007-02-02 07:14:53,1,2
3,12c1ea7e1c88a03d24f164fc576ef42c,39916,2007-02-02 07:17:54,1,3
4,12c1ea7e1c88a03d24f164fc576ef42c,25179,2007-02-02 07:18:04,1,4
5,12c1ea7e1c88a03d24f164fc576ef42c,38333,2007-02-02 07:18:23,1,5
6,45c3f0e4d05be7eeca4ebb1f88646113,102920,2007-02-14 21:18:46,2,6
7,45c3f0e4d05be7eeca4ebb1f88646113,23754,2007-02-15 17:45:20,2,7
8,45c3f0e4d05be7eeca4ebb1f88646113,25103,2007-02-15 17:45:28,2,8
9,45c3f0e4d05be7eeca4ebb1f88646113,25106,2007-02-15 17:45:29,2,9


In [14]:
df = interactions_df_processed[['user_id_int', 'item_id_int', 'datetime', 'user_id', 'item_id']].copy().rename(columns={"user_id_int": "user_id", "item_id_int": "item_id", "datetime": "datetime", "user_id": 'old_user', "item_id": "old_item"})
display(df)

print(f"Max user_id is {max_user_id}, Max item_id is {max_item_id}")

Unnamed: 0,user_id,item_id,datetime,old_user,old_item
0,0,0,2006-12-26 15:25:43,a309c35c5c32f1edbdc5e6770848394a,15067
1,1,1,2007-02-02 07:14:38,12c1ea7e1c88a03d24f164fc576ef42c,53178
2,1,2,2007-02-02 07:14:53,12c1ea7e1c88a03d24f164fc576ef42c,53179
3,1,3,2007-02-02 07:17:54,12c1ea7e1c88a03d24f164fc576ef42c,39916
4,1,4,2007-02-02 07:18:04,12c1ea7e1c88a03d24f164fc576ef42c,25179
...,...,...,...,...,...
6355859,96720,21580,2017-11-05 16:21:49,e90ae527118f4e30c4b15b7ff69a0e12,24612600
6355860,5968,1671,2017-11-05 19:23:22,617e3285e80e7b7468fd5c69e3d881cd,1270615
6355861,69300,186,2017-11-05 19:55:34,f3be22529de075b9b877f4d4ec8025c4,5805
6355862,60231,186,2017-11-05 20:26:48,e4f0ca741ff5113c25fcd562a8c5aedc,5805


Max user_id is 148303, Max item_id is 26430


In [19]:
def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('user_id')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            #print("%d users sampled" % i)
            #sys.stdout.flush()
            pass

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

for iteration in range(3):
    print(f"iteration{iteration}")
    # splitting
    train, test = train_test_split(df, test_size=0.2, shuffle=True)

    # rebuilding user_id
    dct.clear()

    train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)

    dct.clear()

    test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)

    dct.clear()

    test_tr, test_te = split_train_test_proportion(test)
    test_tr.to_csv(f'./test_tr{iteration}.csv', index=False)
    test_te.to_csv(f'./test_te{iteration}.csv', index=False)
    train.to_csv(f'./train{iteration}.csv', index=False)
    test.to_csv(f'./test{iteration}.csv', index=False)


iteration0


100%|██████████| 5084691/5084691 [00:08<00:00, 614707.62it/s]
100%|██████████| 1271173/1271173 [00:02<00:00, 584062.78it/s]


iteration1


100%|██████████| 5084691/5084691 [00:09<00:00, 544905.53it/s]
100%|██████████| 1271173/1271173 [00:02<00:00, 545667.33it/s]


iteration2


100%|██████████| 5084691/5084691 [00:08<00:00, 596948.65it/s]
100%|██████████| 1271173/1271173 [00:02<00:00, 542163.58it/s]


In [20]:
test_tr["item_id"].max()

26430

In [30]:
# folds

counter = 0
for iteration in range(3):
    df = pd.read_csv(f"train{iteration}.csv")

    kf = KFold(n_splits=5, shuffle=True) # Define the split - into 5 folds 
    print(kf.get_n_splits(df))
    for train_index, test_index in kf.split(df):
        train = df.iloc[train_index]
        test = df.iloc[test_index]

        # rebuilding user_id
        dct.clear()

        train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)

        dct.clear()

        test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)

        dct.clear()

        test_tr, test_te = split_train_test_proportion(test)
        test_tr.to_csv(f'./test_tr_fold{counter}.csv', index=False)
        test_te.to_csv(f'./test_te_fold{counter}.csv', index=False)
        train.to_csv(f'./train_fold{counter}.csv', index=False)
        test.to_csv(f'./test_fold{counter}.csv', index=False)
        print(train.head())
        counter+=1


5


100%|██████████| 4067752/4067752 [00:06<00:00, 614023.11it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016939/1016939 [00:01<00:00, 606019.25it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0      128  2017-09-01 02:23:41  ee0d05672402dfa749653b7b13097ffd   
1        1      216  2008-08-19 10:43:58  30ac74c769883b12f6db2262c816ead2   
3        2     4172  2013-06-09 21:40:43  9002e18fc10924ec00145e26a1cf72a0   
4        3     6615  2016-05-04 20:43:51  1d83ade6f1901e01b3a6a7d5c5f6fab0   
5        4    14636  2016-11-25 05:17:09  a8743951826cbac65ed05e69c85dd206   

   old_item  
0    271265  
1    102955  
3     52368  
4   2418888  
5   6345999  


100%|██████████| 4067753/4067753 [00:07<00:00, 548859.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 411052.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0      128  2017-09-01 02:23:41  ee0d05672402dfa749653b7b13097ffd   
2        1    19334  2017-03-06 11:04:31  407a03010634c3d3aa104f7c44c2c6ee   
3        2     4172  2013-06-09 21:40:43  9002e18fc10924ec00145e26a1cf72a0   
4        3     6615  2016-05-04 20:43:51  1d83ade6f1901e01b3a6a7d5c5f6fab0   
5        4    14636  2016-11-25 05:17:09  a8743951826cbac65ed05e69c85dd206   

   old_item  
0    271265  
2  18405520  
3     52368  
4   2418888  
5   6345999  


100%|██████████| 4067753/4067753 [00:07<00:00, 544804.23it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 507428.03it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0      128  2017-09-01 02:23:41  ee0d05672402dfa749653b7b13097ffd   
1        1      216  2008-08-19 10:43:58  30ac74c769883b12f6db2262c816ead2   
2        2    19334  2017-03-06 11:04:31  407a03010634c3d3aa104f7c44c2c6ee   
4        3     6615  2016-05-04 20:43:51  1d83ade6f1901e01b3a6a7d5c5f6fab0   
5        4    14636  2016-11-25 05:17:09  a8743951826cbac65ed05e69c85dd206   

   old_item  
0    271265  
1    102955  
2  18405520  
4   2418888  
5   6345999  


100%|██████████| 4067753/4067753 [00:07<00:00, 575395.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:01<00:00, 514962.16it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
1        0      216  2008-08-19 10:43:58  30ac74c769883b12f6db2262c816ead2   
2        1    19334  2017-03-06 11:04:31  407a03010634c3d3aa104f7c44c2c6ee   
3        2     4172  2013-06-09 21:40:43  9002e18fc10924ec00145e26a1cf72a0   
4        3     6615  2016-05-04 20:43:51  1d83ade6f1901e01b3a6a7d5c5f6fab0   
6        4    12100  2012-12-09 00:30:56  15632d9740f8b86b8d7e99f11edc248c   

   old_item  
1    102955  
2  18405520  
3     52368  
4   2418888  
6   7684644  


100%|██████████| 4067753/4067753 [00:07<00:00, 561625.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 499924.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0      128  2017-09-01 02:23:41  ee0d05672402dfa749653b7b13097ffd   
1        1      216  2008-08-19 10:43:58  30ac74c769883b12f6db2262c816ead2   
2        2    19334  2017-03-06 11:04:31  407a03010634c3d3aa104f7c44c2c6ee   
3        3     4172  2013-06-09 21:40:43  9002e18fc10924ec00145e26a1cf72a0   
5        4    14636  2016-11-25 05:17:09  a8743951826cbac65ed05e69c85dd206   

   old_item  
0    271265  
1    102955  
2  18405520  
3     52368  
5   6345999  
5


100%|██████████| 4067752/4067752 [00:07<00:00, 529319.85it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016939/1016939 [00:01<00:00, 526656.80it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0     9239  2013-09-02 13:05:32  07290e6714f98ef10f9b843a6ddf54d2   
1        1    12663  2012-07-15 05:25:14  8ab6526c1e17dc9aa1befb767cbfa4e2   
3        2    23419  2016-09-08 05:46:58  ba7d7bab66e081c1999c281fc981fb87   
4        3    22615  2016-01-27 19:00:30  f5f463ec57f86011c539f1d7cc2bd35e   
5        4    22822  2017-01-02 17:48:05  dfd25b244a487a1b3c1aa315794dd316   

   old_item  
0   6280053  
1  10630620  
3  27247277  
4  25870111  
5  25604474  


100%|██████████| 4067753/4067753 [00:07<00:00, 542193.42it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:01<00:00, 514866.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0     9239  2013-09-02 13:05:32  07290e6714f98ef10f9b843a6ddf54d2   
2        1    25319  2017-02-08 06:28:01  48c90de0626f9334e6f621ae4214e2b9   
3        2    23419  2016-09-08 05:46:58  ba7d7bab66e081c1999c281fc981fb87   
4        3    22615  2016-01-27 19:00:30  f5f463ec57f86011c539f1d7cc2bd35e   
5        4    22822  2017-01-02 17:48:05  dfd25b244a487a1b3c1aa315794dd316   

   old_item  
0   6280053  
2  30082505  
3  27247277  
4  25870111  
5  25604474  


100%|██████████| 4067753/4067753 [00:07<00:00, 576592.69it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 497405.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0     9239  2013-09-02 13:05:32  07290e6714f98ef10f9b843a6ddf54d2   
1        1    12663  2012-07-15 05:25:14  8ab6526c1e17dc9aa1befb767cbfa4e2   
2        2    25319  2017-02-08 06:28:01  48c90de0626f9334e6f621ae4214e2b9   
4        3    22615  2016-01-27 19:00:30  f5f463ec57f86011c539f1d7cc2bd35e   
5        4    22822  2017-01-02 17:48:05  dfd25b244a487a1b3c1aa315794dd316   

   old_item  
0   6280053  
1  10630620  
2  30082505  
4  25870111  
5  25604474  


100%|██████████| 4067753/4067753 [00:07<00:00, 538257.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 491741.67it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
1        0    12663  2012-07-15 05:25:14  8ab6526c1e17dc9aa1befb767cbfa4e2   
2        1    25319  2017-02-08 06:28:01  48c90de0626f9334e6f621ae4214e2b9   
3        2    23419  2016-09-08 05:46:58  ba7d7bab66e081c1999c281fc981fb87   
4        3    22615  2016-01-27 19:00:30  f5f463ec57f86011c539f1d7cc2bd35e   
6        4     4443  2016-10-26 17:08:57  157e04019ba370c02172a8df2aff238b   

   old_item  
1  10630620  
2  30082505  
3  27247277  
4  25870111  
6    364960  


100%|██████████| 4067753/4067753 [00:07<00:00, 544111.92it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:01<00:00, 517679.04it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0     9239  2013-09-02 13:05:32  07290e6714f98ef10f9b843a6ddf54d2   
1        1    12663  2012-07-15 05:25:14  8ab6526c1e17dc9aa1befb767cbfa4e2   
2        2    25319  2017-02-08 06:28:01  48c90de0626f9334e6f621ae4214e2b9   
3        3    23419  2016-09-08 05:46:58  ba7d7bab66e081c1999c281fc981fb87   
5        4    22822  2017-01-02 17:48:05  dfd25b244a487a1b3c1aa315794dd316   

   old_item  
0   6280053  
1  10630620  
2  30082505  
3  27247277  
5  25604474  
5


100%|██████████| 4067752/4067752 [00:07<00:00, 522685.54it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016939/1016939 [00:01<00:00, 521872.02it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0    24244  2017-03-25 08:01:02  dd669721e136c1be47d739b14fa23d20   
1        1     1948  2013-05-27 07:45:38  95b359d98964316e5b7cd33d0ea6e940   
3        2    20170  2017-06-08 05:32:16  8eea88b90159fe51fb97ed09ed72579a   
4        3      413  2016-06-09 15:13:43  448d266cf9a737a8e5a0fad159b580b2   
5        4      233  2014-11-13 17:08:55  9782d82fdbfa09da49e1b0a5344f4bd4   

   old_item  
0  29093045  
1    294963  
3  18667307  
4     30220  
5    472331  


100%|██████████| 4067753/4067753 [00:07<00:00, 541493.48it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 485059.30it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0    24244  2017-03-25 08:01:02  dd669721e136c1be47d739b14fa23d20   
2        1    18309  2013-09-29 12:44:50  6524dab6c11287750ff8d4f8d373b384   
3        2    20170  2017-06-08 05:32:16  8eea88b90159fe51fb97ed09ed72579a   
4        3      413  2016-06-09 15:13:43  448d266cf9a737a8e5a0fad159b580b2   
5        4      233  2014-11-13 17:08:55  9782d82fdbfa09da49e1b0a5344f4bd4   

   old_item  
0  29093045  
2  17785891  
3  18667307  
4     30220  
5    472331  


100%|██████████| 4067753/4067753 [00:07<00:00, 531851.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 492226.92it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0    24244  2017-03-25 08:01:02  dd669721e136c1be47d739b14fa23d20   
1        1     1948  2013-05-27 07:45:38  95b359d98964316e5b7cd33d0ea6e940   
2        2    18309  2013-09-29 12:44:50  6524dab6c11287750ff8d4f8d373b384   
4        3      413  2016-06-09 15:13:43  448d266cf9a737a8e5a0fad159b580b2   
5        4      233  2014-11-13 17:08:55  9782d82fdbfa09da49e1b0a5344f4bd4   

   old_item  
0  29093045  
1    294963  
2  17785891  
4     30220  
5    472331  


100%|██████████| 4067753/4067753 [00:07<00:00, 535147.65it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:02<00:00, 504445.10it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
1        0     1948  2013-05-27 07:45:38  95b359d98964316e5b7cd33d0ea6e940   
2        1    18309  2013-09-29 12:44:50  6524dab6c11287750ff8d4f8d373b384   
3        2    20170  2017-06-08 05:32:16  8eea88b90159fe51fb97ed09ed72579a   
4        3      413  2016-06-09 15:13:43  448d266cf9a737a8e5a0fad159b580b2   
6        4     2577  2016-01-26 17:37:46  aed07d8f90b4391ce1b1c7415f606b85   

   old_item  
1    294963  
2  17785891  
3  18667307  
4     30220  
6    507689  


100%|██████████| 4067753/4067753 [00:07<00:00, 555301.61it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['user_id'] = train['user_id'].progress_apply(map_to_consecutive_id)
100%|██████████| 1016938/1016938 [00:01<00:00, 519824.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['user_id'] = test['user_id'].progress_apply(map_to_consecutive_id)


   user_id  item_id             datetime                          old_user  \
0        0    24244  2017-03-25 08:01:02  dd669721e136c1be47d739b14fa23d20   
1        1     1948  2013-05-27 07:45:38  95b359d98964316e5b7cd33d0ea6e940   
2        2    18309  2013-09-29 12:44:50  6524dab6c11287750ff8d4f8d373b384   
3        3    20170  2017-06-08 05:32:16  8eea88b90159fe51fb97ed09ed72579a   
5        4      233  2014-11-13 17:08:55  9782d82fdbfa09da49e1b0a5344f4bd4   

   old_item  
0  29093045  
1    294963  
2  17785891  
3  18667307  
5    472331  
