# Data preprocessing

In this notebook, I am going to take the raw book data and prepare it for the modeling stage. 

In [2]:
# Import the necessary libraries 
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # We will use squared=False

import time

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Import the data 
books = pd.read_csv('../data/raw/books.csv')
ratings = pd.read_csv('../data/raw/ratings.csv')
tags = pd.read_csv('../data/raw/tags.csv')
book_tags = pd.read_csv('../data/raw/book_tags.csv')
to_read = pd.read_csv('../data/raw/to_read.csv')

In [4]:
books.replace(to_replace='J.K. Rowling, Mary GrandPré', value = 'J.K. Rowling', inplace=True)

In [5]:
to_read_counts = pd.DataFrame(to_read['book_id'].value_counts())
to_read_counts = to_read_counts.rename(columns={'book_id':'to_read_count'})
to_read_counts.index.name ="book_id"
to_read_counts.reset_index(inplace=True)
to_read_counts.head()

Unnamed: 0,book_id,to_read_count
0,47,2772
1,143,1967
2,113,1840
3,13,1812
4,11,1767


In [6]:
books=books.merge(to_read_counts, on='book_id')
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,to_read_count
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,973
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,400
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,287
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,1478
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,1293


In [7]:
books=books.drop(['image_url','small_image_url'], axis=1)

In [8]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [9]:
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [10]:
tag_merged = book_tags.merge(tags, on='tag_id')
tag_merged.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,30574,167697,to-read
1,2,30574,24549,to-read
2,3,30574,496107,to-read
3,5,30574,11909,to-read
4,6,30574,298,to-read


In [11]:
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,to_read_count
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,973
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,400
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,287
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,1478
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,1293


### Genres
I am going to hardcode some basic genres since the tags list is so varied and unclean. I will base it off of the most popular tags.  

In [12]:
# Top 50 tags
top_tags = sorted(tag_merged.sort_values('count',ascending=False).tag_name.unique()[:50])

In [13]:
# Hard code some popular genres
genres=["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics", "Comics", \
        "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction", "Gay and Lesbian", \
        "Graphic Novels", "Historical Fiction", "History", "Horror", "Humor and Comedy", "Manga", \
        "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal", "Philosophy", "Poetry", "Psychology", \
        "Religion", "Romance", "Science", "Science Fiction", "Self Help", "Suspense", "Spirituality", \
        "Sports", "Thriller", "Travel", "Young Adult"]
for i in range(len(genres)):
    genres[i]=genres[i].lower()

for genre in top_tags: 
    if genre not in genres: 
        genres.append(genre)

In [14]:
len(genres)

72

In [15]:
new_tags=tag_merged[tag_merged.tag_name.isin(genres)]

In [16]:
new_tags.sort_values('count', ascending=False)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
84,865,30574,596234,to-read
6140,2429135,30574,586235,to-read
9108,18143977,30574,505884,to-read
2,3,30574,496107,to-read
1671,24280,30574,488469,to-read
...,...,...,...,...
30195,2693801,8717,1,currently-reading
479243,3061,14552,1,history
526263,452157,32865,1,writing
724531,7011879,6750,1,chick-lit


This has helped us keep tags to a minimum, helping computation, while still adding a lot of value! 

Let's add them to the books dataframe. 

In [17]:
# Get the top 3 tags for each book
top_3 = {}
for book_id in new_tags.goodreads_book_id.unique(): 
    book_tags = new_tags[new_tags.goodreads_book_id == book_id].sort_values('count', ascending=False)
    top_3[book_id] = list(zip(book_tags.tag_id[:3], book_tags.tag_name[:3]))

print(len(new_tags.goodreads_book_id.unique()))
print(len(top_3.keys()))

10000
10000


In [18]:
print(books.shape)

(9986, 22)


In [19]:
# make a new row per item per tag
books=books.join(
    pd.concat(
        [new_tags.set_index('goodreads_book_id')['tag_name']],
        axis=1, keys='tags' #list(range(len(new_tags)))
    ), on='goodreads_book_id')

In [20]:
# # Aggregate the tags into a single column
books=books.groupby('book_id').agg({'goodreads_book_id':'first', 'best_book_id':'first', 'work_id':'first',
       'books_count':'first', 'isbn':'first', 'isbn13':'first', 'authors':'first', 'original_publication_year':'first',
       'original_title':'first', 'title':'first', 'language_code':'first', 'average_rating':'first',
       'ratings_count':'first', 'work_ratings_count':'first', 'work_text_reviews_count':'first',
       'ratings_1':'first', 'ratings_2':'first', 'ratings_3':'first', 'ratings_4':'first', 'ratings_5':'first',
       'to_read_count':'first','t': ', '.join}).reset_index()

In [21]:
books.rename(columns={'t':'tags'}, inplace=True)

In [22]:
print(books.shape)
books.head()

(9986, 23)


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,to_read_count,tags
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,973,"to-read, fantasy, favorites, currently-reading..."
1,2,3,3,4640799,491,439554934,9780440000000.0,J.K. Rowling,1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,400,"to-read, fantasy, favorites, currently-reading..."
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,287,"to-read, fantasy, favorites, currently-reading..."
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,1478,"to-read, favorites, currently-reading, young-a..."
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,1293,"to-read, favorites, currently-reading, young-a..."


### Average vs baysean average

The Bayesian average adjusts the average rating of products whose rating counts fall below a threshold. I'll calcualte it below and show how it impacts averages. 

In [23]:
book_stats = ratings.groupby('book_id')[['rating']].agg(['count', 'mean'])
book_stats.columns = book_stats.columns.droplevel()

In [24]:
C = book_stats['count'].mean()
m = book_stats['mean'].mean()

def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return bayesian_avg

bayesian_avg_ratings = ratings.groupby('book_id')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['book_id', 'bayesian_avg']
book_stats = book_stats.merge(bayesian_avg_ratings, on='book_id')

In [25]:
book_stats = book_stats.merge(books[['book_id', 'title']])
book_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,book_id,count,mean,bayesian_avg,title
24,25,15304,4.525941,4.502533,Harry Potter and the Deathly Hallows (Harry Po...
421,422,1915,4.65953,4.479617,"Harry Potter Boxset (Harry Potter, #1-7)"
861,862,1373,4.70284,4.46031,"Words of Radiance (The Stormlight Archive, #2)"
779,780,1394,4.661406,4.433867,Calvin and Hobbes
26,27,15081,4.443339,4.422748,Harry Potter and the Half-Blood Prince (Harry ...


Above the mean rating and the average rating are shown. The values of the top rated movies are close to their true average, which is good. 

In [26]:
book_stats.sort_values('bayesian_avg', ascending=True).head()

Unnamed: 0,book_id,count,mean,bayesian_avg,title
33,34,7724,3.092439,3.150662,"Fifty Shades of Grey (Fifty Shades, #1)"
2,3,16931,3.214341,3.237825,"Twilight (Twilight, #1)"
48,49,9712,3.21736,3.257114,"New Moon (Twilight, #2)"
1792,1793,295,2.077966,3.299958,One Night at the Call Center
341,342,2863,3.174991,3.300741,The Casual Vacancy


Here we see that the average rating for lower rated films has more varience. This makes sense, as we are 

In [27]:
books=books.merge(book_stats[['book_id','bayesian_avg']], on='book_id')

Now that we have added the tags and info about the to-read shelf, I am ready to move to the modeling phase. 

In [28]:
# Save the data
books.to_csv("../data/processed/books.csv", index=False)

# Modeling

1. Split the data into train and test
2. Performace criterion
3. Write evaluate function

### 1. Split data into train and test

In [29]:
user_ids = ratings.user_id.unique()
len(user_ids)

53424

At first, I want to take only 5k user IDS, then at the end I can scale to the full 53,424. 

In [30]:
users_train, users_test = train_test_split(user_ids, test_size=0.3, random_state=42)

In [31]:
ratings_train = ratings[ratings.user_id.isin(users_train)]
ratings_test = ratings[~ratings.user_id.isin(users_train)]

In [32]:
print(ratings_train.shape, ratings_test.shape)

(4183807, 3) (1792672, 3)


In [33]:
ratings_test.columns

Index(['user_id', 'book_id', 'rating'], dtype='object')

### 2. Performace criterion

Performance can be evaluated in a number of ways. Options include RMSE, ROC curves, Precision/Recall/F-Scores, and cost curves. I am going to focus on RMSE as our main criterion, and also examine ROC curves. 

In [34]:
ratings_train_avg = ratings_train.rating.mean()
ratings_train_avg

3.9213512956023067

In [35]:
def evaluate(estimate_f):
    # RMSE based predictive performace evaluation 
    ids_to_estimate = zip(ratings_test.user_id, ratings_test.book_id)
    estimated = np.array([estimate_f(u,b) for (u,b) in (ids_to_estimate)])
    real = ratings_test.rating.values
    return mean_squared_error(estimated, real, squared=False)

In [36]:
def avg_rating(user_id, book_id):
    return(ratings_train_avg)

In [37]:
ts = time.time()
print("RMSE for content-mean estimate: {:.4f}".format(evaluate(avg_rating)))
print("took {:.2f} seconds ({:.2f} minutes) to evaluate".format(time.time()-ts, (time.time()-ts)/60.0))

RMSE for content-mean estimate: 0.9937
took 0.49 seconds (0.01 minutes) to evaluate


## Collaborative filtering

In [38]:
# Make a sparce matrix

from scipy.sparse import csr_matrix

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe.
    
    Args:
        df: pandas dataframe
    
    Returns:
        X: sparse matrix
        user_mapper: dict that maps user id's to user indices
        user_inv_mapper: dict that maps user indices to user id's
        book_mapper: dict that maps movie id's to movie indices
        book_inv_mapper: dict that maps movie indices to movie id's
    """
    N = df['user_id'].nunique()
    M = df['book_id'].nunique()

    user_mapper = dict(zip(np.unique(df["user_id"]), list(range(N))))
    book_mapper = dict(zip(np.unique(df["book_id"]), list(range(M))))
    
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["user_id"])))
    book_inv_mapper = dict(zip(list(range(M)), np.unique(df["book_id"])))
    
    user_index = [user_mapper[i] for i in df['user_id']]
    book_index = [book_mapper[i] for i in df['book_id']]

    X = csr_matrix((df["rating"], (book_index, user_index)), shape=(M, N))
    
    return X, user_mapper, book_mapper, user_inv_mapper, book_inv_mapper

In [39]:
X, user_mapper, book_mapper, user_inv_mapper, book_inv_mapper = create_X(ratings)

In [40]:
sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 1.12%


Only 1.12% of cells in our user-item matrix are populated with ratings. This is fine, since it is more than 0.5%

In [41]:
# Save the data
books.to_csv("../data/processed/user_item_sparce.csv", index=False)

## Finding similar movies using k-Nearest Neighbours

This approach looks for the k nearest neighbours of a given book by identifying k points in the dataset that are closest to book. kNN makes use of distance metrics such as:

* Cosine similarity
* Euclidean distance
* Manhattan distance
* Pearson correlation

In [42]:
from sklearn.neighbors import NearestNeighbors

def find_similar_books(book_id, X, k, metric='cosine', show_distance=False):
    """
    Finds k-nearest neighbours for a given book id.
    
    Args:
        book_id: id of the book of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Returns:
        list of k similar movie ID's
    """
    neighbour_ids = []
    
    book_ind = book_mapper[book_id]
    book_vec = X[book_id]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    if isinstance(book_vec, (np.ndarray)):
        book_vec = book_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(book_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(book_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [43]:
book_titles = dict(zip(books['book_id'], books['title']))

book_id = 1

similar_ids = find_similar_books(book_id, X, k=10)
book_title = book_titles[book_id]

print(f"Because you read {book_title}:")
for i in similar_ids:
    print(book_titles[i])

Because you read The Hunger Games (The Hunger Games, #1):
Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
Harry Potter and the Chamber of Secrets (Harry Potter, #2)
Harry Potter and the Goblet of Fire (Harry Potter, #4)
Harry Potter and the Order of the Phoenix (Harry Potter, #5)
Harry Potter and the Half-Blood Prince (Harry Potter, #6)
Harry Potter and the Deathly Hallows (Harry Potter, #7)
The Hunger Games (The Hunger Games, #1)
The Hobbit
Twilight (Twilight, #1)
The Fellowship of the Ring (The Lord of the Rings, #1)


In [44]:
book_id = books[books.title=="The Book Thief"].book_id.values[0]

similar_ids = find_similar_books(book_id, X, k=10)
book_title = book_titles[book_id]

print(f"Because you read {book_title}:")
for i in similar_ids:
    print(book_titles[i])

Because you read The Book Thief:
1984
Brave New World
Animal Farm
Lord of the Flies
Slaughterhouse-Five
The Great Gatsby
The Catcher in the Rye
To Kill a Mockingbird
The Hobbit
The Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy, #1)


In [45]:
book_id = books[books.title=="The Iliad"].book_id.values[0]

similar_ids = find_similar_books(book_id, X, k=10)
book_title = book_titles[book_id]

print(f"Because you read {book_title}:")
for i in similar_ids:
    print(book_titles[i])

Because you read The Iliad:
The Cuckoo's Calling (Cormoran Strike, #1)
The Silkworm (Cormoran Strike, #2)
Gone Girl
Career of Evil (Cormoran Strike, #3)
The Girl on the Train
The Fault in Our Stars
The Hunger Games (The Hunger Games, #1)
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
The Goldfinch
Harry Potter and the Deathly Hallows (Harry Potter, #7)


## Types of recommendation systems
* Content-based filtering - Recommend based on user's rating history. 
* Collaborative filtering - Recommend based on other user's rating histories. 
* Hybrid solutions - there are a number of ways to combine the two above, from combining the two predictions to incorperating one system into the other. 

All of my functions will take a single user id and a single movie id and return a single float that is the estimated rating. 

### Content-based filtering

In [46]:
def content_mean(user_id, book_id): 
    # Simple content-based filtering based on mean ratings. 
    if user_id not in ratings_train.user_id: 
        return(ratings_train_avg)
    else: 
        user_condition = ratings_train.user_id == user_id
        return(ratings_train.loc[user_condition, 'rating'].mean())

In [47]:
# ts = time.time()
# print("RMSE for content-mean estimate: {:.4f}".format(evaluate(content_mean)))
# print("took {:.2f} seconds ({:.2f} minutes) to evaluate".format(time.time()-ts, (time.time()-ts)/60.0))

The above takes too long

### Collaborative filtering

In [48]:
def collab_mean(user_id, book_id):
    # index into all ratings of this movie
    user_condition = ratings_train.user_id != user_id
    item_condition = ratings_train.book_id == book_id
    
    ratings_by_others = ratings_train.loc[user_condition & item_condition, 'rating']
    if ratings_by_others.empty: 
        return ratings_train_avg
    else:
        return(ratings_by_others.mean())

In [49]:
# ts = time.time()
# print("RMSE for content-mean estimate: {:.4f}".format(evaluate(collab_mean)))
# print("took {:.2f} seconds ({:.2f} minutes) to evaluate".format(time.time()-ts, (time.time()-ts)/60.0))

The above takes too long

In [50]:
# ratings_mtx_df = ratings_train.pivot_table(values='rating', index='user_id',columns='book_id')

Similarity function: pearson 

In [51]:
def pearson(s1, s2):
    # Take two series and return pearson correlation
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return(np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * (np.sum(s2_c ** 2))))

In [129]:
class CollabPearsonReco:
    
    def learn(self):
        self.all_user_profiles = ratings.pivot_table(values='rating', index='book_id',columns='user_id')
    
    def estimate(self, user_id, book_id):
        ratings_by_others = ratings_train[ratings_train.book_id == book_id]
        if ratings_by_others.empty: 
            return(ratings_train_avg)
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating':their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        
        if ratings_sims.empty: 
            return(their_ratings.mean())
        else:
            return(np.average(ratings_sims.rating, weights=ratings_sims.sim))

In [130]:
ts = time.time()
reco = CollabPearsonReco()
reco.learn()
print("took {:.2f} seconds ({:.2f} minutes) to learn".format(time.time()-ts, (time.time()-ts)/60.0))

took 19.32 seconds (0.32 minutes) to learn


In [None]:
ts = time.time()
print("RMSE for content-mean estimate: {:.4f}".format(evaluate(reco.estimate)))
print("took {:.2f} seconds ({:.2f} minutes) to evaluate".format(time.time()-ts, (time.time()-ts)/60.0))

Try again with all data?

In [None]:
class CollabPearsonReco2:
    
    def learn(self):
        self.all_user_profiles = ratings.pivot_table(values='rating', index='book_id',columns='user_id')
    
    def estimate(self, user_id, book_id):
        ratings_by_others = ratings[ratings.book_id == book_id]
        if ratings_by_others.empty: 
            return(ratings_train_avg)
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating':their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        
        if ratings_sims.empty: 
            return(their_ratings.mean())
        else:
            return(np.average(ratings_sims.rating, weights=ratings_sims.sim))

In [None]:
ts = time.time()
reco2 = CollabPearsonReco2()
reco2.learn()
print("took {:.2f} seconds ({:.2f} minutes) to learn".format(time.time()-ts, (time.time()-ts)/60.0))

In [None]:
ts = time.time()
print("RMSE for content-mean estimate: {:.4f}".format(evaluate(reco2.estimate)))
print("took {:.2f} seconds ({:.2f} minutes) to evaluate".format(time.time()-ts, (time.time()-ts)/60.0))