In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances
from math import log, sqrt

In [2]:
# Load the user data CSV; later cells group it by `pattern_id` and read `user_id`.
df = pd.read_csv('../../think_data/user_data.csv')

In [3]:
# Read the pattern metadata and index it by `pattern_id`, keeping the column
# itself in the frame so later cells can still read `patterns_df.pattern_id`.
patterns_df = (
    pd.read_csv('../../think_data/patterns_data.csv')
    .set_index('pattern_id', drop=False)
)

In [4]:
# Row count of the loaded table, before any filtering.
len(df)

3756714

In [5]:
# Number of non-null `user_id` entries recorded for each `pattern_id`.
counts = df['user_id'].groupby(df['pattern_id']).count()

In [6]:
# Keep only the rows whose pattern has a count of at least 5 in `counts`.
filtered_df = df.loc[df['pattern_id'].map(counts).ge(5)]

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

def make_matrix(df):
    """Build a sparse pattern-by-user count matrix from a long-format table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``user_id`` and ``pattern_id`` columns; every row is one
        (user, pattern) pair.

    Returns
    -------
    matrix : scipy.sparse.csr_matrix
        Shape ``(n_patterns, n_users)``. Entry ``[i, j]`` is the number of rows
        of ``df`` that pair ``products[i]`` with the j-th distinct user
        (repeated pairs are summed by the sparse constructor).
    products : list
        Distinct ``pattern_id`` values in order of first appearance; position
        ``i`` labels row ``i`` of ``matrix``.
    """
    users = list(df.user_id.unique())
    products = list(df.pattern_id.unique())

    # pd.Categorical assigns each value the position of its entry in the
    # supplied category list. The older `astype('category', categories=...)`
    # spelling is no longer accepted by current pandas releases.
    col = pd.Categorical(df.user_id, categories=users).codes
    row = pd.Categorical(df.pattern_id, categories=products).codes

    # One unit of weight per input row.
    data = np.ones(len(df))

    return csr_matrix((data, (row, col)), shape=(len(products), len(users))), products

In [8]:
# Build the pattern-by-user count matrix (`products` labels its rows), then
# re-weight it with TF-IDF.
matrix, products = make_matrix(filtered_df)

transformer = TfidfTransformer()
matrix = transformer.fit_transform(matrix)

In [9]:
import cPickle as pickle
import gzip

# Persist the matrix row labels as a gzip-compressed pickle.
# The `with` block closes the gzip stream even if pickling raises.
with gzip.open('product_list.pklz', 'wb') as f:
    pickle.dump(products, f)

In [9]:
# (number of patterns, number of users) — rows are patterns, columns are users.
matrix.shape

(148797, 17393)

In [10]:
from sklearn.decomposition import TruncatedSVD

In [38]:
# Reduce each pattern's user vector to 35 components.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without a seed the embedding (and the neighbour list below) changes per run.
shrinky = TruncatedSVD(n_components=35, random_state=0)

In [39]:
# Fit the SVD on the TF-IDF matrix and project every row into the reduced space.
shrunk = shrinky.fit_transform(matrix)

In [40]:
# Permalink of every pattern, keyed by pattern_id. Built from a re-indexed copy
# so that no column of `patterns_df` has its index reassigned in place.
pattern_names = patterns_df.set_index('pattern_id')['permalink']

In [41]:
# Row position (in `products` / `shrunk`) of the pattern whose permalink is 'mr-dangly'.
target = products.index(pattern_names[pattern_names == 'mr-dangly'].index[0])

# pairwise_distances expects a 2-D query; indexing with a list keeps the row
# as shape (1, n_components) instead of collapsing it to 1-D.
distances = pairwise_distances(shrunk, shrunk[[target]], metric='cosine').ravel()

# Pattern ids of the ten rows with the smallest cosine distance to the query
# (the query itself comes first in the recorded output).
similars = [products[n] for n in np.argsort(distances)[:10]]



In [42]:
# Translate the neighbouring pattern ids into their permalinks.
[pattern_names.loc[pattern_id] for pattern_id in similars]

['mr-dangly',
 'socktopus',
 'owlsocks',
 'cthulhuclava',
 'sensible-socks',
 'jacques-crusteau',
 'praying-mantis',
 'robosocks',
 'felted-knit-shroomy',
 'loch-ness-monster']

In [43]:
import urllib
import StringIO
import gzip
import cPickle as pickle

def load_compressed_pickled_object(file_url):
    """Download a gzip-compressed pickle and return the unpickled object.

    Parameters
    ----------
    file_url : str
        URL of the ``.pklz`` resource to fetch.

    Returns
    -------
    object
        Whatever object was pickled into the downloaded file.
    """
    opener = urllib.URLopener()
    response = opener.open(file_url)
    try:
        compressed_string = response.read()
    finally:
        # Release the connection whether or not the read succeeded.
        response.close()

    # The whole payload is held in memory and wrapped in a file-like object
    # so that GzipFile can decompress it.
    decompressed_file = gzip.GzipFile(fileobj=StringIO.StringIO(compressed_string))
    try:
        # NOTE(security): unpickling can execute arbitrary code; only call this
        # with URLs whose content is trusted.
        return pickle.load(decompressed_file)
    finally:
        decompressed_file.close()

pattern_names_url = 'https://s3.amazonaws.com/ravelry-data/pattern_names.pklz'
# Parenthesised form prints the same text on Python 2 and is also valid on Python 3.
print('loading patterns')
# NOTE(review): this rebinds `pattern_names`, which an earlier cell set to a
# Series of permalinks — confirm the downloaded object is meant to replace it.
pattern_names = load_compressed_pickled_object(pattern_names_url)

loading patterns


In [20]:
# NOTE(review): row 161395 lies beyond the 148797 rows reported by `matrix.shape`
# earlier in this notebook, and the recorded output is a scalar rather than an
# array — this cell looks stale (out-of-order execution count); confirm or remove.
matrix[161395].toarray()

18.665023335059495