In [1]:
import pandas as pd
from copy import deepcopy

from collections import Counter
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
# Load the patterns table; the path is relative to the notebook's working directory.
df = pd.read_csv('../think_data/patterns_data.csv')

In [22]:
# Count the non-null user_id entries recorded against each pattern_id
# (presumably one row per like -- confirm against the export script).
likes_df = pd.read_csv('../think_data/user_data.csv')
counts = likes_df['user_id'].groupby(likes_df['pattern_id']).count()

# Keep patterns with at least 5 such entries.  Patterns that never appear in
# user_data.csv map to NaN, which compares False, so they are dropped as well.
filtered_df = df.loc[df['pattern_id'].map(counts).ge(5)]

In [23]:
# Columns that must be populated; used as the `subset` when dropping incomplete rows.
required = [
    'keywords',
    'category',
    'difficulty',
    'permalink',
    'difficulty_average',
    'craft',
]

In [6]:
# Per-column defaults used to fill missing values in the optional columns.
fillna_dict = {
    # numeric columns fall back to 0
    'gauge_divisor': 0,
    'gauge': 0,
    'row_gauge': 0,
    # free-text column gets a placeholder token instead of NaN (presumably so
    # the text vectoriser always receives a string -- confirm)
    'gauge_pattern': 'xxxBonusWordxxx',
    'yardage': 0,
    'ply': 0,
}

In [24]:
# Drop rows with a missing value in any `required` column.
# NOTE: this rebinds `df`, replacing the raw frame loaded earlier.
df = filtered_df.dropna(subset=required)

In [25]:
# Fill the remaining gaps using the per-column defaults in `fillna_dict`.
df = df.fillna(fillna_dict)

In [26]:
# Display the column names of the cleaned frame.
df.columns

Index([u'pattern_id', u'keywords', u'category', u'difficulty', u'permalink',
       u'difficulty_average', u'gauge_divisor', u'gauge', u'row_gauge',
       u'gauge_pattern', u'yardage', u'ply', u'craft'],
      dtype='object')

In [27]:
# Print the five most frequent categories (value_counts sorts by descending count).
print(df.category.value_counts().index[:5].tolist())

['shawl-wrap', 'pullover', 'cardigan', 'beanie-toque', 'scarf']


## Transforming the data

Here I'm importing a few helper methods I wrote to make the data transformation a bit easier. They let me define all the transformations I want in a simple dictionary, and then pass that to the transformer to work on. I wrote this stuff a while ago, so it's not my best work, but it gets the job done.

In [2]:
from util.data_transformation_helpers import *

In [28]:
def _tfidf_pipeline(**vectoriser_kwargs):
    """Build a CountVectorizer -> TfidfTransformer pipeline.

    Keyword arguments are forwarded unchanged to ``CountVectorizer``.
    """
    return NameGettingPipeline([
        ('vectoriser', CountVectorizer(**vectoriser_kwargs)),
        ('weighting', TfidfTransformer()),
    ])


# Prototype transformer for each transform type, keyed by the name used in the
# column specification.
transformers = {
    # free text: English stop words removed, document-frequency bounds 0.002 .. 0.2
    'bag of words': _tfidf_pipeline(min_df=0.002, max_df=0.2, stop_words='english'),
    # '|'-delimited lists: each delimited entry is one token
    'keyword list': _tfidf_pipeline(tokenizer=lambda x: x.split('|')),
    'minmax': MinMaxWrapper(),
    'one-hot': OneHotWrapper(),
}

Here's the dictionary which describes how I'm transforming the data. A "keyword list" is basically turning each word in the list into a separate column, "minmax" is scaling numeric values to be between 0 and 1, "one-hot" is turning a categorical column into separate columns, and "bag of words" is extracting important words from free text.

The numbers that follow are the "weights" for each column. The transformers ensure that every column has a value between 0 and 1, and then they are multiplied by the weight to arrive at a final value.

In [29]:
# One entry per feature column: (column name, transform type, weight).
# The transform type must be a key of `transformers`.
data_transform = [
    ('keywords',      'keyword list', 1),
    ('category',      'keyword list', 2),
    ('difficulty',    'minmax',       2),
    ('ply',           'minmax',       3),
    ('gauge',         'minmax',       1),
    ('yardage',       'minmax',       1),
    ('craft',         'one-hot',      4),
    ('gauge_pattern', 'bag of words', 1),
]

In [30]:
def _column_pipeline(column, transform_type):
    """Pipeline that selects ``column`` and applies its own copy of the named transformer."""
    return NameGettingPipeline([
        ('selector', ItemSelector(column)),
        # deepcopy: columns that share a transform type each get a separate instance
        ('transformer', deepcopy(transformers[transform_type])),
    ])


# (name, pipeline) pairs, one per entry of `data_transform`; the weight is not needed here.
transform_set = [(column, _column_pipeline(column, transform_type))
                 for column, transform_type, weight in data_transform]

In [31]:
# Map each column name to its weight; the keys match the names used in `transform_set`.
weights = dict((column, weight) for column, _, weight in data_transform)

# Combine all per-column pipelines, scaling each one's output by its weight.
fu = FeatureUnion(transformer_list=transform_set, transformer_weights=weights)

In [32]:

# Fit the feature union on the cleaned frame; the fitted estimator's repr is the cell output.
fu.fit(df)

FeatureUnion(n_jobs=1,
       transformer_list=[('keywords', NameGettingPipeline(steps=[('selector', <util.data_transformation_helpers.ItemSelector object at 0x12c340b10>), ('transformer', NameGettingPipeline(steps=[('vectoriser', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy....g', TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
         use_idf=True))]))]))],
       transformer_weights={'category': 2, 'yardage': 1, 'gauge_pattern': 1, 'difficulty': 2, 'craft': 4, 'gauge': 1, 'ply': 3, 'keywords': 1})

In [33]:
# Build the weighted feature matrix. Later cells index `features` and `df` by
# the same position, so rows are assumed to line up with df.iloc -- TODO confirm.
features = fu.transform(df)

In [35]:
# Position (not index label) of the query pattern within `df`;
# raises ValueError if the permalink is not present.
target = df['permalink'].tolist().index('mr-dangly')

In [40]:
# Inspect the query row. `target` is a list position, so use positional lookup (.iloc), not .loc.
df.iloc[target]

pattern_id                                                 3150
keywords              fringe|seamed|written-pattern|worked-flat
category                                                 animal
difficulty                                              2.83824
permalink                                             mr-dangly
difficulty_average                                      2.83824
gauge_divisor                                                 0
gauge                                                         0
row_gauge                                                     0
gauge_pattern                                   xxxBonusWordxxx
yardage                                                       0
ply                                                           0
craft                                                  knitting
Name: 505, dtype: object

In [41]:
def get_closest_n(target, matrix, n):
    """Return the positions of the ``n`` rows of ``matrix`` nearest to row ``target``.

    Distances are computed with ``pairwise_distances`` using its default
    metric.  The query row is itself part of ``matrix``, so it is included in
    the ranking (at distance 0 from itself).

    Parameters
    ----------
    target : int
        Positional index of the query row in ``matrix``.
    matrix : 2-D array or CSR sparse matrix
        Feature matrix to search.  The earlier version ignored this argument
        and always read the global ``features``.
    n : int
        Number of row positions to return.

    Returns
    -------
    pandas.Series
        Row positions ordered from nearest to farthest, suitable for
        ``DataFrame.iloc``.  Equally distant rows keep their original order.
    """
    # Fancy-index with a one-element list so the query stays two-dimensional
    # (a 1 x n_features block) for both dense arrays and CSR matrices.
    query = matrix[[target]]
    distances = pd.Series(pairwise_distances(matrix, query).ravel())
    # mergesort is stable, so ties are returned in a repeatable order;
    # .iloc makes the "first n" slice explicitly positional.
    return distances.argsort(kind='mergesort').iloc[:n]

In [42]:
# Show the 10 rows nearest to the query pattern. The query row is part of the
# matrix, so it appears in the result (distance 0 to itself).
df.iloc[get_closest_n(target, features, 10)]

Unnamed: 0,pattern_id,keywords,category,difficulty,permalink,difficulty_average,gauge_divisor,gauge,row_gauge,gauge_pattern,yardage,ply,craft
505,3150,fringe|seamed|written-pattern|worked-flat,animal,2.838235,mr-dangly,2.838235,0.0,0.0,0.0,xxxBonusWordxxx,0.0,0.0,knitting
22137,169279,fringe|seamed|amigurumi|3-dimensional|written-...,animal,2.0,spring-collection,2.0,1.0,7.0,9.0,xxxBonusWordxxx,0.0,0.0,knitting
224579,179532,seamed|written-pattern|worked-flat,animal,1.857143,spring-lambs,1.857143,0.0,0.0,0.0,xxxBonusWordxxx,0.0,0.0,knitting
55869,521500,seamed|written-pattern|worked-flat,animal,1.75,pocket-fox,1.75,4.0,0.0,0.0,xxxBonusWordxxx,0.0,0.0,knitting
207269,1253,felted|in-the-round|fringe|one-piece|seamless|...,animal,1.571429,jellyfishin,1.571429,0.0,0.0,0.0,xxxBonusWordxxx,54.0,0.0,knitting
60124,33254,seamed|written-pattern|worked-flat,animal,1.5,knitted-kitten,1.5,0.0,0.0,0.0,xxxBonusWordxxx,0.0,0.0,knitting
511,3208,seamed|written-pattern|worked-flat,animal,3.090909,snoozing-ned,3.090909,4.0,28.0,36.0,xxxBonusWordxxx,0.0,0.0,knitting
253387,54441,fringe|seamed|written-pattern|worked-flat,animal,4.315789,leo-the-lion-3,4.315789,0.0,0.0,0.0,xxxBonusWordxxx,0.0,4.0,knitting
223101,688076,seamed|written-pattern,animal,3.0,toys-from-the-toybox,3.0,4.0,0.0,0.0,xxxBonusWordxxx,164.0,0.0,knitting
371434,229623,seamed|written-pattern,animal,2.333333,cats,2.333333,0.0,0.0,0.0,xxxBonusWordxxx,0.0,0.0,knitting
