## Coicop classification

In [13]:
import os
import time
import datetime
import numpy as np
import pandas as pd
from sklearn import metrics

In [2]:
# Path variables
os.chdir("/Users/Alessandra/Downloads/foodies")
ROOT = os.getcwd()

In [3]:
strip = lambda x : x.strip()
foodies = pd.read_csv(os.path.join(ROOT, "clean2014Q3.csv") , sep = ",", header=0,
                      names=["coicop", "EXPDESC", "Paid1", "Shop", "MAFFQuan", "MAFFUnit"],
                      converters = {'coicop' : strip,
                                    'EXPDESC' : strip,
                                    'Paid1' : strip,
                                    'Shop' : strip,
                                    'MAFFQuan': float,
                                    'MAFFUnit': strip})

In [4]:
foodies.head()

Unnamed: 0,coicop,EXPDESC,Paid1,Shop,MAFFQuan,MAFFUnit
0,11111,loaf wht unsl fh,80,20,400.0,Grams
1,11111,loaf wht unsl fh,80,20,400.0,Grams
2,11111,loaf wht unsl fh,85,20,600.0,Grams
3,11111,wht bread un sl,100,20,400.0,Grams
4,11111,wht bread un sl,100,20,400.0,Grams


In [5]:
foodies.columns

Index([u'coicop', u'EXPDESC', u'Paid1', u'Shop', u'MAFFQuan', u'MAFFUnit'], dtype='object')

In [6]:
# Predict full coicop
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Create doc-term matrix
vect = CountVectorizer(min_df=1, analyzer='word')
X_vect = vect.fit_transform(np.array(foodies['EXPDESC']))

In [8]:
np.shape(X_vect)

(373525, 9288)

In [35]:
# Column names of X_vect: the descriptions array
vocab = vect.get_feature_names()
feature_names = np.array(vocab)

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False) # Just normalise
X_tf = tf_transformer.fit_transform(X_vect)
X_tf.shape

(373525, 9288)

In [12]:
# Train / Test split here
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tf, foodies["coicop"], test_size=0.2, random_state=10)

In [14]:
def top_keywords(clf, k = 10):
    if hasattr(clf, 'coef_'):
        print("top 10 keywords per class:")
        for i, label in enumerate(set(y_train)):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print("%s: %s" % (label, " ".join(feature_names[top10])))

## Naïve Bayes classifier

In [15]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_train, y_train)

In [16]:
# Top 10 keywords per class
top_keywords(nb)

top 10 keywords per class:
11465: up mbuy disc loaf fh bloomer tiger unsl wht bread
12131: x2 toastie mbuy fh disc up df sl wht bread
11821: up x2 wv bloomer unsliced unsl sl bread wht pr
11823: warburtons fh loaf sl wv disc bread batch wht seeded
11822: granary hovis fh disc wheatgerm seeded sl up brn bread
11784: mbuy up fh hovis sl kingsmill disc 5050 wmeal bread
11785: baps x4 fh x6 disc roll wht mbuy bread rolls
11786: veda wv fh mbuy soreen bread disc malt fruit loaf
11541: petit fh stick baguettes wht disc baton french bread baguette
11781: watchers weight loaf sl disc fh nimble wht danish bread
11782: roll wrap filled chicken ns swch mbuy disc retail swich
11783: thins rolls cheese bagels pitta mbuy naan disc garlic bread
11361: fh cross mbuy pancakes hot croissants disc scones buns crumpets
12121: melba toast crackerbread croutons crispbreads disc pretzels breadsticks crispbread ryvita
11661: chip mbuy jaffa cakes disc kat kit cookies bisc choc
11111: digestives shortbread coo

In [17]:
nb_pred = nb.predict(X_test)

In [19]:
# 5 digits coicops metrics
print(metrics.classification_report(y_test, nb_pred))

             precision    recall  f1-score   support

      11111       1.00      0.22      0.36       217
      11112       0.82      1.00      0.90      1241
      11113       0.00      0.00      0.00         1
      11114       0.00      0.00      0.00        18
      11121       1.00      0.25      0.40       163
      11122       0.92      0.99      0.95       879
      11131       0.88      0.99      0.93       955
      11132       1.00      0.61      0.76        90
      11133       0.99      0.75      0.85       318
      11134       1.00      0.03      0.07        58
      11135       1.00      0.81      0.89       252
      11136       0.88      0.93      0.90       953
      11141       0.96      0.93      0.95       803
      11142       1.00      0.82      0.90       183
      11143       0.92      0.95      0.94      1197
      11144       0.83      0.97      0.89      1808
      11145       0.99      0.71      0.82       326
      11151       1.00      0.86      0.92   

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf = RandomForestClassifier(n_estimators=100, random_state = 4, verbose = 1).fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 13.6min
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 28.1min finished


In [22]:
rf_pred = rf.predict(X_test)

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:   10.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   22.1s finished


In [23]:
# 5 digits coicops metrics
print(metrics.classification_report(y_test, rf_pred))

             precision    recall  f1-score   support

      11111       0.97      0.94      0.95       217
      11112       0.99      1.00      0.99      1241
      11113       0.33      1.00      0.50         1
      11114       0.68      0.94      0.79        18
      11121       0.97      0.93      0.95       163
      11122       0.98      0.99      0.98       879
      11131       0.99      0.99      0.99       955
      11132       0.98      0.91      0.94        90
      11133       0.96      0.96      0.96       318
      11134       1.00      0.93      0.96        58
      11135       0.99      0.97      0.98       252
      11136       0.96      0.96      0.96       953
      11141       0.97      0.98      0.98       803
      11142       0.99      0.96      0.98       183
      11143       0.98      0.96      0.97      1197
      11144       0.98      0.98      0.98      1808
      11145       0.97      0.96      0.96       326
      11151       0.98      0.97      0.98   

In [24]:
feature_names = vect.get_feature_names()
print "Features sorted by their score:"
print sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), 
             reverse=True)[:100]

Features sorted by their score:
[(0.0166, u'semi'), (0.0159, u'yoghurt'), (0.0152, u'crisps'), (0.0151, u'milk'), (0.0145, u'bananas'), (0.0133, u'toms'), (0.0128, u'wine'), (0.0128, u'fh'), (0.0114, u'eggs'), (0.0105, u'bisc'), (0.0103, u'choc'), (0.0102, u'apples'), (0.01, u'sce'), (0.0093, u'tin'), (0.0087, u'lc'), (0.0087, u'chicken'), (0.0087, u'carrots'), (0.0085, u'onions'), (0.0084, u'pizza'), (0.0084, u'cheddar'), (0.0078, u'rolls'), (0.0077, u'reg'), (0.0077, u'mushrooms'), (0.0076, u'soup'), (0.0076, u'bread'), (0.0075, u'grapes'), (0.0073, u'sweets'), (0.0069, u'pure'), (0.0066, u'whl'), (0.0066, u'df'), (0.0064, u'cucumber'), (0.0062, u'sl'), (0.0062, u'ns'), (0.0056, u'wht'), (0.0055, u'fz'), (0.0055, u'butter'), (0.0053, u'wmeal'), (0.0053, u'cake'), (0.0052, u'broc'), (0.0051, u'uc'), (0.005, u'sugar'), (0.005, u'peppers'), (0.005, u'cheese'), (0.0049, u'strawberries'), (0.0049, u'skm'), (0.0048, u'jce'), (0.0047, u'lager'), (0.0047, u'bacon'), (0.0046, u'potatoes'), (0

## Support Vector Machine Classifier

In [25]:
from sklearn import svm
lsvm = svm.LinearSVC().fit(X_train, y_train) 

In [26]:
svm_pred = lsvm.predict(X_test)

In [36]:
# Top 10 keywords per class
top_keywords(lsvm)

top 10 keywords per class:
11465: whtun farmhouse unsliced uncut bloomerwht giraffe unsl bloomer tiger un
12131: whit bread round wht sl slc whtsl toastie breaddf df
11821: bloomer up x2 wv bread unsl sl unsliced wht pr
11823: seed multigrain eht whtrcd batch batchwht wht wheat softgrain seeded
11822: wheat brwn brnbread brown altamura wgerm wheatie wheatbread brn wheatgerm
11784: grain halfhalfbread both granary whlmeal bob multigrain wml 5050 wmeal
11785: rollsrcd breadcakesrolls cobs bap breadcake breadcakes rollsms roll baps rolls
11786: sultana soreen bread maltbread fruit malt veda date loaf maltloaf
11541: pains artisan pain gluten french batons vienna baguettes baguette baton
11781: watches watchers loaf brd bread wwatchers weight weightwatchers nimble danish
11782: swichmdeal swichn swiche sandwich filled wrap swiches retail swch swich
11783: pannini thins doughballs naan pittas topped panini pitta bagels ciabatta
11361: buns barm lardy teacakes croissant scone croissants scon

In [28]:
# 5 digits coicops metrics
print(metrics.classification_report(y_test, svm_pred))

             precision    recall  f1-score   support

      11111       0.99      0.96      0.97       217
      11112       0.99      1.00      1.00      1241
      11113       0.00      0.00      0.00         1
      11114       0.76      0.89      0.82        18
      11121       0.97      0.97      0.97       163
      11122       0.99      0.99      0.99       879
      11131       0.98      1.00      0.99       955
      11132       0.98      0.93      0.95        90
      11133       0.96      0.99      0.98       318
      11134       1.00      1.00      1.00        58
      11135       1.00      0.98      0.99       252
      11136       0.97      0.95      0.96       953
      11141       0.96      0.99      0.97       803
      11142       0.99      0.98      0.98       183
      11143       0.96      0.96      0.96      1197
      11144       0.97      0.99      0.98      1808
      11145       0.96      0.96      0.96       326
      11151       0.98      0.98      0.98   