In [1]:
import gzip
import random
from collections import defaultdict
import numpy as np
import json
from sklearn import linear_model

In [2]:
def parseData(fname):
  """Lazily parse a JSON-lines file.

  Args:
    fname: path of a text file holding one JSON document per line.

  Yields:
    The decoded Python object for each line, in file order. Blank lines
    and lines whose whole content is the JSON literal ``null`` are skipped.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if a non-skipped line is not valid JSON.
  """
  # The context manager closes the handle even if the consumer abandons the
  # generator early or a line fails to decode.
  with open(fname) as f:
    for l in f:
      # Lines read from a file carry their trailing newline, and `is` tests
      # object identity rather than equality, so the former `l is "null"`
      # check could never match. Compare the stripped text by value instead.
      l = l.strip()
      if not l or l == "null": continue
      yield json.loads(l)

In [3]:
# Read the whole dataset into memory, announcing start and finish.
print("Reading data...")
data = [record for record in parseData("modcloth_final_data.json")]
print("done")

Reading data...
done


In [4]:
# Show the first parsed record to see which fields a row carries.
data[0]

{'item_id': '123373',
 'waist': '29',
 'size': 7,
 'quality': 5,
 'cup size': 'd',
 'hips': '38',
 'bra size': '34',
 'category': 'new',
 'bust': '36',
 'height': '5ft 6in',
 'user_name': 'Emily',
 'length': 'just right',
 'fit': 'small',
 'user_id': '991571'}

In [5]:
# Total number of parsed records.
len(data)


82790

In [18]:
# How many records contain a "cup size" field.
sum(1 for d in data if "cup size" in d)

76535

In [19]:
# How many records contain a "bra size" field.
sum(1 for d in data if "bra size" in d)

76772

In [21]:
# How many records contain a "hips" field.
sum(1 for d in data if "hips" in d)

56064

In [22]:
# How many records contain a "bust" field.
sum(1 for d in data if "bust" in d)

11854

In [23]:
# How many records contain a "height" field.
sum(1 for d in data if "height" in d)

81683

In [30]:
# Keep only the records that carry all four measurement fields
# (checked in the same order as before, stopping at the first missing one).
modData = [
    d for d in data
    if all(key in d for key in ("cup size", "bra size", "hips", "height"))
]
len(modData)

54345

In [31]:
# Sorted distinct "cup size" labels among the filtered records.
np.unique(np.asarray([rec['cup size'] for rec in modData]))

array(['a', 'aa', 'b', 'c', 'd', 'dd/e', 'ddd/f', 'dddd/g', 'h', 'i', 'j',
       'k'], dtype='<U6')

In [32]:
# Sorted distinct "bra size" values among the filtered records (stored as strings).
np.unique(np.asarray([rec['bra size'] for rec in modData]))

array(['28', '30', '32', '34', '36', '38', '40', '42', '44', '46', '48'],
      dtype='<U2')

In [33]:
# Sorted distinct "hips" values among the filtered records.
# The values are strings, and the output lists both '30' and '30.0' style
# spellings, so they would need converting to numbers before numeric use.
np.unique(np.asarray([rec['hips'] for rec in modData]))

array(['30', '30.0', '31', '31.0', '32', '32.0', '33', '33.0', '34',
       '34.0', '35', '35.0', '36', '36.0', '37', '37.0', '38', '38.0',
       '39', '39.0', '40', '40.0', '41', '41.0', '42', '42.0', '43',
       '43.0', '44', '44.0', '45', '45.0', '46', '46.0', '47', '47.0',
       '48', '48.0', '49', '49.0', '50', '50.0', '51', '51.0', '52',
       '52.0', '53', '53.0', '54', '54.0', '55', '55.0', '56', '56.0',
       '57', '57.0', '58', '58.0', '59', '59.0', '60', '60.0'],
      dtype='<U4')

In [34]:
# Sorted distinct "height" values among the filtered records.
# Heights are free text such as '5ft 6in' (or just '5ft'), not numbers.
np.unique(np.asarray([rec['height'] for rec in modData]))

array(['3ft', '3ft 11in', '3ft 2in', '3ft 3in', '3ft 4in', '4ft 10in',
       '4ft 11in', '4ft 2in', '4ft 5in', '4ft 7in', '4ft 8in', '4ft 9in',
       '5ft', '5ft 10in', '5ft 11in', '5ft 1in', '5ft 2in', '5ft 3in',
       '5ft 4in', '5ft 5in', '5ft 6in', '5ft 7in', '5ft 8in', '5ft 9in',
       '6ft', '6ft 1in', '6ft 2in', '6ft 3in', '6ft 4in', '6ft 5in',
       '6ft 6in', '6ft 8in', '7ft 11in', '7ft 7in'], dtype='<U8')

In [8]:
# Tally how many filtered records fall under each category label.
categoryCounts = defaultdict(int)
for record in modData:
    categoryCounts[record['category']] += 1
# Only categories seen more than 500 times get their own feature slot.
categories = [name for name, seen in categoryCounts.items() if seen > 500]
# Number the kept categories consecutively, in the order they were first seen.
catID = {name: idx for idx, name in enumerate(categories)}
catID

{'romper': 0,
 'gown': 1,
 'sheath': 2,
 'dress': 3,
 'top': 4,
 'jumpsuit': 5,
 'sweater': 6,
 'jacket': 7,
 'shirtdress': 8,
 'maxi': 9,
 'shift': 10,
 'mini': 11,
 'skirt': 12,
 'blouse': 13,
 'coat': 14,
 'blazer': 15}

In [9]:
def feat(d):
    """Build the feature vector for one record.

    The vector has one slot per entry of the global ``catID`` mapping, set to
    1 at the index of the record's ``'category'`` when that category is in
    ``catID`` and 0 everywhere else, followed by a constant 1 as the final
    element.

    Raises:
        KeyError: if ``d`` has no ``'category'`` key.
    """
    onehot = [0] * len(catID)
    slot = catID.get(d['category'])
    # Categories absent from catID leave every slot at zero.
    if slot is not None:
        onehot[slot] = 1
    return onehot + [1]

In [10]:
# Feature matrix: one encoded row per filtered record.
X = list(map(feat, modData))
# Binary target: True when the record's integer rating is 8 or higher.
# NOTE(review): the sample record displayed near the top of this notebook has a
# 'quality' field but no 'rating' key, and it passes the modData filter - so
# this lookup looks like it raises KeyError on a fresh run. Confirm which field
# (and which threshold) is the intended label.
y = [8 <= int(d['rating']) for d in modData]

In [11]:
# Positional split with no shuffling: first half for training, the next
# quarter for validation, and the final quarter for testing.
N = len(X)
Xtrain, Xvalid, Xtest = X[:N//2], X[N//2:3*N//4], X[3*N//4:]
ytrain, yvalid, ytest = y[:N//2], y[N//2:3*N//4], y[3*N//4:]

In [12]:
# Logistic-regression classifier; the settings are spelled out one per line.
mod = linear_model.LogisticRegression(
    C=1000,
    class_weight='balanced',
    max_iter=1000,
)

In [13]:
# Peek at the first five training feature vectors.
Xtrain[:5]

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

In [14]:
# Peek at the first five labels.
y[:5]

[True, True, True, True, True]

In [15]:
# Fit the classifier on the training portion of the split.
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1000, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Predict labels for the validation split with the fitted model.
ypredValid = mod.predict(Xvalid)

# validation

# Confusion-matrix counts, pairing each true label (a) with its prediction (b);
# True is treated as the positive class.
TP = sum([(a and b) for (a,b) in zip(yvalid, ypredValid)])
TN = sum([(not a and not b) for (a,b) in zip(yvalid, ypredValid)])
FP = sum([(not a and b) for (a,b) in zip(yvalid, ypredValid)])
FN = sum([(a and not b) for (a,b) in zip(yvalid, ypredValid)])

# Per-class recall: share of actual positives / actual negatives predicted correctly.
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

# Balanced error rate: one minus the mean of the two per-class recalls.
BER = 1 - 0.5*(TPR + TNR)
print("Accuracy is:", (TP+TN)/len(yvalid))
# Read C from the model itself so the report cannot drift from the value the
# classifier was actually built with.
print("C = " + str(mod.C) + "; validation BER = " + str(BER))

Accuracy is: 0.38501506806609165
C = 1000; validation BER = 0.4578911925680753
