In [84]:
import gzip
import random
from collections import defaultdict
import numpy as np
import json
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
def parseData(fname):
  """Yield one decoded JSON object per non-empty line of `fname`.

  Lines that are blank or contain only the JSON literal ``null`` are skipped.

  Parameters
  ----------
  fname : str or path-like
      Path to a text file holding one JSON document per line.

  Yields
  ------
  object
      The result of ``json.loads`` for each remaining line.

  Raises
  ------
  json.JSONDecodeError
      If a non-skipped line is not valid JSON.
  """
  # `with` guarantees the handle is closed even if the consumer stops early
  # or a line fails to decode.
  with open(fname) as f:
    for l in f:
      # Strip the trailing newline first; compare by value (`==`), not by
      # identity (`is`), otherwise the "null" check can never match.
      l = l.strip()
      if not l or l == "null":
        continue
      yield json.loads(l)

  if l is "null": continue


In [3]:
# Drain the parseData generator into a list so later cells can index it
# and iterate over it more than once.
print("Reading data...")
data = [record for record in parseData("modcloth_final_data.json")]
print("done")

Reading data...
done


In [4]:
# Show the first parsed record to see which fields a record carries.
data[0]

{'item_id': '123373',
 'waist': '29',
 'size': 7,
 'quality': 5,
 'cup size': 'd',
 'hips': '38',
 'bra size': '34',
 'category': 'new',
 'bust': '36',
 'height': '5ft 6in',
 'user_name': 'Emily',
 'length': 'just right',
 'fit': 'small',
 'user_id': '991571'}

In [5]:
# Total number of parsed records.
len(data)


82790

In [6]:
# Number of records that have a "cup size" key.
sum(1 for record in data if "cup size" in record)

76535

In [7]:
# Number of records that have a "bra size" key.
sum(1 for record in data if "bra size" in record)

76772

In [8]:
# Number of records that have a "hips" key.
sum(1 for record in data if "hips" in record)

56064

In [9]:
# Number of records that have a "bust" key.
sum(1 for record in data if "bust" in record)

11854

In [10]:
# Number of records that have a "height" key.
sum(1 for record in data if "height" in record)

81683

In [11]:
# Keep only records that carry every field used as a feature below and
# whose numeric size is at most 22. The kept dicts are the same objects
# as in `data`, not copies.
modData = [
    record
    for record in data
    if all(key in record for key in ("cup size", "bra size", "height"))
    and record['size'] <= 22
]
len(modData)

65848

In [12]:
# Number of distinct cup-size labels among the filtered records.
len({d['cup size'] for d in modData})

12

In [13]:
# Number of distinct bra-size labels among the filtered records.
len({d['bra size'] for d in modData})

11

In [15]:
#np.unique([d['hips'] for d in modData])

In [16]:
# Sorted array of the distinct raw height strings, to see the formats
# the height parser has to handle.
np.array(sorted({d['height'] for d in modData}))

array(['3ft', '3ft 11in', '3ft 2in', '3ft 3in', '3ft 4in', '3ft 6in',
       '4ft 10in', '4ft 11in', '4ft 2in', '4ft 4in', '4ft 5in', '4ft 7in',
       '4ft 8in', '4ft 9in', '5ft', '5ft 10in', '5ft 11in', '5ft 1in',
       '5ft 2in', '5ft 3in', '5ft 4in', '5ft 5in', '5ft 6in', '5ft 7in',
       '5ft 8in', '5ft 9in', '6ft', '6ft 1in', '6ft 2in', '6ft 3in',
       '6ft 4in', '6ft 5in', '6ft 6in', '6ft 8in', '7ft 11in', '7ft 3in',
       '7ft 5in', '7ft 7in'], dtype='<U8')

In [17]:
# Convert each record's height string ("5ft 6in" or "5ft") to total inches
# and store it under 'modHeight'.
#
# The inches part is optional, so its absence is tested explicitly instead
# of relying on a bare `except:` (which would also hide genuinely malformed
# values and swallow KeyboardInterrupt). A malformed number now raises
# ValueError rather than silently becoming 0.
for d in modData:
    parts = d['height'].strip().split()
    ft = int(parts[0].split('ft')[0])
    # Heights such as "5ft" have no inches token; treat that as 0 inches.
    inches = int(parts[1].split('in')[0]) if len(parts) > 1 else 0
    d['modHeight'] = ft * 12 + inches

In [18]:
# for d in modData:
#     d['hips'] = int((float)(d['hips']))
# modData[0]

KeyError: 'hips'

In [19]:
# Map each distinct cup-size label (in np.unique's sorted order) to its
# position, for use as a one-hot column index.
catCups = np.unique([d['cup size'] for d in modData])
catCupsID = {cup: idx for idx, cup in enumerate(catCups)}
catCupsID

{'a': 0,
 'aa': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'dd/e': 5,
 'ddd/f': 6,
 'dddd/g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11}

In [20]:
# Map each distinct bra-size label (in np.unique's sorted order) to its
# position, for use as a one-hot column index.
catBra = np.unique([d['bra size'] for d in modData])
catBraID = {band: idx for idx, band in enumerate(catBra)}
catBraID

{'28': 0,
 '30': 1,
 '32': 2,
 '34': 3,
 '36': 4,
 '38': 5,
 '40': 6,
 '42': 7,
 '44': 8,
 '46': 9,
 '48': 10}

In [21]:
def featCups(d, catIDs=None):
    """Return a one-hot list encoding ``d['cup size']``.

    Parameters
    ----------
    d : dict
        Record containing a 'cup size' key.
    catIDs : dict, optional
        Mapping from cup-size label to column index. Defaults to the
        notebook-level ``catCupsID`` mapping, looked up at call time.

    Returns
    -------
    list of int
        A list of ``len(catIDs)`` zeros with a single 1 at the label's index.

    Raises
    ------
    KeyError
        If the record has no 'cup size' or its label is not in the mapping.
    """
    if catIDs is None:
        catIDs = catCupsID
    feat = [0] * len(catIDs)
    feat[catIDs[d['cup size']]] = 1
    return feat

In [22]:
def featBra(d, catIDs=None):
    """Return a one-hot list encoding ``d['bra size']``.

    Parameters
    ----------
    d : dict
        Record containing a 'bra size' key.
    catIDs : dict, optional
        Mapping from bra-size label to column index. Defaults to the
        notebook-level ``catBraID`` mapping, looked up at call time.

    Returns
    -------
    list of int
        A list of ``len(catIDs)`` zeros with a single 1 at the label's index.

    Raises
    ------
    KeyError
        If the record has no 'bra size' or its label is not in the mapping.
    """
    if catIDs is None:
        catIDs = catBraID
    feat = [0] * len(catIDs)
    feat[catIDs[d['bra size']]] = 1
    return feat

In [65]:
# Bucket the numeric size into six ordinal bins and store the bin index
# under 'catSize': <=2 -> 0, <=6 -> 1, <=10 -> 2, <=14 -> 3, <=18 -> 4,
# anything larger -> 5.
for d in modData:
    size = d['size']
    # The bin is the position of the first upper bound the size fits under;
    # if it fits under none of them it falls in the last bin.
    d['catSize'] = next(
        (idx for idx, bound in enumerate((2, 6, 10, 14, 18)) if size <= bound),
        5,
    )
modData[0]

{'item_id': '123373',
 'waist': '29',
 'size': 7,
 'quality': 5,
 'cup size': 'd',
 'hips': 38,
 'bra size': '34',
 'category': 'new',
 'bust': '36',
 'height': '5ft 6in',
 'user_name': 'Emily',
 'length': 'just right',
 'fit': 'small',
 'user_id': '991571',
 'modHeight': 66,
 'catSize': 2}

In [66]:
# Feature row layout: constant 1, one-hot cup size, one-hot bra size,
# height in inches. The label is the size bin.
X = [
    [1, *featCups(d), *featBra(d), d['modHeight']]
    for d in modData
]
y = [d['catSize'] for d in modData]

In [67]:
# Number of columns in a feature row.
len(X[0])

25

In [68]:
# Positional 50% / 25% / 25% train / validation / test split.
# NOTE(review): rows are taken in file order with no shuffling — confirm the
# source file is not ordered in a way that biases the three partitions.
N = len(X)
Xtrain, Xvalid, Xtest = X[:N//2], X[N//2:3*N//4], X[3*N//4:]
ytrain, yvalid, ytest = y[:N//2], y[N//2:3*N//4], y[3*N//4:]

In [145]:
# Logistic-regression classifier over the size bins. Each feature row
# already starts with a constant 1, so the estimator's own intercept is
# switched off.
mod = linear_model.LogisticRegression(
    C=10**4,
    class_weight='balanced',
    max_iter=10**4,
    fit_intercept=False,
)

In [70]:
# First five training feature rows.
Xtrain[:5]

[[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 66],
 [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 62],
 [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 67],
 [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 62],
 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 64]]

In [71]:
# First five size-bin labels.
y[:5]

[2, 3, 2, 4, 3]

In [146]:
# Fit the logistic-regression model on the training partition.
mod.fit(Xtrain,ytrain)

LogisticRegression(C=10000, class_weight='balanced', fit_intercept=False,
                   max_iter=10000)

In [147]:
# Predict size bins for the validation partition.
ypredValid = mod.predict(Xvalid)

# validation

# Despite its name, TP counts every validation example whose predicted bin
# equals the true bin, i.e. the numerator of plain accuracy.
TP = sum([(a == b) for (a,b) in zip(yvalid, ypredValid)])
# The commented-out lines below are binary-classification metrics
# (TN/FP/FN/TPR/TNR/BER) based on label truthiness; they are disabled.
# TN = sum([(not a and not b) for (a,b) in zip(yvalid, ypredValid)])
# FP = sum([(not a and b) for (a,b) in zip(yvalid, ypredValid)])
# FN = sum([(a and not b) for (a,b) in zip(yvalid, ypredValid)])

# TPR = TP / (TP + FN)
# TNR = TN / (TN + FP)

# BER = 1 - 0.5*(TPR + TNR)
# print("Accuracy is:", (TP+TN)/len(yvalid))
# print("C = 1000" + "; validation BER = " + str(BER))

In [148]:
# Sorted array of the distinct 'fit' labels present in the filtered records.
np.array(sorted({d['fit'] for d in modData}))


array(['fit', 'large', 'small'], dtype='<U5')

In [149]:
# Element-wise exact-match mask on the validation set, then the fraction
# of matches (exact-bin accuracy).
accuracy = (ypredValid == yvalid)
accuracy.sum() / accuracy.size

0.4170817640626898

In [150]:
# First ten true validation labels, for eyeballing against the predictions.
yvalid[:10]

[4, 3, 3, 4, 2, 4, 2, 3, 1, 0]

In [151]:
# First ten predicted validation labels.
ypredValid[:10]

array([3, 3, 3, 4, 3, 3, 2, 2, 0, 0])

In [152]:
# Fraction of validation predictions that land at most one bin away from
# the true bin.
(np.abs(np.asarray(yvalid) - ypredValid) <= 1).sum() / len(yvalid)

0.8790547928562751

In [153]:
# Predict size bins for the held-out test partition with the logistic model.
ypredTest = mod.predict(Xtest)

In [154]:
# Element-wise exact-match mask on the test set, then the fraction of
# matches (exact-bin accuracy).
acc_test = (ypredTest == ytest)
acc_test.sum() / acc_test.size

0.40523630178593123

In [155]:
# Fraction of test predictions that land at most one bin away from the
# true bin.
(np.abs(np.asarray(ytest) - ypredTest) <= 1).sum() / len(ytest)

0.8721904993317945

In [99]:
# Second model: a support-vector classifier with a linear kernel, fitted on
# the same training partition as the logistic-regression model.
svclassifier = SVC(kernel='linear')
svclassifier.fit(Xtrain, ytrain)

SVC(kernel='linear')

In [100]:
# Predict size bins for the validation partition with the SVC.
ypredValid2 = svclassifier.predict(Xvalid)

In [101]:
# Exact-match mask for the SVC on the validation set, then the fraction of
# matches.
accuracy2 = (ypredValid2 == yvalid)
accuracy2.sum() / accuracy2.size

0.4732717774268011

In [102]:
# Per-class breakdown of the SVC's validation predictions:
# confusion matrix first, then precision / recall / F1 per size bin.
print(confusion_matrix(yvalid,ypredValid2))
print(classification_report(yvalid,ypredValid2))

[[   0  258  101   14    0    1]
 [   1 1644 1539  380    0    6]
 [   0  737 2140 1858    0   34]
 [   0  140 1004 3111    1  261]
 [   0   18   84  882    3  286]
 [   0   13   51 1001    1  893]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       374
           1       0.59      0.46      0.52      3570
           2       0.44      0.45      0.44      4769
           3       0.43      0.69      0.53      4517
           4       0.60      0.00      0.00      1273
           5       0.60      0.46      0.52      1959

    accuracy                           0.47     16462
   macro avg       0.44      0.34      0.33     16462
weighted avg       0.49      0.47      0.45     16462



In [103]:
# Fraction of SVC validation predictions that land at most one bin away
# from the true bin.
(np.abs(np.asarray(yvalid) - ypredValid2) <= 1).sum() / len(yvalid)

0.8721904993317945

In [106]:
# SVC predictions on the held-out test partition (despite "Valid" in the
# variable name, the input here is Xtest).
ytestValid2 = svclassifier.predict(Xtest)

In [107]:
# Exact-match mask for the SVC on the test set, then the fraction of
# matches.
acc_test2 = (ytestValid2 == ytest)
acc_test2.sum() / acc_test2.size

0.4257684363989795

In [111]:
# Fraction of SVC test predictions that land at most one bin away from the
# true bin.
(np.abs(np.asarray(ytest) - ytestValid2) <= 1).sum() / len(ytestValid2)

0.8768679382821042