In [None]:
import gzip
import random
from collections import defaultdict
import numpy as np
import json
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
import itertools

In [None]:
def parseData(fname):
    """Yield one parsed JSON object per non-empty line of `fname`.

    Parameters
    ----------
    fname : str or path-like
        Path to a JSON-lines file (one JSON document per line).

    Yields
    ------
    object
        The decoded value of each line, in file order.

    Notes
    -----
    Blank lines and lines consisting solely of the JSON literal ``null`` are
    skipped.  The previous check used ``l is "null"``, an identity comparison
    against a literal that is never true for a line read from a file (and the
    line still carried its trailing newline), so ``null`` rows leaked through
    as ``None`` records.
    """
    # `with` guarantees the handle is closed even if the consumer stops early
    # or json.loads raises on a malformed line.
    with open(fname) as f:
        for l in f:
            l = l.strip()
            if not l or l == "null":
                continue
            yield json.loads(l)

In [None]:
# Materialise every record up front so later cells can index and re-scan it.
print("Reading data...")
data = [record for record in parseData("modcloth_final_data.json")]
print("done")

In [None]:
# Show the first parsed record to see which fields a record carries.
data[0]

In [None]:
# Total number of records loaded from the file.
len(data)

In [None]:
# Number of records that carry a "cup size" field.
sum(1 for record in data if "cup size" in record)

In [None]:
# Number of records that carry a "bra size" field.
sum(1 for record in data if "bra size" in record)

In [None]:
# Number of records that carry a "height" field.
sum(1 for record in data if "height" in record)

In [None]:
# Keep only records that have every field used as a feature and whose
# size is at most 22.
requiredFields = ("cup size", "bra size", "height")
modData = [
    d for d in data
    if all(field in d for field in requiredFields) and d['size'] <= 22
]
len(modData)

In [None]:
# Number of distinct cup sizes among the filtered records.
np.unique([d['cup size'] for d in modData]).size

In [None]:
# Number of distinct bra sizes among the filtered records.
np.unique([d['bra size'] for d in modData]).size

In [None]:
#np.unique([d['hips'] for d in modData])

In [None]:
# List the distinct raw height strings to see the format before parsing them.
rawHeights = [d['height'] for d in modData]
np.unique(rawHeights)

In [None]:
# Convert each record's height string into total inches, stored as
# d['modHeight'].  The parsing expects a "<feet>ft" token optionally followed
# by an "<inches>in" token, e.g. "5ft 6in" or "5ft".
for d in modData:
    parts = d['height'].strip().split()
    ft = int(parts[0].split('ft')[0])
    try:
        inches = int(parts[1].split('in')[0])
    except (IndexError, ValueError):
        # No inches token (IndexError) or a non-numeric one (ValueError):
        # treat as zero inches.  Catching only these two, instead of a bare
        # `except:`, lets genuine problems (KeyboardInterrupt, typos) surface.
        inches = 0
    d['modHeight'] = ft * 12 + inches

In [None]:
# for d in modData:
#     d['hips'] = int((float)(d['hips']))
# modData[0]

In [None]:
# Sorted distinct cup sizes, and a lookup from each cup size to its
# position in that ordering (used as the one-hot column index).
catCups = np.unique([d['cup size'] for d in modData])
catCupsID = {cup: idx for idx, cup in enumerate(catCups)}

In [None]:
# Sorted distinct bra sizes, and a lookup from each bra size to its
# position in that ordering (used as the one-hot column index).
catBra = np.unique([d['bra size'] for d in modData])
catBraID = {bra: idx for idx, bra in enumerate(catBra)}

In [None]:
def featCups(d):
    """Return the one-hot encoding of d['cup size'] over the keys of catCupsID.

    The result is a list of len(catCupsID) ints with a single 1 at the index
    catCupsID assigns to the record's cup size.  Raises KeyError if the cup
    size is not in catCupsID.
    """
    hot = catCupsID[d['cup size']]
    return [1 if idx == hot else 0 for idx in range(len(catCupsID))]

In [None]:
def featBra(d):
    """Return the one-hot encoding of d['bra size'] over the keys of catBraID.

    The result is a list of len(catBraID) ints with a single 1 at the index
    catBraID assigns to the record's bra size.  Raises KeyError if the bra
    size is not in catBraID.
    """
    hot = catBraID[d['bra size']]
    return [1 if idx == hot else 0 for idx in range(len(catBraID))]

In [None]:
# Bucket the numeric size into six ordinal classes stored as d['catSize']:
#   <=2 -> 0, <=6 -> 1, <=10 -> 2, <=14 -> 3, <=18 -> 4, anything else -> 5.
sizeBounds = (2, 6, 10, 14, 18)
for d in modData:
    d['catSize'] = next(
        (bucket for bucket, upper in enumerate(sizeBounds) if d['size'] <= upper),
        len(sizeBounds),
    )

In [None]:
# Feature row = constant 1, one-hot cup size, one-hot bra size, height in inches.
# Target = the size bucket computed above.
X = []
y = []
for d in modData:
    X.append([1] + featCups(d) + featBra(d) + [d['modHeight']])
    y.append(d['catSize'])

In [None]:
# Sequential 50% / 25% / 25% split into train / validation / test.
# The rows are taken in their existing order (no shuffling).
N = len(X)
trainEnd, validEnd = N // 2, 3 * N // 4
Xtrain, Xvalid, Xtest = X[:trainEnd], X[trainEnd:validEnd], X[validEnd:]
ytrain, yvalid, ytest = y[:trainEnd], y[trainEnd:validEnd], y[validEnd:]

In [None]:
def _topTwoAccuracy(model, features, labels):
    """Return (top-1, top-2) accuracy of `model` on one split, as fractions."""
    labels = np.asarray(labels)
    total = len(labels)

    predicted = np.asarray(model.predict(features))
    top1Hits = int(np.sum(predicted == labels))

    proba = np.asarray(model.predict_proba(features))
    # Rank on the raw probabilities.  Sorting the negated values with a stable
    # sort gives descending order with exact ties going to the lower column.
    best2 = np.argsort(-proba, axis=1, kind="stable")[:, :2]
    # Translate column positions into class labels; fall back to the column
    # index for models that do not expose `classes_`.
    classes = np.asarray(getattr(model, "classes_", np.arange(proba.shape[1])))
    top2Hits = int(np.sum((classes[best2] == labels[:, None]).any(axis=1)))

    return top1Hits / total, top2Hits / total


def printAcc(model, splits=None):
    """Print top-1 and top-2 accuracy of a fitted classifier for each split.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.  If it exposes
        ``classes_``, that array maps probability columns to class labels.
    splits : dict, optional
        Mapping ``name -> (features, labels)``.  When omitted, the notebook's
        validation and test sets (``Xvalid``/``yvalid``, ``Xtest``/``ytest``)
        are used, in that order.

    Notes
    -----
    Fixes relative to the earlier version:
    * test-set top-1 accuracy is divided by the size of the test set (it was
      divided by ``len(yvalid)``);
    * the two most likely classes are chosen from the unrounded
      probabilities (rounding to 2 decimals created artificial ties);
    * probability columns are mapped through ``model.classes_`` instead of
      being assumed equal to the class labels.
    """
    if splits is None:
        splits = {"validation": (Xvalid, yvalid), "test": (Xtest, ytest)}

    for name, (features, labels) in splits.items():
        top1, top2 = _topTwoAccuracy(model, features, labels)
        print("In {} set, accuracy when predict one size is:".format(name),
              format(top1 * 100, '.2f'), "%", sep="")
        print("In {} set, accuracy when predict two size is:".format(name),
              format(top2 * 100, '.2f'), "%", sep="")
    

### Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training half.
modelNB = GaussianNB().fit(Xtrain, ytrain)
modelNB

In [None]:
# Report one-size and two-size accuracy for the Naive Bayes model.
printAcc(modelNB)

### Logistic Regression

In [None]:
# Logistic regression with C=10**4 and no fitted intercept: the feature
# rows already start with a constant 1 column.
modelLR = linear_model.LogisticRegression(
    C=10**4,
    max_iter=10**4,
    fit_intercept=False,
).fit(Xtrain, ytrain)
modelLR

In [None]:
# Report one-size and two-size accuracy for the logistic regression model.
printAcc(modelLR)

In [None]:
# sum([d <= 1 for d in abs(ytest - ypredTest)])/len(ytest)

### SVM

In [None]:
# Linear-kernel SVM; probability=True so that predict_proba is available
# for the two-size accuracy computed by printAcc.
svclassifier = SVC(kernel='linear', probability=True).fit(Xtrain, ytrain)
svclassifier

In [None]:
# One-size accuracy of the SVM on the validation set.
ypredValid = svclassifier.predict(Xvalid)
T = sum((a == b) for (a, b) in zip(yvalid, ypredValid))
validAccuracy = format(T/len(yvalid)*100, '.2f')
print("In validation set, accuracy when predict one size is:", validAccuracy, "%", sep="")

In [None]:
# Report one-size and two-size accuracy for the SVM model.
printAcc(svclassifier)

In [None]:
# `ypredValid2` was never assigned anywhere in the notebook, so this cell
# raised NameError on a fresh kernel.  Compute it here so the cell is
# self-contained.
# NOTE(review): presumably these were the SVM's validation predictions, since
# the cell sits in the SVM section — confirm this is the intended model.
ypredValid2 = svclassifier.predict(Xvalid)
print(confusion_matrix(yvalid, ypredValid2))
print(classification_report(yvalid, ypredValid2))

In [None]:
# sum([d <= 1 for d in abs(yvalid - ypredValid2)])/len(yvalid)