In [1]:
import gzip
import random
from collections import defaultdict
import numpy as np
import json
from sklearn import linear_model

In [2]:
def parseData(fname):
  """Yield one parsed JSON value per meaningful line of the file `fname`.

  The file is read as JSON Lines: each line is decoded independently with
  `json.loads`. Blank lines and lines whose whole content is the JSON
  literal ``null`` are skipped, so consumers never receive `None`.

  Args:
    fname: path of the file to read.

  Yields:
    The decoded object for each remaining line, in file order.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if a non-skipped line is not valid JSON.
  """
  # The context manager closes the handle once iteration finishes
  # (or when the generator itself is closed).
  with open(fname) as f:
    for l in f:
      # Lines keep their trailing newline, so strip before comparing.
      l = l.strip()
      # Compare by value: `is "null"` checks object identity and is never
      # true for a string read from a file, so the skip would not happen.
      if not l or l == "null": continue
      yield json.loads(l)

In [3]:
# Materialize every parsed record in memory so later cells can index and
# iterate over the dataset repeatedly.
print("Reading data...")
data = [record for record in parseData("renttherunway_final_data.json")]
print("done")

Reading data...
done


In [4]:
# Two inverse lookup tables built in one pass over the records:
#   usersPerItem[item_id] -> set of user_ids seen with that item
#   itemsPerUser[user_id] -> set of item_ids seen with that user
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
for d in data:
    userID = d['user_id']
    itemID = d['item_id']
    itemsPerUser[userID].add(itemID)
    usersPerItem[itemID].add(userID)

In [5]:
# Display the first record to see which fields a review carries.
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [6]:
# Total number of records loaded.
len(data)


192544

In [7]:
# Keep only the records that actually carry a rating; the label built later
# parses this field with int(), which would fail on None.
modData = list(filter(lambda rec: rec['rating'] is not None, data))

In [8]:
# Count how many rated records fall in each category.
categoryCounts = defaultdict(int)
for d in modData:
    categoryCounts[d['category']] += 1

# Keep categories with more than 500 records, in first-seen order, and give
# each one a consecutive integer index (its one-hot position).
categories = [c for c, count in categoryCounts.items() if count > 500]
catID = {c: position for position, c in enumerate(categories)}
catID

{'romper': 0,
 'gown': 1,
 'sheath': 2,
 'dress': 3,
 'top': 4,
 'jumpsuit': 5,
 'sweater': 6,
 'jacket': 7,
 'shirtdress': 8,
 'maxi': 9,
 'shift': 10,
 'mini': 11,
 'skirt': 12,
 'blouse': 13,
 'coat': 14,
 'blazer': 15}

In [9]:
def feat(d, categoryIndex=None):
    """Encode a record's category as a one-hot vector followed by a constant 1.

    Args:
        d: record dict; only ``d['category']`` is read.
        categoryIndex: optional mapping from category name to its position
            in the one-hot part. Defaults to the notebook-level ``catID``.

    Returns:
        A list of ``len(categoryIndex) + 1`` ints. The slot for the record's
        category is 1 when that category is in the mapping; otherwise the
        one-hot part is all zeros. The final element is always 1
        (presumably a bias/offset term -- confirm against the model setup).

    Raises:
        KeyError: if ``d`` has no ``'category'`` key.
    """
    # Resolve the default at call time so the current catID is always used.
    if categoryIndex is None:
        categoryIndex = catID
    vector = [0] * len(categoryIndex)
    position = categoryIndex.get(d['category'])
    if position is not None:
        vector[position] = 1
    return vector + [1]

In [10]:
# Feature matrix: one feature row per rated record.
X = list(map(feat, modData))
# Binary label: True when the rating, parsed as an integer, is 8 or higher.
y = [8 <= int(rec['rating']) for rec in modData]

In [11]:
# Sequential split in the existing record order (no shuffling):
# first half -> train, next quarter -> validation, final quarter -> test.
N = len(X)
trainEnd = N // 2
validEnd = 3 * N // 4

Xtrain, ytrain = X[:trainEnd], y[:trainEnd]
Xvalid, yvalid = X[trainEnd:validEnd], y[trainEnd:validEnd]
Xtest, ytest = X[validEnd:], y[validEnd:]

In [12]:
# Logistic regression classifier for the binary rating label.
mod = linear_model.LogisticRegression(
    C=1000,                   # inverse regularization strength: large C = weak penalty
    class_weight='balanced',  # reweight classes inversely to their frequency
    max_iter=1000,            # iteration cap for the solver
)

In [13]:
# Sanity check: show the first five training feature rows.
Xtrain[:5]

[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]

In [14]:
# Sanity check: show the first five labels.
y[:5]

[True, True, True, True, True]

In [15]:
# Fit the classifier on the training split only; validation and test rows stay held out.
mod.fit(Xtrain,ytrain)

LogisticRegression(C=1000, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
# Predict labels for the held-out validation split.
ypredValid = mod.predict(Xvalid)

# validation

# Tally the four confusion-matrix cells in a single pass over
# (true label, predicted label) pairs.
TP = TN = FP = FN = 0
for actual, predicted in zip(yvalid, ypredValid):
    if actual and predicted:
        TP += 1
    elif actual:
        FN += 1
    elif predicted:
        FP += 1
    else:
        TN += 1

# True positive / true negative rates. These divide by the number of actual
# positives / negatives, so each class must occur in the validation labels.
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

# Balanced error rate: one minus the mean of the two per-class rates.
BER = 1 - 0.5 * (TPR + TNR)
accuracy = (TP + TN) / len(yvalid)
print("Accuracy is:", accuracy)
print("C = 1000" + "; validation BER = " + str(BER))

Accuracy is: 0.38501506806609165
C = 1000; validation BER = 0.4578911925680753
