In [55]:
import gzip
import collections
from sklearn import linear_model
from sklearn import metrics
import random
import numpy as np
import string
import math
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from collections import defaultdict
import re

In [56]:
def readGz(path):
    for l in gzip.open(path, 'rt', encoding='utf-8'):
        if 'null' not in l:
            yield eval(l)

In [57]:
all_cloth = list(readGz("renttherunway_final_data.json.gz"))

In [58]:
all_cloth[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [59]:
len(all_cloth)

192462

In [60]:
for item in all_cloth:
    keys_with_spaces = list(item.keys())  # Extracting keys to modify while iterating
    for key in keys_with_spaces:
        if ' ' in key:
            new_key = key.replace(' ', '_')  # Replacing spaces with underscores
            item[new_key] = item.pop(key)  # Replace the key in the dictionary

In [61]:
all_cloth[133]

{'fit': 'large',
 'user_id': '649616',
 'item_id': '1840637',
 'weight': '161lbs',
 'rating': '8',
 'review_text': 'I wore this beautiful dress to a golf outing at a prestigious country club as an appearance with my husband. I love golfing in dresses because it gives you that polished look with out being stuffed into too hot shorts (the shoes are hot enough)  I am normally a medium, but per recommendations, I sized down to a small and it still fit very loose. I tied the bow around the back of the dress because I did not like how the dropped hem looked with the bow in the front. I have a pretty big bottom and hips, but this dress did a great job of dressing them up. I also stayed cool in this material for eight hours on the course!',
 'review_summary': 'This was the PERFECT outfit for having to be outside all day but still chic!!',
 'category': 'shirtdress',
 'height': '5\' 8"',
 'size': 8,
 'age': '29',
 'review_date': 'July 12, 2016',
 'bust_size': '34c',
 'rented_for': 'work',
 'body

In [62]:
#Data cleaning:
#1: We noticed that some clothing data between user and item is incomplete
cloths_key_len = []
for item in all_cloth:
    cloths_key_len.append(len(item.keys()))


In [63]:
#Incomplete clothing data for the item if the item has less than the standard 15 variables to describe it. 
unique_values = list(set(cloths_key_len))
print(unique_values)

[12, 13, 14, 15]


In [64]:
#Cleaned Dataset 
cloth_cleaned = []
for item in all_cloth:
    if len(item.keys()) == 15:
        cloth_cleaned.append(item)
len(cloth_cleaned)

146381

In [65]:
cloth_cleaned[0]

{'fit': 'fit',
 'user_id': '420272',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016',
 'bust_size': '34d',
 'rented_for': 'vacation',
 'body_type': 'hourglass'}

In [66]:
#User item pair for recommendation:
user_dict = defaultdict(list)
for item in cloth_cleaned:
    user_dict[item['user_id']].append(item['item_id'])

In [67]:
#Different occasions of clothes
cloth_occasions = []
for item in cloth_cleaned:
    cloth_occasions.append(item['rented_for'])

In [68]:
unique_occasions = list(set(cloth_occasions))
print(unique_occasions)

['party: cocktail', 'everyday', 'vacation', 'date', 'other', 'wedding', 'work', 'formal affair', 'party']


In [69]:
len(unique_occasions)

9

## Data Standardization

In [82]:
def standardize_data(data):
    """
    Standardizes the clothing dataset in place, focusing on bust size and height.

    Parameters:
    data (list of dict): List of dictionaries containing clothing item and user information.
    """
    for entry in data:
        # Standardizing bust size
        if 'bust_size' in entry and isinstance(entry['bust_size'], str):
            bust_match = re.match(r'(\d+)([a-zA-Z]+)', entry['bust_size'])
            if bust_match:
                entry['bra_size'] = int(bust_match.group(1))
                entry['cup_size'] = bust_match.group(2)

        # Converting height to inches
        if 'height' in entry and isinstance(entry['height'], str):
            height_match = re.match(r"(\d+)' (\d+)", entry['height'])
            if height_match:
                feet, inches = height_match.groups()
                entry['height_inches'] = int(feet) * 12 + int(inches)



In [83]:
standardize_data(cloth_cleaned)

In [84]:
cloth_cleaned[0]

{'fit': 'fit',
 'user_id': '420272',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': '10',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016',
 'bust_size': '34d',
 'rented_for': 'vacation',
 'body_type': 'hourglass',
 'bra_size': 34,
 'cup_size': 'd',
 'height_inches': 68}