In [1]:
import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [2]:
# Make sure the VADER lexicon is available locally before the sentiment
# analyzer is built; the log below shows it is skipped when already up-to-date.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zmc/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
# Relative path to the input CSV of rental reviews.
rent_csv_file = 'data/renttherunway_final_data.csv' 
# mete_csv_file = 'modcloth_final_data.csv' 

# Load the reviews into a DataFrame. The second dataset's loading lines are
# commented out and play no part in this notebook.
rent_data = pd.read_csv(rent_csv_file)
# mete_data = pd.read_csv(mete_csv_file)

In [4]:
# Keep only rows that have both body measurements; height and weight are the
# inputs to every derived feature computed later.
rent_data = rent_data.dropna(subset=['height', 'weight'])

In [5]:
def height_to_cm(height):
    """Convert a feet/inches height string (e.g. ``5' 8"``) to centimetres.

    Args:
        height: Raw height value. Only strings that start with the pattern
            ``<feet>' <inches>"`` are understood.

    Returns:
        The height in centimetres as a float, or ``None`` when ``height`` is
        not a string or does not match the expected pattern.
    """
    if isinstance(height, str):
        parsed = re.match(r"(\d+)' (\d+)\"", height)
        if parsed is not None:
            feet, inches = (int(part) for part in parsed.groups())
            # 1 ft = 30.48 cm, 1 in = 2.54 cm.
            return feet * 30.48 + inches * 2.54
    return None

In [6]:
def calculate_bmi(weight_kg, height_cm):
    """Compute body-mass index (kg / m^2) from weight in kg and height in cm.

    Args:
        weight_kg: Body weight in kilograms; ``None`` or NaN means missing.
        height_cm: Body height in centimetres; ``None`` or NaN means missing.

    Returns:
        The BMI as a float, or ``None`` when either input is missing or the
        height is not strictly positive.
    """
    # pd.isna covers None as well as the NaN that pandas stores for missing
    # values in float columns, which an ``is None`` test alone lets through.
    if pd.isna(weight_kg) or pd.isna(height_cm):
        return None
    # Reject explicitly: NumPy floats do not raise ZeroDivisionError on a zero
    # divisor (they produce inf/nan), so the exception alone is not a guard.
    if height_cm <= 0:
        return None
    height_m = height_cm / 100
    try:
        return weight_kg / (height_m ** 2)
    except ZeroDivisionError:
        # Height so small that its square underflows to 0.0.
        return None



In [7]:
# One shared VADER analyzer instance, reused for every review.
sia = SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the VADER compound polarity score for ``text``.

    Args:
        text: Review text; a missing value (as judged by ``pd.isna``) is
            passed through as missing.

    Returns:
        The ``'compound'`` entry of ``sia.polarity_scores(text)``, or ``None``
        when ``text`` is missing.
    """
    if not pd.isna(text):
        return sia.polarity_scores(text)['compound']
    return None


# Score every review; rows without review text end up with a missing score.
rent_data['sentiment_score'] = rent_data['review_text'].apply(analyze_sentiment)





In [8]:
def sentiment_label(score):
    """Bucket a compound sentiment score into a categorical label.

    Args:
        score: Compound sentiment score, or ``None`` when unavailable.

    Returns:
        ``'positive'`` when ``score`` is above 0.05, ``'negative'`` when it is
        below -0.05, and ``'neutral'`` otherwise (including ``None`` and NaN,
        for which both comparisons are false).
    """
    if score is not None:
        if score > 0.05:
            return 'positive'
        if score < -0.05:
            return 'negative'
    return 'neutral'


In [9]:
# Parse the feet/inches height strings into centimetres; values that
# height_to_cm cannot parse come back as missing.
rent_data['height_cm'] = rent_data['height'].apply(height_to_cm)
# Strip the 'lbs' suffix, then convert pounds to kilograms (factor 0.453592).
rent_data['weight_kg'] = rent_data['weight'].str.replace('lbs', '').astype(float) * 0.453592
# Row-wise BMI from the two metric columns derived just above.
rent_data['bmi'] = rent_data.apply(lambda row: calculate_bmi(row['weight_kg'], row['height_cm']), axis=1)
# Bucket the compound sentiment score into positive / negative / neutral.
rent_data['sentiment_label'] = rent_data['sentiment_score'].apply(sentiment_label)

In [10]:
# Drop the raw string measurements now that height_cm / weight_kg replace them.
# Note: the feature-derivation cell reads these columns, so it cannot be re-run
# after this one without reloading the data.
rent_data = rent_data.drop(columns = ['weight', 'height'])


In [11]:
# Display the enriched frame to eyeball the derived columns
# (sentiment_score, height_cm, weight_kg, bmi, sentiment_label).
rent_data

Unnamed: 0,fit,user_id,bust size,item_id,rating,rented for,review_text,body type,review_summary,category,size,age,review_date,sentiment_score,height_cm,weight_kg,bmi,sentiment_label
0,fit,420272,34d,2260466,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,14,28.0,"April 20, 2016",0.9453,172.72,62.142104,20.830548,positive
1,fit,273551,34b,153475,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,12,36.0,"June 18, 2013",0.8991,167.64,59.874144,21.305121,positive
3,fit,909926,34c,126335,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,8,34.0,"February 12, 2014",0.9559,165.10,61.234920,22.464927,positive
4,fit,151944,34b,616682,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,12,27.0,"September 26, 2016",0.6948,175.26,65.770840,21.412520,positive
5,fit,734848,32b,364092,8.0,date,Didn't actually wear it. It fit perfectly. The...,athletic,Traditional with a touch a sass,dress,8,45.0,"April 30, 2016",0.8910,172.72,62.595696,20.982596,positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192539,fit,66386,34dd,2252812,10.0,work,Fit like a glove!,hourglass,LOVE IT!!! First Item Im thinking of buying!,jumpsuit,8,42.0,"May 18, 2016",0.6476,175.26,63.502880,20.674157,positive
192540,fit,118398,32c,682043,10.0,work,The pattern contrast on this dress is really s...,petite,LOVE it!,dress,4,29.0,"September 30, 2016",0.8951,154.94,45.359200,18.894625,positive
192541,fit,47002,36a,683251,6.0,everyday,"Like the other DVF wraps, the fit on this is f...",straight & narrow,"Loud patterning, flattering fit",dress,8,31.0,"March 4, 2016",0.4072,172.72,61.234920,20.526452,positive
192542,fit,961120,36c,126335,10.0,wedding,This dress was PERFECTION. it looked incredib...,pear,loved this dress it was comfortable and photog...,dress,16,31.0,"November 25, 2015",0.9623,167.64,74.842680,26.631402,positive


In [16]:
# Inspect the reviews labelled negative by the sentiment bucketing.
# NOTE(review): several rows in the output below have rating 10.0 yet a
# negative label, so the text-only score does not always track the rating.
rent_data[rent_data["sentiment_label"] == "negative"]

Unnamed: 0,fit,user_id,bust size,item_id,rating,rented for,review_text,body type,review_summary,category,size,age,review_date,sentiment_score,height_cm,weight_kg,bmi,sentiment_label
10,small,185966,34b,1077123,8.0,party,The dress arrived with a small hole in the bea...,athletic,It was fun to wear a dress I wouldn't normally...,dress,12,33.0,"January 2, 2018",-0.7853,160.02,61.234920,23.913912,negative
47,fit,804645,36d,1312996,10.0,formal affair,The side zipper was being difficult due to the...,hourglass,Everyone complimented me on the gown. It look...,gown,28,48.0,"January 19, 2015",-0.1901,172.72,86.182480,28.889081,negative
57,fit,880778,36d,1726756,10.0,party,The dress was a little tight across the chest ...,full bust,I rented this for my 35th birthday dinner. It ...,dress,20,37.0,"November 18, 2015",-0.1177,175.26,78.925008,25.695024,negative
102,fit,172378,34a,146684,10.0,other,"I wore this to my senior prom and honestly, I ...",pear,The ease of the transaction and the satisfacti...,gown,4,22.0,"April 21, 2013",-0.6059,157.48,56.245408,22.679645,negative
179,fit,483798,36d,1526552,4.0,work,this was not flattering on my curves. Didn't ...,pear,Not cute on an hourglass shape,dress,24,41.0,"October 2, 2017",-0.3395,170.18,80.739376,27.878432,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192273,large,449266,,2598896,8.0,vacation,The reason this shirt works is because it's he...,,Fun shirt for going out,top,8,33.0,"May 11, 2016",-0.1568,172.72,58.966960,19.766213,negative
192313,fit,367858,36d,1675905,6.0,party,Wore this for my engagement party and wish I w...,hourglass,Not for large busts!,dress,24,30.0,"September 19, 2014",-0.0816,175.26,72.574720,23.627608,negative
192380,fit,856591,34d,2055795,10.0,everyday,It is super cute but arrived damaged. Didn't ...,pear,It is so cute and classy but it was damaged. ...,jacket,14,42.0,"November 2, 2016",-0.1027,167.64,71.213944,25.340182,negative
192461,fit,546344,34c,1184628,8.0,everyday,Runs true to size. Wore it to a pool party. N...,athletic,Cute and Casual,dress,12,37.0,"June 7, 2017",-0.3468,170.18,62.595696,21.613616,negative


In [17]:
# Ordinal encoding of the fit label used as the classification target.
fit_mapping = {'small': 0, 'fit': 1, 'large': 2}

# Modelling frame: the target plus four numeric features, restricted to rows
# where every one of them is present, with the target encoded as an integer.
data = (
    rent_data[['fit', 'height_cm', 'weight_kg', 'bmi', 'sentiment_score']]
    .dropna()
    .assign(fit=lambda frame: frame['fit'].map(fit_mapping))
)

# Features are every column except the target; the target is the encoded fit.
X = data.drop(columns='fit')
y = data['fit']

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [19]:
# Baseline model: a random forest with default hyper-parameters and a fixed
# seed, trained on the training split (fit returns the fitted estimator).
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted fit classes for the held-out rows.
y_pred = clf.predict(X_test)


In [20]:
# Per-class precision / recall / F1 for the random forest on the held-out
# rows; target_names follows the 0/1/2 order of the fit encoding.
print(classification_report(y_test, y_pred, target_names=['small', 'fit', 'large']))


              precision    recall  f1-score   support

       small       0.15      0.11      0.13      4324
         fit       0.74      0.81      0.77     23950
       large       0.14      0.10      0.12      4140

    accuracy                           0.63     32414
   macro avg       0.34      0.34      0.34     32414
weighted avg       0.59      0.63      0.60     32414



In [21]:
# Second experiment: gradient-boosted trees, with user_id and item_id added to
# the numeric features. The frame is taken from rent_data directly, so rows
# with missing feature values are not dropped here.
# NOTE: this cell rebinds X, y and the train/test split used by the
# random-forest cells above.
X = rent_data[['user_id', 'item_id', 'height_cm', 'weight_kg', 'bmi', 'sentiment_score']]
y = rent_data['fit'].map({'small': 0, 'fit': 1, 'large': 2})


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)
xgb_model.fit(X_train, y_train)

pred = xgb_model.predict(X_test)


# Evaluate the XGBoost predictions (`pred`). The original passed `y_pred`,
# which holds the random forest's predictions for a different test split, so
# the report did not describe this model.
print(classification_report(y_test, pred, target_names=['small', 'fit', 'large']))


              precision    recall  f1-score   support

       small       0.37      0.01      0.01      4297
         fit       0.74      1.00      0.85     23916
       large       0.37      0.01      0.02      4211

    accuracy                           0.74     32424
   macro avg       0.49      0.34      0.29     32424
weighted avg       0.64      0.74      0.63     32424

