In [1]:
import pandas as pd
df3 = pd.read_json('renttherunway_final_data.json', lines=True)
df3.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"


In [2]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight            29982
rating               82
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height              677
size                  0
age                 960
review_date           0
dtype: int64

In [3]:
columns_to_fill = ['rating', 'age']
df3[columns_to_fill] = df3[columns_to_fill].fillna(df3[columns_to_fill].mean())

In [4]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight            29982
rating                0
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height              677
size                  0
age                   0
review_date           0
dtype: int64

In [5]:
# Filling NaN in weight
df3['weight'] = df3['weight'].str.replace('lbs', '').astype(float)

In [6]:
# Filling weight with median
df3['weight'].fillna(df3['weight'].mean(), inplace=True)

In [7]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight                0
rating                0
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height              677
size                  0
age                   0
review_date           0
dtype: int64

In [8]:
# Filling NaN in height
# First, define a function to convert the height format to inches
def convert_to_inches(height_str):
    if pd.isnull(height_str):
        return height_str

    if "'" in str(height_str):  # Check if the value is a string containing "'"
        feet, inches = map(int, str(height_str).replace('"', '').split("'"))
        return feet * 12 + inches
    else:
        return float(height_str)

# Apply the conversion function to the 'height' column
df3['height'] = df3['height'].apply(convert_to_inches)

# Now, fill NaN values in the 'height' column with the mean or any other appropriate value
mean_height = df3['height'].mean()
df3['height'].fillna(mean_height, inplace=True)

In [9]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight                0
rating                0
rented for           10
review_text           0
body type         14637
review_summary        0
category              0
height                0
size                  0
age                   0
review_date           0
dtype: int64

In [10]:
# Filling NaN in body type
mode_body_type = df3['body type'].mode()[0]
df3['body type'].fillna(mode_body_type, inplace=True)

In [11]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18411
item_id               0
weight                0
rating                0
rented for           10
review_text           0
body type             0
review_summary        0
category              0
height                0
size                  0
age                   0
review_date           0
dtype: int64

In [12]:
# Dropping NaN in rented for
df3.dropna(subset=['rented for'], inplace=True)

In [13]:
df3.isna().sum()

fit                   0
user_id               0
bust size         18408
item_id               0
weight                0
rating                0
rented for            0
review_text           0
body type             0
review_summary        0
category              0
height                0
size                  0
age                   0
review_date           0
dtype: int64

In [14]:
df3.shape

(192534, 15)

In [15]:
# Filling NaN in bust size
mode_bust_size = df3['bust size'].mode()[0]
df3['bust size'].fillna(mode_bust_size, inplace=True)

In [16]:
df3.isna().sum()

fit               0
user_id           0
bust size         0
item_id           0
weight            0
rating            0
rented for        0
review_text       0
body type         0
review_summary    0
category          0
height            0
size              0
age               0
review_date       0
dtype: int64

In [17]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


To combine the rating and review_summary columns, you can create a new column, let's call it hybrid_rating, that incorporates both pieces of information. 

TextBlob is used for sentiment analysis, giving a polarity score ranging from -1 to 1. The sentiment score is added to the original rating column to create the hybrid_rating. This hybrid_rating column can then be used as your new rating in building the recommendation system. 

In [18]:
from textblob import TextBlob  # For sentiment analysis

# Calculate sentiment scores for review summaries
df3['sentiment_score'] = df3['review_summary'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Combine rating and sentiment to create hybrid_rating
df3['hybrid_rating'] = df3['rating'] + df3['sentiment_score']

# Display the modified DataFrame
df3[['rating', 'review_summary', 'sentiment_score', 'hybrid_rating']].head(5)

Unnamed: 0,rating,review_summary,sentiment_score,hybrid_rating
0,10.0,So many compliments!,0.625,10.625
1,10.0,I felt so glamourous!!!,0.0,10.0
2,10.0,It was a great time to celebrate the (almost) ...,0.525,10.525
3,8.0,Dress arrived on time and in perfect condition.,1.0,9.0
4,10.0,Was in love with this dress !!!,0.976562,10.976562


In [19]:
df3.head()

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,sentiment_score,hybrid_rating
0,fit,420272,34d,2260466,137.0,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,68.0,14,28.0,"April 20, 2016",0.625,10.625
1,fit,273551,34b,153475,132.0,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,66.0,12,36.0,"June 18, 2013",0.0,10.0
2,fit,360448,34b,1063761,137.391709,10.0,party,This hugged in all the right places! It was a ...,hourglass,It was a great time to celebrate the (almost) ...,sheath,64.0,4,116.0,"December 14, 2015",0.525,10.525
3,fit,909926,34c,126335,135.0,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,65.0,8,34.0,"February 12, 2014",1.0,9.0
4,fit,151944,34b,616682,145.0,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,69.0,12,27.0,"September 26, 2016",0.976562,10.976562


In [20]:
df3['fit'].value_counts()

fit      142049
small     25779
large     24706
Name: fit, dtype: int64

Building a recommendation system using SVD. Collaborative filtering

In [21]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Assuming df3 is your DataFrame with the hybrid_rating column
reader = Reader(rating_scale=(df3['hybrid_rating'].min(), df3['hybrid_rating'].max()))
data = Dataset.load_from_df(df3[['user_id', 'item_id', 'hybrid_rating']], reader)

# Split the dataset
trainset, testset = train_test_split(data, test_size=0.2)

# Use SVD as an example collaborative filtering algorithm
model = SVD()
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Evaluate the model
rmse = accuracy.rmse(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

RMSE: 1.5199
Root Mean Squared Error (RMSE): 1.5198611042864896


In [22]:
# Making a prediction
user_328254_prediction = model.predict("328254", "1505652", "8.15" )
user_328254_prediction

Prediction(uid='328254', iid='1505652', r_ui='8.15', est=9.547484494365357, details={'was_impossible': False})

Fine tuning the SVD model

In [23]:
from surprise.model_selection import cross_validate
from surprise import SVD

# Define parameter combinations to try
param_combinations = {'n_epochs': [5, 10, 15], 'lr_all': [0.002, 0.005, 0.01], 'reg_all': [0.02, 0.1, 0.4]}

# Load the dataset
data = Dataset.load_from_df(df3[['user_id', 'item_id', 'hybrid_rating']], reader)

# Iterate over parameter combinations
for n_epochs in param_combinations['n_epochs']:
    for lr_all in param_combinations['lr_all']:
        for reg_all in param_combinations['reg_all']:
            # Create SVD model with current parameters
            svd_model = SVD(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)

            # Cross-validate the model
            results = cross_validate(svd_model, data, measures=['RMSE'], cv=3, verbose=False)

            # Print results
            print(f"Parameters: {{'n_epochs': {n_epochs}, 'lr_all': {lr_all}, 'reg_all': {reg_all}}}")
            print(f"Mean RMSE across folds: {results['test_rmse'].mean()}\n")


Parameters: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.02}
Mean RMSE across folds: 1.5184605100410693

Parameters: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.1}
Mean RMSE across folds: 1.5187707626600948

Parameters: {'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
Mean RMSE across folds: 1.5193212521562784

Parameters: {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.02}
Mean RMSE across folds: 1.5091684809575951

Parameters: {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.1}
Mean RMSE across folds: 1.508441669346558

Parameters: {'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
Mean RMSE across folds: 1.509626050336286

Parameters: {'n_epochs': 5, 'lr_all': 0.01, 'reg_all': 0.02}
Mean RMSE across folds: 1.5038606785148723

Parameters: {'n_epochs': 5, 'lr_all': 0.01, 'reg_all': 0.1}
Mean RMSE across folds: 1.503708372715877

Parameters: {'n_epochs': 5, 'lr_all': 0.01, 'reg_all': 0.4}
Mean RMSE across folds: 1.5028145439205691

Parameters: {'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0

Best parameters for our model are:

Parameters: {'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.4}

Mean RMSE across folds: 1.5003414059251847

In [30]:
# Create the final SVD model with optimal parameters
final_svd_model = SVD(n_epochs=15, lr_all=0.01, reg_all=0.4)

# Load the dataset
data = Dataset.load_from_df(df3[['user_id', 'item_id', 'hybrid_rating']], reader)

# Build the full training set
trainset = data.build_full_trainset()

# Train the final model on the entire dataset
final_svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1061c816910>

In [31]:
# Making a prediction
user_328254_prediction = final_svd_model.predict("328254", "1505652", "8.15" )
user_328254_prediction

Prediction(uid='328254', iid='1505652', r_ui='8.15', est=9.546660162791381, details={'was_impossible': False})

Collaborative filtering using ALS

In [32]:
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD

# Load the dataset
reader = Reader(rating_scale=(df3['hybrid_rating'].min(), df3['hybrid_rating'].max()))
data = Dataset.load_from_df(df3[['user_id', 'item_id', 'hybrid_rating']], reader)

# Define the SVD model with ALS optimization
als_model = SVD(n_factors=50, reg_all=0.02, biased=True)

# Evaluate the SVD model using cross-validation
cv_results = cross_validate(als_model, data, measures=['RMSE'], cv=3, verbose=False)

# Print the mean RMSE across folds
print(f"Mean RMSE across folds: {cv_results['test_rmse'].mean()}")

# Build the full training set
trainset = data.build_full_trainset()

# Train the SVD model on the entire dataset
als_model.fit(trainset)

# Make predictions for a specific user (replace user_id with an actual user ID)
user_id_to_predict = 420272
item_ids_to_predict = df3['item_id'].unique()

# Get predictions for the specified user and items
predictions = [als_model.predict(user_id_to_predict, item_id) for item_id in item_ids_to_predict]

# Extract item IDs and predicted ratings
predicted_ratings = [(prediction.iid, prediction.est) for prediction in predictions]

# Sort the predictions by rating in descending order
sorted_predictions = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)

# Display the top N recommended items
top_n = 10
top_recommendations = sorted_predictions[:top_n]
print(f"Top {top_n} Recommendations for User {user_id_to_predict}:")
for item_id, rating in top_recommendations:
    print(f"Item ID: {item_id}, Predicted Rating: {rating}")


Mean RMSE across folds: 1.5043259759129575
Top 10 Recommendations for User 420272:
Item ID: 154002, Predicted Rating: 10.923462681733147
Item ID: 131533, Predicted Rating: 10.391970045728955
Item ID: 168592, Predicted Rating: 10.26438167265091
Item ID: 377662, Predicted Rating: 10.146160762059218
Item ID: 1108555, Predicted Rating: 10.08867237136762
Item ID: 740349, Predicted Rating: 10.08505083095506
Item ID: 1186923, Predicted Rating: 10.064091959910694
Item ID: 1142945, Predicted Rating: 10.01861248884452
Item ID: 141761, Predicted Rating: 10.018104993086375
Item ID: 795320, Predicted Rating: 10.009298215621765


NOTE: ALS is often used when dealing with sparse matrices (a matrix with many missing values), as it allows for efficient parallelization and handles missing values well.In this case, our data has no missing values and thus the SVD model fits perfectly.

In [33]:
df3.head(5)

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,sentiment_score,hybrid_rating,content
0,fit,420272,34d,2260466,137.0,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,68.0,14,28.0,"April 20, 2016",0.625,10.625,so many compliments ! romper hourglass vacation
1,fit,273551,34b,153475,132.0,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,66.0,12,36.0,"June 18, 2013",0.0,10.0,i felt so glamourous ! ! ! gown straight & nar...
2,fit,360448,34b,1063761,137.391709,10.0,party,This hugged in all the right places! It was a ...,hourglass,It was a great time to celebrate the (almost) ...,sheath,64.0,4,116.0,"December 14, 2015",0.525,10.525,it was a great time to celebrate the ( almost ...
3,fit,909926,34c,126335,135.0,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,65.0,8,34.0,"February 12, 2014",1.0,9.0,dress arrived on time and in perfect condition...
4,fit,151944,34b,616682,145.0,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,69.0,12,27.0,"September 26, 2016",0.976562,10.976562,was in love with this dress ! ! ! gown athleti...


Building a content-based recommendation system

In [34]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')

# Combine relevant text features into a single column for content-based filtering
df3['content'] = df3['review_summary'].fillna('') + ' ' + df3['category'].fillna('') + ' ' + df3['body type'].fillna('') + ' ' + df3['rented for'].fillna('')

# NLP preprocessing
def preprocess_text(text):
    # Tokenization using NLTK
    tokens = nltk.word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply the preprocessing function to the 'content' column
df3['content'] = df3['content'].apply(preprocess_text)

# Display the preprocessed content
print("Preprocessed Content:")
print(df3['content'].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preprocessed Content:
0      so many compliments ! romper hourglass vacation
1    i felt so glamourous ! ! ! gown straight & nar...
2    it was a great time to celebrate the ( almost ...
3    dress arrived on time and in perfect condition...
4    was in love with this dress ! ! ! gown athleti...
Name: content, dtype: object


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Reduce the sample size
sample_size = 20000
df_sample = df3.sample(n=sample_size, random_state=42)

# Combine relevant text features into a single column for content-based filtering
df_sample['content'] = df_sample['review_summary'].fillna('') + ' ' + df_sample['category'].fillna('') + ' ' + df_sample['body type'].fillna('') + ' ' + df_sample['rented for'].fillna('')

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the content column
tfidf_matrix = tfidf_vectorizer.fit_transform(df_sample['content'])

# Implement batch processing for cosine similarity
def calculate_cosine_similarity_in_batches(matrix, batch_size=1000):
    num_items = matrix.shape[0]
    cosine_sim_batches = []

    for i in range(0, num_items, batch_size):
        start_idx = i
        end_idx = min((i + batch_size), num_items)
        batch_matrix = matrix[start_idx:end_idx, :]
        cosine_sim_batch = cosine_similarity(batch_matrix, matrix, dense_output=False)
        cosine_sim_batches.append(cosine_sim_batch)

    return np.vstack(cosine_sim_batches)

# Calculate cosine similarity in batches
cosine_sim = calculate_cosine_similarity_in_batches(tfidf_matrix)

# Apply dimensionality reduction using TruncatedSVD
num_components = 100
svd = TruncatedSVD(n_components=num_components)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)
cosine_sim_reduced = cosine_similarity(tfidf_matrix_reduced, dense_output=False)

In [37]:
# Display the items in df_sample
df_sample['item_id'].head()

85703     2578545
43749     1697200
70        1113191
30691      174086
121968    2110410
Name: item_id, dtype: int64

In [38]:
# Function to get content-based recommendations for a given item after dimensionality reduction
def get_content_based_recommendations_reduced(item_id, cosine_sim=cosine_sim_reduced, df=df_sample, svd=svd):
    idx = df[df['item_id'] == item_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the item itself and take top 10
    item_indices = [score[0] for score in sim_scores]
    return df['item_id'].iloc[item_indices]

In [39]:
# Example: Get content-based recommendations for an item
item_id_to_recommend_for = 1113191
content_based_recommendations_reduced = get_content_based_recommendations_reduced(item_id_to_recommend_for)

# Display the recommendations
print(f"Content-Based Recommendations (Reduced) for Item {item_id_to_recommend_for}:")
print(content_based_recommendations_reduced)

Content-Based Recommendations (Reduced) for Item 1113191:
101884    1207456
95898     1769937
74597      126335
30780      682043
4459      1875147
155714     982932
21128      128730
152505     158794
126649    1310167
116215    1630965
Name: item_id, dtype: int64


In [46]:
item_id_to_view = 1113191
item_raw = df3[df3['item_id'] == item_id_to_view]
item_raw

Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date,sentiment_score,hybrid_rating,content
70,fit,769732,34d,1113191,137.391709,10.0,wedding,The dress fit well - both the 8 and 10. I went...,full bust,Comfortable and put together for a late summer...,dress,63.0,20,33.0,"August 31, 2014",0.012500,10.012500,comfortable and put together for a late summer...
1412,small,329034,40d,1113191,180.000000,4.0,formal affair,Do not rent it unless you have a small waist. ...,hourglass,Poor fit. Too tight at waist.,dress,65.0,39,53.0,"March 27, 2015",-0.059524,3.940476,poor fit . too tight at waist . dress hourglas...
5109,small,999518,38d+,1113191,137.391709,6.0,party,I loved this and thought it would be perfect f...,full bust,I wanted to... but I just didn't fit,dress,63.0,32,46.0,"July 13, 2015",0.400000,6.400000,i wanted to ... but i just did n't fit dress f...
5137,fit,495314,34a,1113191,125.000000,10.0,wedding,"I'm a LBD girl, through and through. But when ...",hourglass,Stylish and the right amount of special,dress,68.0,4,32.0,"December 18, 2014",0.380952,10.380952,stylish and the right amount of special dress ...
5190,fit,773974,34d,1113191,130.000000,10.0,work,Wore with a half cardigan for work. Very comfo...,athletic,Work is better with fancy dresses,dress,66.0,8,33.0,"October 23, 2016",0.500000,10.500000,work is better with fancy dresses dress athlet...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187748,small,783896,34b,1113191,137.391709,6.0,wedding,RTR had a crazy fulfillment issue the weekend ...,hourglass,"Little snug, cute dress.",dress,66.0,51,36.0,"October 26, 2015",0.156250,6.156250,"little snug , cute dress . dress hourglass wed..."
188384,fit,31444,34b,1113191,130.000000,10.0,party,"Loved the fit, and the material.",straight & narrow,It was both Easter and my birthday. The scuba ...,dress,70.0,8,36.0,"September 14, 2017",1.000000,11.000000,it was both easter and my birthday . the scuba...
189086,fit,651371,40f,1113191,200.000000,10.0,wedding,This was my first RTR for my cousins wedding. ...,full bust,Fun Dress. Fun Print. Fun Night!!,dress,65.0,45,25.0,"October 2, 2015",0.356250,10.356250,fun dress . fun print . fun night ! ! dress fu...
189833,fit,514655,38d,1113191,185.000000,8.0,wedding,I really liked this dress. Thought it was goin...,pear,Palm Springs wedding!,dress,66.0,24,58.0,"February 28, 2015",0.000000,8.000000,palm springs wedding ! dress pear wedding
