
    ## Section 1: Import Libraries and Load Data
    This section imports the libraries needed for data manipulation, visualization, and machine learning, and then loads the dataset so that it is ready for analysis.
    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy import stats
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Read the raw rental-review records from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='renttherunway.csv')


    ## Section 2: Data Cleansing and Exploratory Data Analysis
    Here, we clean the data by removing duplicates and unnecessary columns, converting weight and height to numeric values, imputing missing values, and filtering out outliers, and we conduct exploratory data analysis to understand the dataset better. This involves checking the structure, removing redundant information, and preparing the data for further analysis.
    

In [None]:
# Keep the first occurrence of every fully identical row and discard the repeats.
data = data.loc[~data.duplicated()]

In [None]:
# Remove the user identifier and the free-text review column from the frame.
data = data.drop(columns=['user_id', 'review_text'])

In [None]:
# Strip the 'lbs' unit suffix so that weight becomes a float column (missing values stay NaN).
data['weight'] = data['weight'].str.replace('lbs', '', regex=False).astype(float)

# Disabled normalisation of the 'party: cocktail' label, kept for reference:
# data['rented_for'] = data['rented_for'].replace({'party: cocktail': 'party'})

# Convert heights written as feet and inches (e.g. 5' 8") to total inches.
# Values that do not match the pattern, including missing ones, become NaN and are imputed below.
height_parts = data['height'].astype(str).str.extract(r"(\d+)'\s*(\d+)").astype(float)
data['height'] = height_parts[0] * 12 + height_parts[1]

# Impute numeric columns with their column mean. numeric_only=True is required because the
# frame still holds text columns, which DataFrame.mean() refuses to reduce on pandas >= 2.0.
data = data.fillna(data.mean(numeric_only=True))
# Any remaining gaps (the non-numeric columns) take the most frequent value of their column.
data = data.fillna(data.mode().iloc[0])

# Keep only the rows whose numeric features all lie within 3 standard deviations of the mean.
numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
z_scores = np.abs(stats.zscore(data[numeric_cols]))
# A constant column has zero variance and therefore a NaN z-score; NaN < 3 is False, which
# would discard every row, so an undefined z-score is not treated as an outlier.
within_range = (z_scores < 3) | np.isnan(z_scores)
# .copy() gives later cells an independent frame, so adding columns to it does not trigger
# pandas' SettingWithCopyWarning.
data = data[within_range.all(axis=1)].copy()


    ## Section 3: Feature Engineering and Data Preparation
    We add new features and prepare the dataset for modeling by encoding categorical variables and scaling numerical values. This is crucial for effective model performance, especially in clustering and PCA.
    

In [None]:
# Derived features: weight relative to height, and the calendar month of the review.
data['weight_height_ratio'] = data['weight'] / data['height']
data['review_month'] = pd.to_datetime(data['review_date']).dt.month

# Integer-encode every object-dtype column. Each column gets its own fitted encoder, stored in
# label_encoders, so the codes can be mapped back to the original labels with
# label_encoders[col].inverse_transform(...) when interpreting the clusters.
# NOTE(review): if review_date was read as text it is encoded here as well, and its codes then
# follow string sort order rather than chronology - confirm that this is intended.
categorical_features = data.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_features}
for col, encoder in label_encoders.items():
    # astype(str) keeps the encoder from failing on a column that mixes strings with other types.
    data[col] = encoder.fit_transform(data[col].astype(str))

# `le` is kept under its original name: it is the encoder fitted on the last encoded column,
# or an unfitted encoder when there was nothing to encode.
le = label_encoders[categorical_features[-1]] if len(categorical_features) else LabelEncoder()

# Rescale every column to the [0, 1] range; the result is a NumPy array, not a DataFrame.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)



    ## Section 4: Principal Component Analysis (PCA) and Clustering
    We apply PCA to reduce the dimensionality of the data and then cluster it using K-means and agglomerative clustering. The optimal number of clusters is determined with the elbow method, and the clustering quality is validated with the silhouette score.
    

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the prepared frame.
data.describe(percentiles=[0.25, 0.5, 0.75])


In [None]:
# data_pca was never computed anywhere before this cell, so on a fresh kernel the shape lookup
# raised NameError. Project the min-max-scaled features onto their principal components first.
# n_components=0.95 keeps the smallest number of components that explain at least 95% of the
# variance; a fractional n_components requires the exact 'full' SVD solver. Tune as needed.
pca = PCA(n_components=0.95, svd_solver='full')
data_pca = pca.fit_transform(data_scaled)

# (rows, retained components)
data_pca.shape


    ## Conclusion
    This section summarizes the findings from the clustering analysis, discussing how the clusters could be interpreted and suggesting potential strategies for customer segmentation based on the clustered data.
    

In [None]:

    # TODO: Summarize the clustering results and the marketing strategies they suggest for each customer segment.
    # (Add the detailed analysis and conclusions here, based on the outputs of the previous sections.)
    