In [1]:
# Importing required libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt


In [2]:
import os
os.getcwd()

'C:\\Users\\Aashish'

In [3]:
#1 Load the dataset
df = pd.read_csv("renttherunway.csv")


In [4]:
#2 Displaying the first few samples
print("First few samples:")
print(df.head())


First few samples:
   Unnamed: 0  fit  user_id bust size  item_id  weight  rating     rented for  \
0           0  fit   420272       34d  2260466  137lbs    10.0       vacation   
1           1  fit   273551       34b   153475  132lbs    10.0          other   
2           2  fit   360448       NaN  1063761     NaN    10.0          party   
3           3  fit   909926       34c   126335  135lbs     8.0  formal affair   
4           4  fit   151944       34b   616682  145lbs    10.0        wedding   

                                         review_text          body type  \
0  An adorable romper! Belt and zipper were a lit...          hourglass   
1  I rented this dress for a photo shoot. The the...  straight & narrow   
2  This hugged in all the right places! It was a ...                NaN   
3  I rented this for my company's black tie award...               pear   
4  I have always been petite in my upper body and...           athletic   

                                      revie

In [5]:
# Checking the shape of the data
print("\nShape of the data:")
print(df.shape)


Shape of the data:
(192544, 16)


In [6]:
# Getting information about the data
print("\nInformation about the data:")
print(df.info())


Information about the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      192544 non-null  int64  
 1   fit             192544 non-null  object 
 2   user_id         192544 non-null  int64  
 3   bust size       174133 non-null  object 
 4   item_id         192544 non-null  int64  
 5   weight          162562 non-null  object 
 6   rating          192462 non-null  float64
 7   rented for      192534 non-null  object 
 8   review_text     192476 non-null  object 
 9   body type       177907 non-null  object 
 10  review_summary  192197 non-null  object 
 11  category        192544 non-null  object 
 12  height          191867 non-null  object 
 13  size            192544 non-null  int64  
 14  age             191584 non-null  float64
 15  review_date     192544 non-null  object 
dtypes: float64(2), int64(4), ob

In [7]:
#3 Check for duplicate records
duplicate_count = df.duplicated().sum()

# If duplicates are found, drop them
if duplicate_count > 0:
    print(f"There are {duplicate_count} duplicate records in the dataset. Dropping duplicates...")
    df.drop_duplicates(inplace=True)
else:
    print("No duplicate records found in the dataset.")
   



No duplicate records found in the dataset.


In [8]:
df

Unnamed: 0.1,Unnamed: 0,fit,user_id,bust size,item_id,weight,rating,rented for,review_text,body type,review_summary,category,height,size,age,review_date
0,0,fit,420272,34d,2260466,137lbs,10.0,vacation,An adorable romper! Belt and zipper were a lit...,hourglass,So many compliments!,romper,"5' 8""",14,28.0,"April 20, 2016"
1,1,fit,273551,34b,153475,132lbs,10.0,other,I rented this dress for a photo shoot. The the...,straight & narrow,I felt so glamourous!!!,gown,"5' 6""",12,36.0,"June 18, 2013"
2,2,fit,360448,,1063761,,10.0,party,This hugged in all the right places! It was a ...,,It was a great time to celebrate the (almost) ...,sheath,"5' 4""",4,116.0,"December 14, 2015"
3,3,fit,909926,34c,126335,135lbs,8.0,formal affair,I rented this for my company's black tie award...,pear,Dress arrived on time and in perfect condition.,dress,"5' 5""",8,34.0,"February 12, 2014"
4,4,fit,151944,34b,616682,145lbs,10.0,wedding,I have always been petite in my upper body and...,athletic,Was in love with this dress !!!,gown,"5' 9""",12,27.0,"September 26, 2016"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192539,192539,fit,66386,34dd,2252812,140lbs,10.0,work,Fit like a glove!,hourglass,LOVE IT!!! First Item Im thinking of buying!,jumpsuit,"5' 9""",8,42.0,"May 18, 2016"
192540,192540,fit,118398,32c,682043,100lbs,10.0,work,The pattern contrast on this dress is really s...,petite,LOVE it!,dress,"5' 1""",4,29.0,"September 30, 2016"
192541,192541,fit,47002,36a,683251,135lbs,6.0,everyday,"Like the other DVF wraps, the fit on this is f...",straight & narrow,"Loud patterning, flattering fit",dress,"5' 8""",8,31.0,"March 4, 2016"
192542,192542,fit,961120,36c,126335,165lbs,10.0,wedding,This dress was PERFECTION. it looked incredib...,pear,loved this dress it was comfortable and photog...,dress,"5' 6""",16,31.0,"November 25, 2015"


In [9]:
#4 Dropping redundant columns
df = df.drop(['user_id', 'item_id', 'review_text'], axis=1)

In [10]:
#5 Check if there are any string data in 'weight' column
if df['weight'].dtype == 'object':
    # Remove the 'lbs' suffix and convert to float
    df['weight'] = df['weight'].str.replace('lbs', '').astype(float)

# Check the datatype of 'weight' column after conversion
print(df['weight'].dtype)

float64


In [11]:
#6 Check unique categories for 'rented for' column
print("Unique categories for 'rented for' column:")
print(df['rented for'].unique())

# Group 'party: cocktail' category with 'party'
df['rented for'] = df['rented for'].replace('party: cocktail', 'party')

# Check unique categories again after grouping
print("\nUnique categories after grouping:")
print(df['rented for'].unique())

Unique categories for 'rented for' column:
['vacation' 'other' 'party' 'formal affair' 'wedding' 'date' 'everyday'
 'work' nan 'party: cocktail']

Unique categories after grouping:
['vacation' 'other' 'party' 'formal affair' 'wedding' 'date' 'everyday'
 'work' nan]


In [12]:
#7 Check if there are any string data in 'weight' column
if df['weight'].dtype == 'object':
    # Remove the 'lbs' suffix and convert to float
    df['weight'] = df['weight'].str.replace('lbs', '').astype(float)

# Check the datatype of 'weight' column after conversion
print(df['weight'].dtype)

float64


In [13]:
#8 Check for missing values in each column
missing_values = df.isnull().sum()
print("Missing values in each column:")
print(missing_values)

# Impute missing values with appropriate methods
# For numerical columns, you can impute with mean, median, or mode
df['numerical_column'] = df['numerical_column'].fillna(df['numerical_column'].median())

# For categorical columns, you can impute with mode
df['categorical_column'] = df['categorical_column'].fillna(df['categorical_column'].mode()[0])


Missing values in each column:
Unnamed: 0            0
fit                   0
bust size         18411
weight            29982
rating               82
rented for           10
body type         14637
review_summary      347
category              0
height              677
size                  0
age                 960
review_date           0
dtype: int64


KeyError: 'numerical_column'

In [None]:
#9 Statistical summary for numerical columns
numerical_summary = df.describe()
print("Statistical summary for numerical columns:")
print(numerical_summary)

# Statistical summary for categorical columns
categorical_summary = df.describe()
print("Statistical summary for categorical columns:")
print(categorical_summary)

In [None]:
# Check statistical summary
print(df.describe(include='all'))


In [None]:

#10)1it Calculate the interquartile range (IQR) for 'age' column
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1

# Define the lower and upper bounds to identify outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['age'] < lower_bound) | (df['age'] > upper_bound)]

# Print the outliers
print("Outliers in the 'age' column:")
print(outliers)

# Treat outliers
# Replace outliers with the median value
df['age'] = df['age'].apply(lambda x: df['age'].median() if x < lower_bound or x > upper_bound else x)


In [None]:
#11. Check the distribution of the different categories in the column 'rented for'using appropriate plot.

import seaborn as sns
import matplotlib.pyplot as plt

# Count the occurrences of each category in 'rented for' column
category_counts = df['rented for'].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 6))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.title('Distribution of Categories in "rented for" Column')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:

df

In [None]:
#12. Encode the categorical variables in the dataset.
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode categorical variables
df['category'] = label_encoder.fit_transform(df['category'])

# Display the encoded data
print(df.head())


In [9]:
#13. Standardize the data, so that the values are within a particular range
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming 'data' is your DataFrame
# Select only numerical columns
numerical_df = df.select_dtypes(include=['number'])

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the numerical data
scaled_df= scaler.fit_transform(numerical_df)

# Convert scaled data back to DataFrame
scaled_df = pd.DataFrame(scaled_df, columns=numerical_df.columns)

# Combine with non-numeric columns from the original DataFrame
for column in df.columns:
    if column not in numerical_df.columns:
        scaled_df[column] = df[column]

# Display the scaled data
print(scaled_df.head())


   Unnamed: 0   user_id   item_id    rating      size        age  fit  \
0   -1.732042 -0.274069  1.508460  0.634687  0.206575  -0.728589  fit   
1   -1.732024 -0.781651 -1.107903  0.634687 -0.028862   0.264205  fit   
2   -1.732006 -0.481030  0.022448  0.634687 -0.970608  10.192151  fit   
3   -1.731988  1.419890 -1.141604 -0.763875 -0.499735   0.016007  fit   
4   -1.731970 -1.202350 -0.532714  0.634687 -0.028862  -0.852689  fit   

  bust size  weight     rented for  \
0       34d  137lbs       vacation   
1       34b  132lbs          other   
2       NaN     NaN          party   
3       34c  135lbs  formal affair   
4       34b  145lbs        wedding   

                                         review_text          body type  \
0  An adorable romper! Belt and zipper were a lit...          hourglass   
1  I rented this dress for a photo shoot. The the...  straight & narrow   
2  This hugged in all the right places! It was a ...                NaN   
3  I rented this for my company'

In [2]:
# 14. Apply PCA on the above dataset and determine the number of PCAcomponents to be used so that 90-95% of the variance in data is explained bythe same
    import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt

# Load your data into a DataFrame
# Replace 'your_data.csv' with the path to your data file
df = pd.read_csv("renttherunway.csv")

# Separate numeric and non-numeric columns
numeric_cols = df.select_dtypes(include=np.number)
non_numeric_cols = df.select_dtypes(exclude=np.number)

# Impute missing values for numeric columns
numeric_imputer = SimpleImputer(strategy='mean')
imputed_numeric_df = numeric_imputer.fit_transform(numeric_cols)

# Impute missing values for non-numeric columns
non_numeric_imputer = SimpleImputer(strategy='most_frequent')
imputed_non_numeric_data = non_numeric_imputer.fit_transform(non_numeric_cols)

# Concatenate imputed numeric and non-numeric data
imputed_df= np.hstack((imputed_numeric_df, imputed_non_numeric_df))

# Scale the data
scaler = StandardScaler()
scaled_df = scaler.fit_transform(imputed_df)

# Apply PCA
pca = PCA()
pca.fit(scaled_df)

# Calculate cumulative explained variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components explaining 90-95% variance
n_components_90 = np.argmax(cumulative_variance >= 0.9) + 1
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

print("Number of components to explain 90% variance:", n_components_90)
print("Number of components to explain 95% variance:", n_components_95)

# Plot cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
plt.title('Cumulative Explained Variance Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.axvline(x=n_components_90, color='r', linestyle='--', label='90% Variance')
plt.axvline(x=n_components_95, color='g', linestyle='--', label='95% Variance')
plt.legend()
plt.grid(True)
plt.show()


NameError: name 'imputed_non_numeric_df' is not defined

In [3]:
#15. Apply K-means clustering and segment the data. (You may use originaldata or PCA transformed data)  
    
    import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

# Load your data into a DataFrame
# Replace 'your_data.csv' with the path to your data file
df = pd.read_csv('renttherunway.csv')

# Assuming you have selected the appropriate features for clustering
# You can also use PCA transformed data if you have applied PCA earlier

# Step a: Find the optimal K value using the elbow plot method
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)

# Plot the elbow plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Plot')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.xticks(range(1, 11))
plt.grid(True)
plt.show()

# Find the optimal K value by locating the elbow point
optimal_k = np.argmin(inertia) + 1
print("Optimal K value:", optimal_k)

# Step b: Build a K-means clustering model using the obtained optimal K value
kmeans_model = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans_model.fit_predict(df)

# Step c: Compute silhouette score for evaluating the quality of the K Means clustering technique
silhouette_avg = silhouette_score(df, clusters)
print("Silhouette Score:", silhouette_avg)


ValueError: could not convert string to float: 'fit'

In [4]:
#16. Apply Agglomerative clustering and segment the data. (You may use
#original data or PCA transformed data)

import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
import numpy as np
import matplotlib.pyplot as plt

# Load your data into a DataFrame
# Replace 'your_data.csv' with the path to your data file
df = pd.read_csv("renttherunway.csv")

# Assuming you have selected the appropriate features for clustering
# You can also use PCA transformed data if you have applied PCA earlier

# Step a: Find the optimal K value using dendrogram for Agglomerative clustering
# Take a sample of the dataset to reduce computational time
sample_df = df.sample(frac=0.5, random_state=42)  # Adjust the fraction as needed

# Compute the linkage matrix using Ward's method
linkage_matrix = linkage(sample_df, method='ward')

# Plot the dendrogram
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix)
plt.title('Dendrogram for Agglomerative Clustering')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()

# Step b: Build an Agglomerative clustering model using the obtained optimal K value observed from the dendrogram
# Visual inspection of the dendrogram can help determine the optimal number of clusters (K)
kmeans_model = KMeans(n_clusters=optimal_k, random_state=42)
clusters = kmeans_model.fit_predict(df)


# Step c: Compute silhouette score for evaluating the quality of the Agglomerative clustering technique
# Assuming you have determined the optimal K value manually from the dendrogram
optimal_k = 3  # Adjust this based on dendrogram observation

# Build Agglomerative clustering model
agg_model = AgglomerativeClustering(n_clusters=optimal_k)
clusters = agg_model.fit_predict(df)

# Compute silhouette score
silhouette_avg = silhouette_score(df, clusters)
print("Silhouette Score:", silhouette_avg)


ValueError: could not convert string to float: 'fit'

In [5]:
#17. Perform cluster analysis by doing bivariate analysis between cluster labels
#and different features and write your conclusion on the results. 
 
    import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

# Load your data into a DataFrame
# Replace 'your_data.csv' with the path to your data file
df = pd.read_csv("renttherunway.csv")

# Assuming you have performed K-means clustering and assigned cluster labels to your dataset
# Replace 'cluster_labels' with the actual column name containing cluster labels
cluster_labels = df['cluster_labels']

# Assuming you have other features in your dataset
# Replace 'feature1', 'feature2', ... with actual feature names
features = ['feature1', 'feature2', ...]

# Bivariate analysis for each feature
for feature in features:
    # Box plot or violin plot to visualize feature distribution across clusters
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=cluster_labels, y=df[feature])
    plt.title(f'{feature} by Cluster')
    plt.xlabel('Cluster Labels')
    plt.ylabel(feature)
    plt.show()
    
    # Statistical test (ANOVA) to determine if there are significant differences in feature distributions
    f_statistic, p_value = f_oneway(*(df[feature][cluster_labels == label] for label in cluster_labels.unique()))
    print(f"ANOVA p-value for {feature}: {p_value}")

# Conclusion
# Based on the visualizations and statistical tests:
# - Identify features that show significant differences in distributions across clusters (low p-value).
# - Interpret how these features contribute to cluster differentiation.
# - Draw conclusions about the characteristics of each cluster based on the features analyzed.



KeyError: 'cluster_labels'