# Customer Segmentation and Sentiment Analysis
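This notebook loads the Women's Clothing E-Commerce Reviews dataset, cleans the review text, scores each review's sentiment with NLTK's VADER analyzer, and runs K-means clustering (k = 3) on age, rating, and positive feedback count to identify customer segments. It then compares sentiment, average rating, and common review words across the resulting clusters.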


In [2]:
# Data handling and plotting libraries used throughout the notebook.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
# Fetch the VADER lexicon needed by the sentiment-analysis cell further down.
nltk.download("vader_lexicon")
# Clustering and feature scaling for the customer-segmentation step.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# Regular expressions for the review-text cleaning function.
import re


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/randalburks/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
# Read the reviews CSV (expected next to the notebook) and preview the first rows.
reviews_csv = "Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(filepath_or_buffer=reviews_csv)
df.head()


In [None]:
def clean_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Apostrophes are removed outright so contractions stay one token
    ("don't" -> "dont"). Every other non-letter character is replaced by a
    space, so words separated only by punctuation or digits are not fused
    together ("fit...love" -> "fit love" rather than "fitlove").

    Parameters
    ----------
    text : object
        Raw review value. Anything that is not a ``str`` (for example a
        missing value) is treated as having no text.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Drop apostrophes (straight and curly) without inserting a gap.
    text = re.sub(r"['\u2019]", "", text)
    # Any remaining non-letter becomes a word boundary instead of vanishing.
    text = re.sub(r"[^a-z\s]", " ", text)
    # Collapse whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", text).strip()


In [None]:
# Derive a normalized copy of every review, then preview raw vs. cleaned text.
raw_reviews = df["Review Text"]
df["Cleaned_Review"] = raw_reviews.map(clean_text)
preview_columns = ["Review Text", "Cleaned_Review"]
df[preview_columns].head()


In [None]:
# NLTK's sentiment scorer; used below to obtain a "compound" score per review.
from nltk.sentiment import SentimentIntensityAnalyzer

# One shared analyzer instance, reused by get_vader_sentiment for every review.
sia = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Return the "compound" polarity score of ``text`` from the shared analyzer.

    Falsy input (such as an empty string) is not scored and yields ``0.0``.
    """
    if not text:
        return 0.0
    return sia.polarity_scores(text)["compound"]

# Score each cleaned review and show a sample of text alongside its score.
cleaned_reviews = df["Cleaned_Review"]
df["Sentiment"] = cleaned_reviews.map(get_vader_sentiment)
df[["Cleaned_Review", "Sentiment"]].head()




In [None]:
# Numeric customer attributes used to form the segments.
feature_columns = ["Age", "Rating", "Positive Feedback Count"]

# Drop rows with a missing feature and take an explicit copy, so that adding
# the "Cluster" column below modifies an independent frame and never df.
features = df[feature_columns].dropna().copy()

# Put all three columns on a comparable scale before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# n_init is pinned explicitly: scikit-learn's default differs between releases
# (10 in older versions, "auto" in newer ones), and leaving it implicit makes
# the labels depend on the installed version and triggers a FutureWarning on
# some of them. random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# labels is positionally aligned with the rows of features.
features["Cluster"] = labels
features.head()


In [None]:
# Scatter age against positive-feedback count, colouring points by cluster.
fig, ax = plt.subplots()
sns.scatterplot(
    data=features,
    x="Age",
    y="Positive Feedback Count",
    hue="Cluster",
    palette="tab10",
    ax=ax,
)
ax.set_title("Customer Clusters")
plt.show()


In [None]:
# Histogram of the per-review sentiment scores with a KDE curve overlaid.
sentiment_scores = df["Sentiment"]
fig, ax = plt.subplots()
sns.histplot(sentiment_scores, kde=True, ax=ax)
ax.set_title("Sentiment Score Distribution")
plt.show()


## Sentiment by Cluster

In [None]:
# Attach each review's cluster label by index, then compare the sentiment
# distribution of the clusters side by side.
cluster_labels = features["Cluster"]
merged_df = df[["Cleaned_Review", "Sentiment"]].join(cluster_labels)

fig, ax = plt.subplots()
sns.boxplot(data=merged_df, x="Cluster", y="Sentiment", ax=ax)
ax.set_title("Sentiment by Customer Cluster")
plt.show()


## Average Rating by Cluster

In [None]:
# Attach each review's cluster label to its rating (joined on the row index).
cluster_ratings = df[["Rating"]].join(features["Cluster"])

# barplot aggregates to the mean rating per cluster. Error bars are disabled
# with errorbar=None, which replaces the deprecated ci=None spelling in
# seaborn 0.12 and later.
sns.barplot(data=cluster_ratings, x="Cluster", y="Rating", errorbar=None)
plt.title("Average Rating per Cluster")
# Fix the y-axis to the 0-5 range so the bars are comparable at a glance.
plt.ylim(0, 5)
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Draw one word cloud per cluster from the cleaned reviews of its members.
for c in sorted(features["Cluster"].unique()):
    # features is built with dropna(), so it can have fewer rows than df. A
    # boolean mask built on features would then not align with df and pandas
    # would refuse it; selecting by index labels works in either case.
    cluster_index = features.index[features["Cluster"] == c]
    cluster_reviews = df.loc[cluster_index, "Cleaned_Review"]
    text = " ".join(cluster_reviews.dropna())

    # WordCloud.generate raises when there are no words to draw, so skip a
    # cluster whose members have no review text instead of aborting the loop.
    if not text.strip():
        print(f"Cluster {c}: no review text available, skipping word cloud.")
        continue

    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for Cluster {c}")
    plt.show()
