# In [None]:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.sentiment import SentimentIntensityAnalyzer

# Load NLTK's sentiment analyzer. `sid` is a module-level instance that
# get_sentiment() below calls for every row.
# NOTE(review): this presumably requires the NLTK `vader_lexicon` resource to
# be installed already (e.g. via nltk.download('vader_lexicon')) — confirm on
# a fresh environment.
sid = SentimentIntensityAnalyzer()

# Read the product listing from a path relative to the current working
# directory. The rest of the script reads its 'product' column.
data = pd.read_csv('Product listing.csv')

# Data preprocessing
# Minimal English stopword list, built once instead of on every call.
# Extend it (or swap in NLTK/spaCy stopwords) for other languages.
_STOPWORDS = frozenset({
    'the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as',
})


def preprocess_text(text):
    """Lowercase *text*, split it on whitespace and drop common stopwords.

    Args:
        text: The raw text to clean. Missing values (``None`` or a float
            NaN, which is what pandas produces for empty CSV cells) are
            treated as empty text; any other non-string value is converted
            with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left or the input was missing.
    """
    # `text != text` is only true for NaN, so this catches pandas' missing
    # value marker without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Tokenization is a plain whitespace split; punctuation stays attached
    # to its neighbouring word.
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in _STOPWORDS)

# Store a normalised copy of every product string next to the original.
data['clean_text'] = data['product'].apply(preprocess_text)

# Turn the cleaned text into a TF-IDF matrix; max_features caps the
# vocabulary size and can be tuned.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])

# Fit K-means on the TF-IDF matrix. `k` is the number of clusters and can be
# tuned; the fixed random_state keeps runs reproducible.
k = 5
kmeans = KMeans(n_clusters=k, random_state=42).fit(tfidf_matrix)

# Record which cluster each row was assigned to.
data['cluster_label'] = kmeans.labels_

# Sentiment Analysis
def get_sentiment(text):
    """Classify *text* as 'Positive', 'Negative' or 'Neutral'.

    The module-level analyzer ``sid`` scores the text, and the label is
    chosen from the 'compound' score: at or above 0.05 is positive, at or
    below -0.05 is negative, anything in between is neutral.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'
    
# Label every row with a sentiment class via get_sentiment().
# NOTE(review): the scorer is fed the lowercased, stopword-stripped
# 'clean_text' rather than the raw 'product' text. If the analyzer uses
# capitalisation or punctuation cues, scoring the raw text may be more
# accurate — confirm this is intended.
data['sentiment'] = data['clean_text'].apply(get_sentiment)


# Evaluate clustering using silhouette score, computed on the same TF-IDF
# matrix and labels that the K-means fit produced.
# NOTE(review): silhouette_score presumably needs at least two distinct
# labels and fewer clusters than rows — verify for very small datasets.
silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

# Display a few randomly chosen rows from each cluster, with their sentiment.
samples_per_cluster = 5  # Upper bound on rows shown per cluster
for cluster_id in range(k):
    cluster_rows = data[data['cluster_label'] == cluster_id]
    # DataFrame.sample raises ValueError when asked for more rows than are
    # available (sampling without replacement), so cap the request at the
    # cluster's size. Small clusters are then shown in full.
    cluster_samples = cluster_rows.sample(min(samples_per_cluster, len(cluster_rows)))
    print(f"\nCluster {cluster_id}:")
    for _, row in cluster_samples.iterrows():
        print(row['product'])
        print("Sentiment:", row['sentiment'])
        print('-' * 50)

# You can further analyze the clusters and refine the process as needed
