**In this task, you will cluster the product reviews in the test dataset. You will need to create word
features from the data and use that for k-means clustering. Clustering will be done by product
types, i.e., in this case, the labels will be product categories. You will use the Silhouette score
and Rand index to analyze the quality of clustering.**

In [None]:
#All the required library importing
import pandas as pd
import numpy as np

# Load the test split of the review data.
Testing_Data = pd.read_csv('Test.csv')

# Build the frame used for clustering:
#   * keep only `category` and `style`,
#   * encode each product category as an integer label in `category_id`
#     (pd.factorize numbers categories in order of first appearance and
#     uses -1 for missing values),
#   * drop the original text label and replace missing values with ''
#     so the text vectorizer never receives NaN.
# `.assign` returns a new DataFrame, so nothing is written onto a slice of
# `Testing_Data` and pandas no longer raises SettingWithCopyWarning.
NewDesired_Dataframe = (
    Testing_Data[['category', 'style']]
    .assign(category_id=lambda frame: pd.factorize(frame['category'])[0])
    .drop(columns=['category'])
    .replace(np.nan, '')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NewDesired_Dataframe['category_id'] = pd.factorize(NewDesired_Dataframe['category'])[0]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: ignore terms that occur in more than 50% or fewer than
# 10% of the documents, and keep at most the 3 most frequent remaining terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=0.1,
    max_features=3,
)

# Learn the vocabulary from the `style` column and turn it into the feature
# matrix that is clustered later on.
train_new_features = tfidf_vectorizer.fit_transform(
    NewDesired_Dataframe['style']
)

In [None]:
from sklearn.cluster import KMeans

# Use one cluster per distinct product category (a missing category, if any,
# counts as its own value, exactly as it would with `unique()`).
X = Testing_Data['category'].nunique(dropna=False)

# Configure (but do not fit yet) the k-means estimator: random centroid
# initialisation, fixed seed for reproducibility, at most 300 iterations.
kmeans = KMeans(
    n_clusters=X,
    init='random',
    max_iter=300,
    random_state=2,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Column transformer: turn the `style` text into TF-IDF features.
# remainder='drop' is deliberate: the only other column in the frame is
# `category_id`, the ground-truth label. Passing it through would feed the
# label to k-means as a feature and inflate every clustering score.
column_trans = ColumnTransformer(
    [('style_tf', TfidfVectorizer(), 'style')],
    remainder='drop',
    verbose_feature_names_out=True,
)

# Pipeline (vectorizer -> k-means) so both steps can be tuned together.
X_pipeline = Pipeline(steps=[('column_trans', column_trans), ('model', kmeans)])

# Show every tunable hyper-parameter name exposed by the pipeline.
X_pipeline.get_params()

{'memory': None,
 'steps': [('column_trans',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('style_tf', TfidfVectorizer(), 'style')])),
  ('model', KMeans(init='random', n_clusters=6, random_state=2))],
 'verbose': False,
 'column_trans': ColumnTransformer(remainder='passthrough',
                   transformers=[('style_tf', TfidfVectorizer(), 'style')]),
 'model': KMeans(init='random', n_clusters=6, random_state=2),
 'column_trans__n_jobs': None,
 'column_trans__remainder': 'passthrough',
 'column_trans__sparse_threshold': 0.3,
 'column_trans__transformer_weights': None,
 'column_trans__transformers': [('style_tf', TfidfVectorizer(), 'style')],
 'column_trans__verbose': False,
 'column_trans__verbose_feature_names_out': True,
 'column_trans__style_tf': TfidfVectorizer(),
 'column_trans__style_tf__analyzer': 'word',
 'column_trans__style_tf__binary': False,
 'column_trans__style_tf__decode_error': 'strict',
 'column_trans__style_tf__dtype': numpy.flo

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the grid search. Keys follow the pipeline's
# `<step>__<sub-step>__<parameter>` naming scheme.
param = dict(
    # TF-IDF: upper / lower document-frequency cut-offs and vocabulary size
    column_trans__style_tf__max_df=(0.5, 0.7),
    column_trans__style_tf__max_features=(None, 3, 6),
    column_trans__style_tf__min_df=(0.0, 0.1),
    # k-means: iteration budget
    model__max_iter=[300, 450, 600],
)

In [None]:
# Exhaustive search over `param` with 5-fold cross-validation, using all
# available cores. No `scoring` is given, so each candidate is ranked by the
# pipeline's own `score` method.
GridSearch = GridSearchCV(
    estimator=X_pipeline,
    param_grid=param,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
GridSearch.fit(NewDesired_Dataframe)

# Report the winning parameter combination and its mean CV score.
print("Best Parameters: ", GridSearch.best_params_)
print("Best Score: ", GridSearch.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters:  {'column_trans__style_tf__max_df': 0.5, 'column_trans__style_tf__max_features': 3, 'column_trans__style_tf__min_df': 0.1, 'model__max_iter': 300}
Best Score:  -744.3143375271096




In [None]:
# Fit the standalone k-means estimator on the TF-IDF feature matrix
# `train_new_features` (built from the `style` column earlier in the notebook).
# `fit` returns the estimator itself, which the cell displays as its output.
kmeans.fit(train_new_features)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, adjusted_rand_score
from IPython.display import display, HTML
# HTML tags used to render the metric names in bold.
BOLD = '<b>'
RESET = '</b>'

# Cluster assignment of every review according to the fitted k-means model.
prediction_labels = kmeans.predict(train_new_features)

# External quality: agreement between clusters and true product categories
# (adjusted Rand index, corrected for chance).
RandIndex_Score = adjusted_rand_score(
    NewDesired_Dataframe['category_id'],
    prediction_labels,
)

# Internal quality: how well separated the clusters are in feature space.
Silhouette_Score = silhouette_score(
    train_new_features,
    prediction_labels,
    metric='euclidean',
)

# Render both scores between two separator rules.
display(HTML("-----------------------------------------------------------"))
display(HTML(f"{BOLD}Adjusted Rand Index:{RESET}{RandIndex_Score} "))
display(HTML(f"{BOLD}Silhouette score:{RESET}{Silhouette_Score} "))

display(HTML("-----------------------------------------------------------"))

In [None]:
# Second experiment: cluster on TF-IDF features built from BOTH `style` and
# `reviewText`, then score the clustering against the product categories.

#all required libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans  # used below; was missing from this list
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score, adjusted_rand_score
from IPython.display import display, HTML

#uploading the test data
Testing_Data = pd.read_csv('Test.csv')

# Keep the two text columns plus the category, encode the category as an
# integer label (`category_id`), drop the text label and blank out NaN so the
# vectorizers only ever see strings. `.assign` works on a fresh frame, which
# avoids the SettingWithCopyWarning raised by assigning onto a slice.
NewDesired_Dataframe = (
    Testing_Data[['category', 'style', 'reviewText']]
    .assign(category_id=lambda frame: pd.factorize(frame['category'])[0])
    .drop(columns=['category'])
    .replace(np.nan, '')
)

# One TF-IDF block per text column. remainder='drop' keeps `category_id`
# (the ground-truth label) OUT of the feature matrix; passing it through
# would leak the label into k-means and into the silhouette computation.
column_trans = ColumnTransformer(
    [
        ('style_tf', TfidfVectorizer(max_df=0.7, min_df=0.1), 'style'),
        ('review_tf', TfidfVectorizer(max_df=0.7, min_df=0.1), 'reviewText'),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)

# One cluster per distinct product category, derived from the data instead
# of a hard-coded constant.
n_categories = NewDesired_Dataframe['category_id'].nunique()
kmeans_n = KMeans(n_clusters=n_categories, random_state=0, init='random', max_iter=300)

# Pipeline: vectorize the text columns, then cluster.
X_pipeline = Pipeline(steps=[('column_trans', column_trans), ('model', kmeans_n)])

X_pipeline.fit(NewDesired_Dataframe)
prediction_labels = X_pipeline.predict(NewDesired_Dataframe)

# Reuse the transformer that the pipeline already fitted (no second fit) to
# obtain exactly the feature matrix k-means was trained on.
train_new_features = X_pipeline.named_steps['column_trans'].transform(NewDesired_Dataframe)

# Define HTML tags for bold text
BOLD = '<b>'
RESET = '</b>'

# External quality: adjusted Rand index against the true categories.
RandIndex_Score = adjusted_rand_score(NewDesired_Dataframe['category_id'], prediction_labels)
display(HTML(f"{BOLD}Adjusted Rand Index:{RESET}{RandIndex_Score} "))

# Internal quality: silhouette score in the TF-IDF feature space.
Silhouette_Score = silhouette_score(train_new_features, prediction_labels, metric='euclidean')
display(HTML(f"{BOLD}Silhouette score:{RESET}{Silhouette_Score} "))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NewDesired_Dataframe['category_id'] = pd.factorize(NewDesired_Dataframe['category'])[0]
