In [25]:
# Importing the libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans



In [None]:
# Downloading the NLTK resources
# Each id names an NLTK data package; 'stopwords' and 'wordnet' back the
# stop-word filter and the WordNet lemmatizer imported at the top of the file.
for resource_id in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_id)



In [27]:
# Loading the dataset
# The path is relative to the notebook's working directory.
data = pd.read_csv('comments.csv')

# Exploring the dataset
# A bare `data.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is silently discarded, so print it.
print(data.head())

# Cleaning the comments
# Build the stop-word set and the lemmatizer once: both are loop-invariant,
# and rebuilding the set for every single word dominates the running time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

corpus = []
# Iterate over the column values (not index labels) so the loop does not
# depend on the DataFrame having a default 0..n-1 index.
for raw_comment in data['comment']:
    # A missing comment would otherwise be stringified to the token "nan"
    # and end up in the TF-IDF vocabulary; treat it as empty text instead.
    text = '' if pd.isna(raw_comment) else str(raw_comment)
    # Keep ASCII letters only, normalise case, and split on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words, then reduce each remaining word to its lemma.
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

# Adding the cleaned comments to the DataFrame
# `corpus` has one entry per row, in row order, so it aligns positionally.
data['Cleaned Comment'] = corpus

# Creating the feature variable
# Each cleaned comment becomes one row of TF-IDF term weights.
tfidf = TfidfVectorizer()
# NOTE(review): .toarray() materialises the full dense documents x vocabulary
# matrix. KMeans also accepts the sparse matrix directly, which would save
# memory on a large corpus; kept dense so X stays an ndarray for any later
# code that uses it - confirm before changing.
X = tfidf.fit_transform(corpus).toarray()

# Applying the KMeans algorithm
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the clustering depend on
# the installed version (and raises a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Adding the cluster labels to the DataFrame
data['Cluster'] = y_kmeans

# Printing the types of comments identified by KMeans
# Take the number of clusters from the fitted model instead of repeating the
# literal, so this loop stays correct if n_clusters is changed above.
for cluster_id in range(kmeans.n_clusters):
    print('Type of Comment', cluster_id, ':')
    # Show the first few original (uncleaned) comments assigned to this cluster.
    print(data.loc[data['Cluster'] == cluster_id, 'comment'].head(), '\n')


Type of Comment 0 :
9     ... done well by me appear to have a good shel...
13    ... got em so I can't really comment on how go...
28    ... THEM BUT HOPEFULLY ALL OF THEM WILL REMAIN...
43      1st purchase was good, this time they are junk!
65                                      A 'GREAT' Value
Name: comment, dtype: object 

Type of Comment 1 :
51                      5 star
52    5 Star review for the AA
53                   5 Star!!!
54                     5 stars
55                     5 Stars
Name: comment, dtype: object 

Type of Comment 2 :
0    ... 3 of them and one of the item is bad quali...
1    ... always the less expensive way to go for pr...
2    ... are not Duracell but for the price i am ha...
3    ... as well as name brand batteries at a much ...
4    ... batteries are very long lasting the price ...
Name: comment, dtype: object 



