In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Download the necessary NLTK data
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')

# Get the reviews
reviews = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]

# Preprocess the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
reviews = [' '.join(lemmatizer.lemmatize(word) for word in re.sub('[^a-zA-Z]', ' ', review).lower().split() if word not in stop_words) for review in reviews]

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Transform the reviews into TF-IDF features
X_tfidf = vectorizer.fit_transform(reviews)

# Cluster the reviews using K-means
kmeans = KMeans(n_clusters=3).fit(X_tfidf)

# Define the labels for the clusters
cluster_labels = {0: "positive", 1: "negative", 2: "neutral"}

# Test the classifier with custom sentences
custom_sentences = ["I loved the movie and good and amazing movie I have seen this year.", 
                    "The movie was terrible. The plot was non-existent and the acting was subpar.", 
                    "I have mixed feelings about the movie.it is partly good and partly not good."]

for sentence in custom_sentences:
    # Preprocess the sentence
    sentence = ' '.join(lemmatizer.lemmatize(word) for word in re.sub('[^a-zA-Z]', ' ', sentence).lower().split() if word not in stop_words)
    
    # Transform the sentence into TF-IDF features
    features = vectorizer.transform([sentence])
    
    # Predict the cluster of the sentence
    cluster = kmeans.predict(features)
    
    # Get the label for the cluster
    label = cluster_labels[cluster[0]]
    
    print(f"Sentence: {sentence}\nLabel: {label}\n")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/sudachk/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudachk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sudachk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sentence: loved movie good amazing movie seen year
Label: positive

Sentence: movie terrible plot non existent acting subpar
Label: neutral

Sentence: mixed feeling movie partly good partly good
Label: neutral

