In [None]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt')


# English-language NLP resources used by the text-cleaning function below:
# a Snowball stemmer and NLTK's stop-word list held as a set.
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

# Read the review spreadsheet into a DataFrame. The path is absolute, so the
# file has to exist at exactly this location for the notebook to run.
DATA_PATH = "/content/imdb datasets.xlsx"
data = pd.read_excel(DATA_PATH)

def preprocess_text(text):
    """Normalize one review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every character outside
    ``[a-zA-Z0-9]`` with a space, tokenize with ``word_tokenize``, drop tokens
    found in the module-level ``stop_words`` set, stem each remaining token
    with the module-level ``stemmer``, and join the stems with single spaces.

    Parameters
    ----------
    text : str or scalar
        Raw review text. Missing values (``None`` / NaN) are treated as an
        empty review; any other non-string scalar is converted with ``str()``
        before cleaning.

    Returns
    -------
    str
        The stemmed tokens joined by spaces; ``""`` when nothing remains or
        the input was missing.
    """
    if not isinstance(text, str):
        # Values coming from a spreadsheet column are not guaranteed to be
        # strings (blank cells can arrive as NaN, numeric-only cells as
        # int/float); calling .lower() on those would raise AttributeError.
        if text is None or pd.isna(text):
            return ""
        text = str(text)

    # Lowercase first, then turn punctuation/symbols into spaces so they act
    # as token separators instead of being glued to neighbouring words.
    cleaned = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Filter stop words before stemming: membership is checked on the
    # unstemmed token, exactly as the stop-word list provides them.
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return " ".join(stems)

# Clean every review. NOTE: this overwrites the raw "review" column, so
# re-running this cell without reloading `data` would preprocess the already
# cleaned text a second time.
data["review"] = data["review"].apply(preprocess_text)
y = data["sentiment"]

# Settings a reader may want to tune, named once instead of inlined.
TEST_SIZE = 0.2
RANDOM_STATE = 5000
N_NEIGHBORS = 150

# Split the cleaned text *before* building the vocabulary so that nothing
# about the held-out reviews is learned during feature extraction.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    data["review"], y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Learn the bag-of-words vocabulary from the training reviews only, then
# encode the held-out reviews with that same fitted vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Bag-of-words matrix for the full dataset, kept under the name `X` for any
# other cell that reads it. It is encoded with the training-only vocabulary.
X = vectorizer.transform(data["review"])

# Fit a k-nearest-neighbours classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
knn.fit(X_train, y_train)

# Score the classifier on the held-out split.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.6908


In [None]:
# Encode one hand-written review the same way the training data was encoded:
# clean it with preprocess_text, then map it onto the fitted vocabulary.
new_review = "This movie is good"
new_review_processed = preprocess_text(new_review)
new_review_bow = vectorizer.transform([new_review_processed])

# Fit a fresh KNN model with k=300 on the training split. Nothing is loaded
# from disk here; this rebinds `knn`, replacing whatever model it held before,
# so an accuracy measured for a different k does not describe this model.
knn = KNeighborsClassifier(n_neighbors=300).fit(X_train, y_train)

# predict() returns one label per input row; there is a single row here.
predicted_sentiment = knn.predict(new_review_bow)
print("Predicted sentiment:", predicted_sentiment)


Predicted sentiment: ['positive']


In [None]:
# Encode one hand-written review the same way the training data was encoded:
# clean it with preprocess_text, then map it onto the fitted vocabulary.
new_review = "This movie is terrible"
new_review_processed = preprocess_text(new_review)
new_review_bow = vectorizer.transform([new_review_processed])

# Fit a fresh KNN model with k=67 on the training split. Nothing is loaded
# from disk here; this rebinds `knn`, replacing whatever model it held before,
# so an accuracy measured for a different k does not describe this model.
knn = KNeighborsClassifier(n_neighbors=67).fit(X_train, y_train)

# predict() returns one label per input row; there is a single row here.
predicted_sentiment = knn.predict(new_review_bow)
print("Predicted sentiment:", predicted_sentiment)


Predicted sentiment: ['negative']
