Labelling text data using Logistic regression

In [38]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import movie_reviews
from nltk.sentiment import SentimentAnalyzer
from nltk.classify import NaiveBayesClassifier

# Download the movie reviews dataset
# Download the movie reviews dataset and necessary NLTK data
nltk.download('movie_reviews')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  # Add this line

# Create a sentiment analyzer
sentiment_analyzer = SentimentAnalyzer()

# Get the IDs of the movie reviews
ids = movie_reviews.fileids()

# Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
from nltk.tokenize import word_tokenize

def preprocess(document):
    words = word_tokenize(document)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Feature selection
vectorizer = TfidfVectorizer(preprocessor=preprocess, ngram_range=(1, 2))

# Create the feature matrix
X = vectorizer.fit_transform([movie_reviews.raw(fileid) for fileid in ids])

# Create the target vector
y = [movie_reviews.categories([f])[0] for f in ids]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and training
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluation
accuracy = model.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2%}")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/sudachk/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package wordnet to /Users/sudachk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/sudachk/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sudachk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 81.75%


In [42]:
# Test the classifier with custom sentences
custom_sentences = [
    "I loved the movie and it was amazing. Best movie I have seen this year.",  # Positive sentiment
    "The movie was terrible. The plot was non-existent and the acting was subpar.",  # Negative sentiment
    "I have mixed feelings about the movie. Some parts were good, but some were not.",  # Mixed sentiment
]

for sentence in custom_sentences:
    # Preprocess the sentence
    preprocessed_sentence = preprocess(sentence)
    
    # Transform the sentence into features
    features = vectorizer.transform([preprocessed_sentence])
    
    # Predict the sentiment
    sentiment = model.predict(features)
    
    print(f"Sentence: {sentence}\nSentiment: {sentiment[0]}\n")

Sentence: I loved the movie and it was amazing. Best movie I have seen this year.
Sentiment: pos

Sentence: The movie was terrible. The plot was non-existent and the acting was subpar.
Sentiment: neg

Sentence: I have mixed feelings about the movie. Some parts were good, but some were not.
Sentiment: neg

