In [8]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import RandomOverSampler

# Ensure necessary NLTK packages are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
data = pd.read_csv('/content/sorna.csv') # Update the path to your dataset

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Preprocess the comments
data['Processed_Comment'] = data['Comment'].apply(preprocess_text)

# Display class distribution before oversampling
print("Class distribution before oversampling:")
print(data['Sentiment'].value_counts())

# Split features and labels
X = data['Processed_Comment']
y = data['Sentiment']

# Oversample to handle class imbalance
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X.values.reshape(-1, 1), y)

# Convert back to series for processing
X_resampled = pd.Series([x[0] for x in X_resampled])

# Display class distribution after oversampling
print("\nClass distribution after oversampling:")
print(y_resampled.value_counts())

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Build the TF-IDF + Naive Bayes pipeline
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('classifier', MultinomialNB())
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = model_pipeline.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Predict the sentiment of a sample review
sample_comment = input("Enter a book review comment: ")
processed_sample = preprocess_text(sample_comment)
predicted_sentiment = model_pipeline.predict([processed_sample])
print(f"\nThe predicted sentiment for the comment is: {predicted_sentiment[0]}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Class distribution before oversampling:
Sentiment
Positive    15
Neutral     13
Negative    12
Name: count, dtype: int64

Class distribution after oversampling:
Sentiment
Positive    15
Negative    15
Neutral     15
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

    Negative       0.67      0.50      0.57         4
     Neutral       0.67      0.67      0.67         3
    Positive       0.33      0.50      0.40         2

    accuracy                           0.56         9
   macro avg       0.56      0.56      0.55         9
weighted avg       0.59      0.56      0.57         9

Enter a book review comment: boring

The predicted sentiment for the comment is: Negative
