In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources used by the preprocessing step.
# 'omw-1.4' is fetched alongside 'wordnet' because some NLTK releases refuse to
# load the WordNet corpus (needed by WordNetLemmatizer) without it.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Step 1: Prepare the dataset
# A small hand-written sample of (review text, sentiment label) pairs:
# five positive and five negative. No augmentation is applied to it.
data = [
    ("The product is great, I'm very happy!", "positive"),
    ("Terrible, do not buy this product!", "negative"),
    ("Excellent quality, highly recommend.", "positive"),
    ("It broke after a few uses.", "negative"),
    ("Amazing performance and quality!", "positive"),
    ("Disappointed with the purchase.", "negative"),
    ("Works flawlessly, highly satisfied.", "positive"),
    ("Not worth the money, poor build.", "negative"),
    ("Satisfied with the durability.", "positive"),
    ("Poor customer service and support.", "negative")
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the (review, label) tuples into a two-column DataFrame
df = pd.DataFrame(
    data=data,
    columns=["review", "label"],
)

# Step 2: Enhanced Preprocessing Function
# The default stop-word set and lemmatizer are built once and reused across calls.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalize a review into a space-joined string of lemmatized tokens.

    The text is lowercased and split on whitespace; punctuation is removed
    from each token, stop words are dropped, and the remaining tokens are
    lemmatized.

    Args:
        text: The raw review. ``None`` and float NaN are treated as an empty
            document; any other non-string value is converted with ``str``.
        stop_words: Optional container of lowercase words to drop. Defaults
            to NLTK's English stop-word list (loaded once, then cached).
        lemmatizer: Optional object exposing ``lemmatize(word)``. Defaults to
            a cached ``WordNetLemmatizer`` instance.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values become an empty document instead of crashing on .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Build the default resources lazily and only once: constructing them on
    # every call is wasted work when the function is applied row by row.
    if stop_words is None:
        stop_words = _PREPROCESS_RESOURCES.get('stop_words')
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            _PREPROCESS_RESOURCES['stop_words'] = stop_words
    if lemmatizer is None:
        lemmatizer = _PREPROCESS_RESOURCES.get('lemmatizer')
        if lemmatizer is None:
            lemmatizer = WordNetLemmatizer()
            _PREPROCESS_RESOURCES['lemmatizer'] = lemmatizer

    # NOTE(review): NLTK's English list appears to include negations such as
    # "not"/"no", so "do not buy" is reduced to "buy" -- consider keeping
    # them for sentiment work.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    words = []
    for raw in text.lower().split():
        bare = raw.translate(strip_punctuation)
        if not bare:
            # Token consisted solely of punctuation
            continue
        # Check the token both with and without its inner punctuation so that
        # contractions listed as stop words (e.g. "don't") are still filtered
        # rather than slipping through as "dont".
        if bare in stop_words or raw.strip(string.punctuation) in stop_words:
            continue
        words.append(lemmatizer.lemmatize(bare))
    return ' '.join(words)

In [None]:
# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Step 3: Train-test split
# Stratify on the label so that, with only a handful of rows, both classes
# keep the same proportion in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_review'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Step 4: Text Vectorization using TF-IDF with bigrams
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
# NOTE(review): because it is fitted before cross-validation, every CV fold
# shares the same vocabulary/IDF weights; wrapping vectorizer + classifier in a
# sklearn Pipeline would avoid that leakage.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 5: Ensemble Model (Logistic Regression + Naive Bayes)
# random_state pins the data shuffling done by the 'liblinear' and 'saga'
# solvers, so repeated runs produce the same model.
log_reg = LogisticRegression(max_iter=300, random_state=42)
nb = MultinomialNB()

# Voting Classifier (soft voting: averages the members' predicted probabilities)
ensemble = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('nb', nb)
], voting='soft')


In [None]:
# Hyperparameter search over the Logistic Regression member of the ensemble.
# Keys follow the '<estimator name>__<parameter>' form, 'lr' being the name
# given to the logistic regression inside the VotingClassifier.
param_grid = dict(
    lr__C=[0.1, 1, 10],
    lr__solver=['liblinear', 'saga'],
)

# Only two folds are used: a larger cv value errors out on a dataset this small.
# A bigger dataset would allow more folds and give more robust results.
grid = GridSearchCV(
    estimator=ensemble,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
)
grid.fit(X_train_tfidf, y_train)

In [None]:
# Step 6: Retrieve the optimized model
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained. Fitting it
# again would only repeat that work.
best_model = grid.best_estimator_
best_model

In [None]:
# Step 7: Make predictions and evaluate
y_pred = best_model.predict(X_test_tfidf)

# Accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value as the default) without emitting an UndefinedMetricWarning each time.
print(classification_report(y_test, y_pred, zero_division=0))

Optimized Accuracy: 50.00%

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         1
    positive       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Show each held-out (preprocessed) review next to its true and predicted label
print("\nSample Predictions:")
template = "Review: '{}' -> Actual: {}, Predicted: {}"
for row in zip(X_test, y_test, y_pred):
    print(template.format(*row))


Sample Predictions:
Review: 'satisfied durability' -> Actual: positive, Predicted: positive
Review: 'terrible buy product' -> Actual: negative, Predicted: positive
