In [11]:
!pip install pandas scikit-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Larger balanced sample dataset
data = {
    'text': [
        "I love this product, it's amazing!",
        "Horrible experience, will never buy again.",
        "Pretty decent, met my expectations.",
        "Worst purchase ever, very disappointed.",
        "Fantastic quality and fast shipping.",
        "Not good, it broke after one use.",
        "Great value for the price.",
        "Terrible customer service.",
        "I'm very happy with my order.",
        "It doesn't work as advertised.",
        "Absolutely wonderful, exceeded expectations!",
        "This was a waste of money.",
        "Superb item, will buy again.",
        "I hate it, totally useless.",
        "Best thing I've bought all year.",
        "It stopped working after a day.",
        "Highly recommend this to everyone.",
        "The product is defective and slow.",
        "Amazing performance and quality.",
        "Don't bother buying this.",
        "Delighted with the purchase, very happy.",
        "Disappointing, it failed quickly.",
        "Works perfectly, I’m impressed.",
        "Not worth the price at all.",
        "Exceeded my expectations by far.",
        "I regret buying this item.",
        "Love the features and easy to use.",
        "Poor design and functionality.",
        "Top-notch product, highly recommend!",
        "Terrible, it arrived broken.",
        "Very satisfied with the quality.",
        "Disliked it, would not recommend.",
        "Fantastic service and product.",
        "Broke after first use, very bad.",
        "This is my favorite purchase this year.",
        "Do not waste your money on this.",
        "Excellent build and great support.",
        "Stopped working after a week.",
        "I am very pleased with this product.",
        "Worst product I have ever bought."
    ],
    'label': [
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative',
        'positive', 'negative', 'positive', 'negative', 'positive',
        'negative', 'positive', 'negative', 'positive', 'negative'
    ]
}

df = pd.DataFrame(data)

# Encode labels: positive=1, negative=0
df['label_num'] = df['label'].map({'positive': 1, 'negative': 0})

# Vectorize text with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label_num']

# Split dataset into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Evaluate on test data
y_pred = grid_search.predict(X_test)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Predict new samples
new_texts = [
    "I really enjoyed this, highly recommend!",
    "Completely useless, very disappointed."
]

new_vectors = vectorizer.transform(new_texts)
predictions = grid_search.predict(new_vectors)

for text, pred in zip(new_texts, predictions):
    sentiment = "Positive" if pred == 1 else "Negative"
    print(f"Text: '{text}' => Sentiment: {sentiment}")


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}

Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         4
           1       1.00      0.50      0.67         4

    accuracy                           0.75         8
   macro avg       0.83      0.75      0.73         8
weighted avg       0.83      0.75      0.73         8

Text: 'I really enjoyed this, highly recommend!' => Sentiment: Positive
Text: 'Completely useless, very disappointed.' => Sentiment: Negative
