In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from scraper import scrape_reviews
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [2]:
data = scrape_reviews('hellofresh.de', 40)
data.head()

KeyboardInterrupt: 

In [None]:
X, y = data['reviews'], data['ratings']
y

0       4
1       4
2       5
3       5
4       3
       ..
1995    3
1996    1
1997    4
1998    1
1999    1
Name: ratings, Length: 2000, dtype: object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Parameter für GridSearchCV festlegen
param_grid = {
    'vectorizer__max_features': [500, 1000, 1500],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'classifier__alpha': [0.1, 1, 10]
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'classifier__alpha': 1, 'vectorizer__max_features': 1500, 'vectorizer__ngram_range': (1, 2)}
Accuracy: 0.5675
Classification Report:
               precision    recall  f1-score   support

           1       0.69      0.72      0.70       103
           2       0.36      0.30      0.33        46
           3       0.51      0.48      0.49        95
           4       0.46      0.47      0.46        79
           5       0.68      0.73      0.70        77

    accuracy                           0.57       400
   macro avg       0.54      0.54      0.54       400
weighted avg       0.56      0.57      0.56       400

